mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-03-03 12:40:45 +08:00
Merge branch 'main' into bugfix-null-key
This commit is contained in:
@@ -21,6 +21,7 @@ from .login import DouYinLogin
|
||||
class DouYinCrawler(AbstractCrawler):
|
||||
platform: str
|
||||
login_type: str
|
||||
crawler_type: str
|
||||
context_page: Page
|
||||
dy_client: DOUYINClient
|
||||
account_pool: AccountPool
|
||||
@@ -30,10 +31,11 @@ class DouYinCrawler(AbstractCrawler):
|
||||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
|
||||
self.index_url = "https://www.douyin.com"
|
||||
|
||||
def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
|
||||
def init_config(
    self,
    platform: str,
    login_type: str,
    account_pool: AccountPool,
    crawler_type: str,
) -> None:
    """Record the run configuration on this crawler instance.

    Args:
        platform: Stored as ``self.platform``.
        login_type: Stored as ``self.login_type``.
        account_pool: Stored as ``self.account_pool``.
        crawler_type: Stored as ``self.crawler_type``; ``start`` compares it
            against ``"search"`` and ``"detail"`` to pick the crawl mode.
    """
    self.platform, self.login_type = platform, login_type
    self.crawler_type = crawler_type
    self.account_pool = account_pool
|
||||
|
||||
async def start(self) -> None:
|
||||
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
|
||||
@@ -63,8 +65,12 @@ class DouYinCrawler(AbstractCrawler):
|
||||
await login_obj.begin()
|
||||
await self.dy_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
# search_posts
|
||||
await self.search()
|
||||
if self.crawler_type == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
elif self.crawler_type == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_notes()
|
||||
|
||||
utils.logger.info("Douyin Crawler finished ...")
|
||||
|
||||
@@ -74,7 +80,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
request_keyword_var.set(keyword)
|
||||
utils.logger.info(f"Current keyword: {keyword}")
|
||||
aweme_list: List[str] = []
|
||||
dy_limit_count = 10 # douyin fixed limit page 10
|
||||
dy_limit_count = 10
|
||||
page = 0
|
||||
while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
try:
|
||||
@@ -95,6 +101,11 @@ class DouYinCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
|
||||
await self.batch_get_note_comments(aweme_list)
|
||||
|
||||
async def get_specified_notes(self):
    """Get the information and comments of the specified post.

    Detail crawling is not implemented for Douyin yet. Rather than doing
    nothing silently (which leaves the caller to log that the crawler
    finished with no hint that nothing was fetched), emit a warning so the
    operator can see that this mode is unsupported.
    """
    # TODO: douyin support
    # NOTE(review): assumes utils.logger exposes the standard logging
    # ``warning`` method, like the ``info`` calls used elsewhere in this file.
    utils.logger.warning(
        "Douyin crawler does not support crawler_type 'detail' yet; no posts were crawled."
    )
|
||||
|
||||
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
|
||||
task_list: List[Task] = []
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
|
||||
@@ -12,7 +12,7 @@ from base.base_crawler import AbstractCrawler
|
||||
from base.proxy_account_pool import AccountPool
|
||||
from models import xiaohongshu as xhs_model
|
||||
from tools import utils
|
||||
from var import request_keyword_var
|
||||
from var import crawler_type_var
|
||||
|
||||
from .client import XHSClient
|
||||
from .exception import DataFetchError
|
||||
@@ -22,6 +22,7 @@ from .login import XHSLogin
|
||||
class XiaoHongShuCrawler(AbstractCrawler):
|
||||
platform: str
|
||||
login_type: str
|
||||
crawler_type: str
|
||||
context_page: Page
|
||||
xhs_client: XHSClient
|
||||
account_pool: AccountPool
|
||||
@@ -31,10 +32,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
self.index_url = "https://www.xiaohongshu.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
|
||||
def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
|
||||
def init_config(
    self,
    platform: str,
    login_type: str,
    account_pool: AccountPool,
    crawler_type: str,
) -> None:
    """Record the run configuration on this crawler instance.

    Args:
        platform: Stored as ``self.platform``.
        login_type: Stored as ``self.login_type``.
        account_pool: Stored as ``self.account_pool``.
        crawler_type: Stored as ``self.crawler_type``; ``start`` compares it
            against ``"search"`` and ``"detail"`` to pick the crawl mode.
    """
    self.platform, self.login_type = platform, login_type
    self.crawler_type = crawler_type
    self.account_pool = account_pool
|
||||
|
||||
async def start(self) -> None:
|
||||
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
|
||||
@@ -72,8 +74,16 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
await login_obj.begin()
|
||||
await self.xhs_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
if self.crawler_type == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
crawler_type_var.set("search")
|
||||
await self.search()
|
||||
elif self.crawler_type == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
crawler_type_var.set("detail")
|
||||
await self.get_specified_notes()
|
||||
else:
|
||||
pass
|
||||
|
||||
utils.logger.info("Xhs Crawler finished ...")
|
||||
|
||||
@@ -82,8 +92,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
utils.logger.info("Begin search xiaohongshu keywords")
|
||||
xhs_limit_count = 20 # xhs limit page fixed value
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
# set keyword to context var
|
||||
request_keyword_var.set(keyword)
|
||||
utils.logger.info(f"Current search keyword: {keyword}")
|
||||
page = 1
|
||||
while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
@@ -107,6 +115,19 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"Note details: {note_details}")
|
||||
await self.batch_get_note_comments(note_id_list)
|
||||
|
||||
async def get_specified_notes(self):
    """Fetch the notes listed in ``config.SPECIFIED_ID_LIST`` and their comments.

    Note details are requested concurrently, with the number of in-flight
    requests bounded by a semaphore of size ``config.MAX_CONCURRENCY_NUM``.
    Each detail that is not ``None`` is handed to
    ``xhs_model.update_xhs_note``; afterwards comments are requested for the
    whole configured id list.
    """
    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    pending = [
        self.get_note_detail(note_id=specified_id, semaphore=limiter)
        for specified_id in config.SPECIFIED_ID_LIST
    ]
    fetched = await asyncio.gather(*pending)

    for detail in fetched:
        if detail is None:
            # Nothing was returned for this id; skip the update.
            continue
        await xhs_model.update_xhs_note(detail)

    # Comments are requested for every configured id, including ids whose
    # detail lookup produced None above.
    await self.batch_get_note_comments(config.SPECIFIED_ID_LIST)
|
||||
|
||||
|
||||
async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
|
||||
"""Get note detail"""
|
||||
async with semaphore:
|
||||
@@ -115,6 +136,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"Get note detail error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"have not fund note detail note_id:{note_id}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def batch_get_note_comments(self, note_list: List[str]):
|
||||
"""Batch get note comments"""
|
||||
|
||||
Reference in New Issue
Block a user