mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-26 20:47:28 +08:00
fix(douyin): fetch comments concurrently after each page instead of waiting for all pages
- Moved batch_get_note_comments call inside the pagination loop
- Comments are now fetched immediately after each page of videos is processed
- This allows real-time observation of comment crawling progress
- Improves data availability by not waiting for all video data to be collected first
This commit is contained in:
@@ -151,19 +151,24 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。")
|
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。")
|
||||||
break
|
break
|
||||||
dy_search_id = posts_res.get("extra", {}).get("logid", "")
|
dy_search_id = posts_res.get("extra", {}).get("logid", "")
|
||||||
|
page_aweme_list = []
|
||||||
for post_item in posts_res.get("data"):
|
for post_item in posts_res.get("data"):
|
||||||
try:
|
try:
|
||||||
aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
|
aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
|
||||||
except TypeError:
|
except TypeError:
|
||||||
continue
|
continue
|
||||||
aweme_list.append(aweme_info.get("aweme_id", ""))
|
aweme_list.append(aweme_info.get("aweme_id", ""))
|
||||||
|
page_aweme_list.append(aweme_info.get("aweme_id", ""))
|
||||||
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
|
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
|
||||||
await self.get_aweme_media(aweme_item=aweme_info)
|
await self.get_aweme_media(aweme_item=aweme_info)
|
||||||
|
|
||||||
|
# Batch get note comments for the current page
|
||||||
|
await self.batch_get_note_comments(page_aweme_list)
|
||||||
|
|
||||||
# Sleep after each page navigation
|
# Sleep after each page navigation
|
||||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||||
utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
|
utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
|
||||||
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
|
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
|
||||||
await self.batch_get_note_comments(aweme_list)
|
|
||||||
|
|
||||||
async def get_specified_awemes(self):
|
async def get_specified_awemes(self):
|
||||||
"""Get the information and comments of the specified post from URLs or IDs"""
|
"""Get the information and comments of the specified post from URLs or IDs"""
|
||||||
|
|||||||
Reference in New Issue
Block a user