fix(douyin): fetch comments incrementally after each page instead of waiting for all pages

- Moved batch_get_note_comments call inside the pagination loop
- Comments are now fetched immediately after each page of videos is processed
- This allows real-time observation of comment crawling progress
- Improves data availability by persisting comments per page rather than only after all video data has been collected
This commit is contained in:
Caelan_Windows
2026-01-03 01:47:24 +08:00
parent a47c119303
commit c56b8c4c5d

View File

@@ -151,19 +151,24 @@ class DouYinCrawler(AbstractCrawler):
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed账号也许被风控了。")
break
dy_search_id = posts_res.get("extra", {}).get("logid", "")
page_aweme_list = []
for post_item in posts_res.get("data"):
try:
aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
except TypeError:
continue
aweme_list.append(aweme_info.get("aweme_id", ""))
page_aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
await self.get_aweme_media(aweme_item=aweme_info)
# Batch get note comments for the current page
await self.batch_get_note_comments(page_aweme_list)
# Sleep after each page navigation
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
async def get_specified_awemes(self):
"""Get the information and comments of the specified post from URLs or IDs"""