From c56b8c4c5dc8400c154d2afe29f5b27ad20982b7 Mon Sep 17 00:00:00 2001
From: Caelan_Windows
Date: Sat, 3 Jan 2026 01:47:24 +0800
Subject: [PATCH] fix(douyin): fetch comments concurrently after each page
 instead of waiting for all pages

- Moved batch_get_note_comments call inside the pagination loop
- Comments are now fetched immediately after each page of videos is processed
- This allows real-time observation of comment crawling progress
- Improves data availability by not waiting for all video data to be
  collected first
---
 media_platform/douyin/core.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 3a0ec5d..9308b0f 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -151,19 +151,24 @@ class DouYinCrawler(AbstractCrawler):
                 utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。")
                 break
             dy_search_id = posts_res.get("extra", {}).get("logid", "")
+            page_aweme_list = []
             for post_item in posts_res.get("data"):
                 try:
                     aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
                 except TypeError:
                     continue
                 aweme_list.append(aweme_info.get("aweme_id", ""))
+                page_aweme_list.append(aweme_info.get("aweme_id", ""))
                 await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
                 await self.get_aweme_media(aweme_item=aweme_info)
+
+            # Batch get note comments for the current page
+            await self.batch_get_note_comments(page_aweme_list)
+
             # Sleep after each page navigation
             await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
             utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
         utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
-        await self.batch_get_note_comments(aweme_list)

     async def get_specified_awemes(self):
         """Get the information and comments of the specified post from URLs or IDs"""