fix(douyin): fetch comments immediately after each page instead of waiting for all pages

- Moved the batch_get_note_comments call inside the pagination loop
- Comments are now fetched immediately after each page of videos is processed
- This allows real-time observation of comment-crawling progress
- Improves data availability by not waiting for all video data to be collected first
2026-05-26 20:47:28 +08:00 · 2026-01-03 01:47:24 +08:00
parent a47c119303
commit c56b8c4c5d
1 changed file with 6 additions and 1 deletion
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -151,19 +151,24 @@ class DouYinCrawler(AbstractCrawler):
                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed，账号也许被风控了。")
                    break
                dy_search_id = posts_res.get("extra", {}).get("logid", "")
                page_aweme_list = []
                for post_item in posts_res.get("data"):
                    try:
                        aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
                    except TypeError:
                        continue
                    aweme_list.append(aweme_info.get("aweme_id", ""))
                    page_aweme_list.append(aweme_info.get("aweme_id", ""))
                    await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
                    await self.get_aweme_media(aweme_item=aweme_info)
                # Batch get note comments for the current page
                await self.batch_get_note_comments(page_aweme_list)
                # Sleep after each page navigation
                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
                utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
            utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
            await self.batch_get_note_comments(aweme_list)
    async def get_specified_awemes(self):
        """Get the information and comments of the specified post from URLs or IDs"""