Merge pull request #802 from Cae1anSou/fix/douyin-concurrent-comments

fix: fetch Douyin comments after each search-result page instead of waiting until all pages are collected
This commit is contained in:
程序员阿江-Relakkes
2026-01-03 22:38:26 +08:00
committed by GitHub

View File

@@ -151,19 +151,24 @@ class DouYinCrawler(AbstractCrawler):
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed账号也许被风控了。")
break
dy_search_id = posts_res.get("extra", {}).get("logid", "")
page_aweme_list = []
for post_item in posts_res.get("data"):
try:
aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
except TypeError:
continue
aweme_list.append(aweme_info.get("aweme_id", ""))
page_aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
await self.get_aweme_media(aweme_item=aweme_info)
# Batch get note comments for the current page
await self.batch_get_note_comments(page_aweme_list)
# Sleep after each page navigation
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
async def get_specified_awemes(self):
"""Get the information and comments of the specified post from URLs or IDs"""