From 2bce3593f7cfb6cd86bc187a28b5308089426655 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?=
Date: Tue, 2 Sep 2025 16:43:09 +0800
Subject: [PATCH] feat: support time delay for all platforms

Replace the per-platform random crawl intervals with the fixed
config.CRAWLER_MAX_SLEEP_SEC value, and sleep after page navigation and
around detail and comment fetches on every supported platform.
---
 README.md                       | 11 ---------
 media_platform/bilibili/core.py | 34 ++++++++++++++++++++------
 media_platform/douyin/core.py   | 16 ++++++++++--
 media_platform/kuaishou/core.py | 19 +++++++++++++--
 media_platform/tieba/core.py    | 24 ++++++++++++++++--
 media_platform/weibo/core.py    | 22 ++++++++++++++---
 media_platform/xhs/core.py      | 30 ++++++++++++++---------
 media_platform/zhihu/core.py    | 43 +++++++++++++++++++++++++++------
 8 files changed, 151 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index e7b075f..b7a2cbd 100644
--- a/README.md
+++ b/README.md
@@ -317,14 +317,3 @@ Thordata是全球代理IP解决方案提供商,支持大规模采集公共网
 ## 6. 最终解释权
 关于本项目的最终解释权归开发者所有。开发者保留随时更改或更新本免责声明的权利,恕不另行通知。
-
-
-## 🙏 致谢
-
-### JetBrains 开源许可证支持
-
-感谢 JetBrains 为本项目提供免费的开源许可证支持!
-
-  JetBrains
-
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index e63d31a..1c9c175 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -15,7 +15,7 @@
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime, timedelta
@@ -208,6 +208,11 @@ class BilibiliCrawler(AbstractCrawler):
                         await bilibili_store.update_up_info(video_item)
                         await self.get_bilibili_video(video_item, semaphore)
                 page += 1
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
             await self.batch_get_video_comments(video_id_list)

     async def search_by_keywords_in_time_range(self, daily_limit: bool):
@@ -284,6 +289,11 @@ class BilibiliCrawler(AbstractCrawler):
                             await self.get_bilibili_video(video_item, semaphore)
                     page += 1
+
+                    # Sleep after page navigation
+                    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                    utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
                 await self.batch_get_video_comments(video_id_list)

             except Exception as e:
@@ -318,10 +328,11 @@ class BilibiliCrawler(AbstractCrawler):
         async with semaphore:
             try:
                 utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
-                await asyncio.sleep(random.uniform(0.5, 1.5))
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for video {video_id}")
                 await self.bili_client.get_video_all_comments(
                     video_id=video_id,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                     callback=bilibili_store.batch_update_bilibili_video_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
@@ -347,7 +358,8 @@ class BilibiliCrawler(AbstractCrawler):
             await self.get_specified_videos(video_bvids_list)
             if int(result["page"]["count"]) <= pn * ps:
                 break
-            await asyncio.sleep(random.random())
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
             pn += 1

     async def get_specified_videos(self, bvids_list: List[str]):
@@ -381,6 +393,11 @@ class BilibiliCrawler(AbstractCrawler):
         async with semaphore:
             try:
                 result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
+
+                # Sleep after fetching video details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {bvid or aid}")
+
                 return result
             except DataFetchError as ex:
                 utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
@@ -544,7 +561,8 @@ class BilibiliCrawler(AbstractCrawler):
             return
         content = await self.bili_client.get_video_media(video_url)
-        await asyncio.sleep(random.random())
+        await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+        utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video {aid}")
         if content is None:
             return
         extension_file_name = f"video.mp4"
@@ -600,7 +618,7 @@ class BilibiliCrawler(AbstractCrawler):
             utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
             await self.bili_client.get_creator_all_fans(
                 creator_info=creator_info,
-                crawl_interval=random.random(),
+                crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                 callback=bilibili_store.batch_update_bilibili_creator_fans,
                 max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
             )
@@ -623,7 +641,7 @@ class BilibiliCrawler(AbstractCrawler):
             utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
             await self.bili_client.get_creator_all_followings(
                 creator_info=creator_info,
-                crawl_interval=random.random(),
+                crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                 callback=bilibili_store.batch_update_bilibili_creator_followings,
                 max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
             )
@@ -646,7 +664,7 @@ class BilibiliCrawler(AbstractCrawler):
             utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
             await self.bili_client.get_creator_all_dynamics(
                 creator_info=creator_info,
-                crawl_interval=random.random(),
+                crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                 callback=bilibili_store.batch_update_bilibili_creator_dynamics,
                 max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
             )
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 1d7ce4d..191e6ab 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -147,6 +147,9 @@ class DouYinCrawler(AbstractCrawler):
                     aweme_list.append(aweme_info.get("aweme_id", ""))
                     await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
                     await self.get_aweme_media(aweme_item=aweme_info)
+                # Sleep after each page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
             utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
@@ -165,7 +168,11 @@ class DouYinCrawler(AbstractCrawler):
         """Get note detail"""
         async with semaphore:
             try:
-                return await self.dy_client.get_video_by_id(aweme_id)
+                result = await self.dy_client.get_video_by_id(aweme_id)
+                # Sleep after fetching aweme detail
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[DouYinCrawler.get_aweme_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching aweme {aweme_id}")
+                return result
             except DataFetchError as ex:
                 utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
                 return None
@@ -193,13 +200,18 @@ class DouYinCrawler(AbstractCrawler):
         async with semaphore:
             try:
                 # 将关键词列表传递给 get_aweme_all_comments 方法
+                # Use fixed crawling interval
+                crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
                 await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
-                    crawl_interval=random.random(),
+                    crawl_interval=crawl_interval,
                     is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                     callback=douyin_store.batch_update_dy_aweme_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
+                # Sleep after fetching comments
+                await asyncio.sleep(crawl_interval)
+                utils.logger.info(f"[DouYinCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for aweme {aweme_id}")
                 utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
             except DataFetchError as e:
                 utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py
index 4ae1d63..4e543cd 100644
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@@ -11,7 +11,7 @@
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 import time
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -159,6 +159,11 @@ class KuaishouCrawler(AbstractCrawler):
                 # batch fetch video comments
                 page += 1
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[KuaishouCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
                 await self.batch_get_video_comments(video_id_list)

     async def get_specified_videos(self):
@@ -181,6 +186,11 @@ class KuaishouCrawler(AbstractCrawler):
         async with semaphore:
             try:
                 result = await self.ks_client.get_video_info(video_id)
+
+                # Sleep after fetching video details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}")
+
                 utils.logger.info(
                     f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
                 )
@@ -234,9 +244,14 @@ class KuaishouCrawler(AbstractCrawler):
                 utils.logger.info(
                     f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
                 )
+
+                # Sleep before fetching comments
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[KuaishouCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for video {video_id}")
+
                 await self.ks_client.get_video_all_comments(
                     photo_id=video_id,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     callback=kuaishou_store.batch_update_ks_video_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py
index 8635104..c166495 100644
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -11,7 +11,7 @@
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -141,6 +141,11 @@ class TieBaCrawler(AbstractCrawler):
                 await self.get_specified_notes(
                     note_id_list=[note_detail.note_id for note_detail in notes_list]
                 )
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[TieBaCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page}")
+
                 page += 1
             except Exception as ex:
                 utils.logger.error(
@@ -178,6 +183,11 @@ class TieBaCrawler(AbstractCrawler):
                 f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
             )
             await self.get_specified_notes([note.note_id for note in note_list])
+
+            # Sleep after processing notes
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[TieBaCrawler.get_specified_tieba_notes] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after processing notes from page {page_number}")
+
             page_number += tieba_limit_count

     async def get_specified_notes(
         self,
@@ -222,6 +232,11 @@ class TieBaCrawler(AbstractCrawler):
                     f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
                 )
                 note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
+
+                # Sleep after fetching note details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[TieBaCrawler.get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
                 if not note_detail:
                     utils.logger.error(
                         f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
@@ -277,9 +292,14 @@ class TieBaCrawler(AbstractCrawler):
             utils.logger.info(
                 f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
             )
+
+            # Sleep before fetching comments
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[TieBaCrawler.get_comments_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_detail.note_id}")
+
             await self.tieba_client.get_note_all_comments(
                 note_detail=note_detail,
-                crawl_interval=random.random(),
+                crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                 callback=tieba_store.batch_update_tieba_note_comments,
                 max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
             )
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
index 552801f..d502386 100644
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -15,7 +15,7 @@
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -160,6 +160,11 @@ class WeiboCrawler(AbstractCrawler):
                         await self.get_note_images(mblog)
                 page += 1
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
             await self.batch_get_notes_comments(note_id_list)

     async def get_specified_notes(self):
@@ -185,6 +190,11 @@ class WeiboCrawler(AbstractCrawler):
         async with semaphore:
             try:
                 result = await self.wb_client.get_note_info_by_id(note_id)
+
+                # Sleep after fetching note details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.get_note_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
                 return result
             except DataFetchError as ex:
                 utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
@@ -221,9 +231,14 @@ class WeiboCrawler(AbstractCrawler):
         async with semaphore:
             try:
                 utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
+
+                # Sleep before fetching comments
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.get_note_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_id}")
+
                 await self.wb_client.get_note_all_comments(
                     note_id=note_id,
-                    crawl_interval=random.randint(1, 3),  # 微博对API的限流比较严重,所以延时提高一些
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,  # Use fixed interval instead of random
                     callback=weibo_store.batch_update_weibo_note_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
@@ -250,7 +265,8 @@ class WeiboCrawler(AbstractCrawler):
             if not url:
                 continue
             content = await self.wb_client.get_note_image(url)
-            await asyncio.sleep(random.random())
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[WeiboCrawler.get_note_images] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching image")
             if content != None:
                 extension_file_name = url.split(".")[-1]
                 await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 9c88f1c..f228392 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -11,9 +11,8 @@
 import asyncio
 import os
 import random
-import time
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional

 from playwright.async_api import (
     BrowserContext,
@@ -164,6 +163,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
                     page += 1
                     utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
                     await self.batch_get_note_comments(note_ids, xsec_tokens)
+
+                    # Sleep after each page navigation
+                    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
                 except DataFetchError:
                     utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
                     break
@@ -177,11 +180,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
         if createor_info:
             await xhs_store.save_creator(user_id, creator=createor_info)

-        # When proxy is not enabled, increase the crawling interval
-        if config.ENABLE_IP_PROXY:
-            crawl_interval = random.random()
-        else:
-            crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+        # Use fixed crawling interval
+        crawl_interval = config.CRAWLER_MAX_SLEEP_SEC

         # Get all note information of the creator
         all_notes_list = await self.xhs_client.get_all_notes_by_creator(
             user_id=user_id,
@@ -280,6 +280,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
                     raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")

                 note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
+
+                # Sleep after fetching note detail
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note {note_id}")
+
                 return note_detail

             except DataFetchError as ex:
@@ -310,11 +315,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
         """Get note comments with keyword filtering and quantity limitation"""
         async with semaphore:
             utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
-            # When proxy is not enabled, increase the crawling interval
-            if config.ENABLE_IP_PROXY:
-                crawl_interval = random.random()
-            else:
-                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+            # Use fixed crawling interval
+            crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
             await self.xhs_client.get_note_all_comments(
                 note_id=note_id,
                 xsec_token=xsec_token,
@@ -322,6 +324,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
                 callback=xhs_store.batch_update_xhs_note_comments,
                 max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
             )
+
+            # Sleep after fetching comments
+            await asyncio.sleep(crawl_interval)
+            utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for note {note_id}")

     async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
         """Create xhs client"""
diff --git a/media_platform/zhihu/core.py b/media_platform/zhihu/core.py
index 9ef72b6..ea87e1c 100644
--- a/media_platform/zhihu/core.py
+++ b/media_platform/zhihu/core.py
@@ -12,7 +12,7 @@
 # -*- coding: utf-8 -*-
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple, cast
@@ -170,6 +170,10 @@ class ZhihuCrawler(AbstractCrawler):
                     utils.logger.info("No more content!")
                     break

+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[ZhihuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
                 page += 1
                 for content in content_list:
                     await zhihu_store.update_zhihu_content(content)
@@ -219,9 +223,14 @@ class ZhihuCrawler(AbstractCrawler):
             utils.logger.info(
                 f"[ZhihuCrawler.get_comments] Begin get note id comments {content_item.content_id}"
             )
+
+            # Sleep before fetching comments
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[ZhihuCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for content {content_item.content_id}")
+
             await self.zhihu_client.get_note_all_comments(
                 content=content_item,
-                crawl_interval=random.random(),
+                crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                 callback=zhihu_store.batch_update_zhihu_note_comments,
             )
@@ -259,21 +268,21 @@ class ZhihuCrawler(AbstractCrawler):
         # Get all anwser information of the creator
         all_content_list = await self.zhihu_client.get_all_anwser_by_creator(
             creator=createor_info,
-            crawl_interval=random.random(),
+            crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
             callback=zhihu_store.batch_update_zhihu_contents,
         )

         # Get all articles of the creator's contents
         # all_content_list = await self.zhihu_client.get_all_articles_by_creator(
         #     creator=createor_info,
-        #     crawl_interval=random.random(),
+        #     crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
         #     callback=zhihu_store.batch_update_zhihu_contents
         # )

         # Get all videos of the creator's contents
         # all_content_list = await self.zhihu_client.get_all_videos_by_creator(
         #     creator=createor_info,
-        #     crawl_interval=random.random(),
+        #     crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
         #     callback=zhihu_store.batch_update_zhihu_contents
         # )
@@ -304,21 +313,39 @@ class ZhihuCrawler(AbstractCrawler):
             utils.logger.info(
                 f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
             )
-            return await self.zhihu_client.get_answer_info(question_id, answer_id)
+            result = await self.zhihu_client.get_answer_info(question_id, answer_id)
+
+            # Sleep after fetching answer details
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[ZhihuCrawler.get_note_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching answer details {answer_id}")
+
+            return result

         elif note_type == constant.ARTICLE_NAME:
             article_id = full_note_url.split("/")[-1]
             utils.logger.info(
                 f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
             )
-            return await self.zhihu_client.get_article_info(article_id)
+            result = await self.zhihu_client.get_article_info(article_id)
+
+            # Sleep after fetching article details
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[ZhihuCrawler.get_note_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching article details {article_id}")
+
+            return result

         elif note_type == constant.VIDEO_NAME:
             video_id = full_note_url.split("/")[-1]
             utils.logger.info(
                 f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
             )
-            return await self.zhihu_client.get_video_info(video_id)
+            result = await self.zhihu_client.get_video_info(video_id)
+
+            # Sleep after fetching video details
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[ZhihuCrawler.get_note_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}")
+
+            return result

     async def get_specified_notes(self):
         """