feat: support time delay for all platforms

程序员阿江(Relakkes)
2025-09-02 16:43:09 +08:00
parent eb799e1fa7
commit 2bce3593f7
8 changed files with 151 additions and 48 deletions
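The change is the same across every platform crawler: the per-call random sleeps (random.random(), random.uniform(...), random.randint(...)) are replaced by one fixed, configurable pause read from config.CRAWLER_MAX_SLEEP_SEC, plus a log line stating how long the crawler slept. The sketch below illustrates that before/after pattern in isolation; the client object, its search method, and the import paths are stand-ins for this example and are not taken from the commit itself.

```python
import asyncio
import random

import config  # project config module that exposes CRAWLER_MAX_SLEEP_SEC
from tools import utils  # assumed logger location, mirroring the crawlers in this diff


async def fetch_page_before(client, keyword: str, page: int):
    # Old behavior: jittered delay chosen independently at each call site.
    result = await client.search(keyword, page)
    await asyncio.sleep(random.uniform(0.5, 1.5))
    return result


async def fetch_page_after(client, keyword: str, page: int):
    # New behavior: one fixed, configurable delay after every request, plus a log line.
    result = await client.search(keyword, page)
    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
    utils.logger.info(
        f"Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page}"
    )
    return result
```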

View File

@@ -317,14 +317,3 @@ Thordata是全球代理IP解决方案提供商支持大规模采集公共网
 ## 6. 最终解释权
 关于本项目的最终解释权归开发者所有。开发者保留随时更改或更新本免责声明的权利,恕不另行通知。
 </div>
-## 🙏 致谢
-### JetBrains 开源许可证支持
-感谢 JetBrains 为本项目提供免费的开源许可证支持!
-<a href="https://www.jetbrains.com/?from=MediaCrawler">
-<img src="https://www.jetbrains.com/company/brand/img/jetbrains_logo.png" width="100" alt="JetBrains" />
-</a>

View File

@@ -15,7 +15,7 @@
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime, timedelta
@@ -208,6 +208,11 @@ class BilibiliCrawler(AbstractCrawler):
 await bilibili_store.update_up_info(video_item)
 await self.get_bilibili_video(video_item, semaphore)
 page += 1
+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
 await self.batch_get_video_comments(video_id_list)
 async def search_by_keywords_in_time_range(self, daily_limit: bool):
@@ -284,6 +289,11 @@ class BilibiliCrawler(AbstractCrawler):
 await self.get_bilibili_video(video_item, semaphore)
 page += 1
+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
 await self.batch_get_video_comments(video_id_list)
 except Exception as e:
@@ -318,10 +328,11 @@ class BilibiliCrawler(AbstractCrawler):
 async with semaphore:
 try:
 utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
-await asyncio.sleep(random.uniform(0.5, 1.5))
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching comments for video {video_id}")
 await self.bili_client.get_video_all_comments(
 video_id=video_id,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
 is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
 callback=bilibili_store.batch_update_bilibili_video_comments,
 max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
@@ -347,7 +358,8 @@ class BilibiliCrawler(AbstractCrawler):
 await self.get_specified_videos(video_bvids_list)
 if int(result["page"]["count"]) <= pn * ps:
 break
-await asyncio.sleep(random.random())
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
 pn += 1
 async def get_specified_videos(self, bvids_list: List[str]):
@@ -381,6 +393,11 @@ class BilibiliCrawler(AbstractCrawler):
 async with semaphore:
 try:
 result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
+# Sleep after fetching video details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {bvid or aid}")
 return result
 except DataFetchError as ex:
 utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
@@ -544,7 +561,8 @@ class BilibiliCrawler(AbstractCrawler):
 return
 content = await self.bili_client.get_video_media(video_url)
-await asyncio.sleep(random.random())
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video {aid}")
 if content is None:
 return
 extension_file_name = f"video.mp4"
@@ -600,7 +618,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...") utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
await self.bili_client.get_creator_all_fans( await self.bili_client.get_creator_all_fans(
creator_info=creator_info, creator_info=creator_info,
crawl_interval=random.random(), crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_fans, callback=bilibili_store.batch_update_bilibili_creator_fans,
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES, max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
) )
@@ -623,7 +641,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...") utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
await self.bili_client.get_creator_all_followings( await self.bili_client.get_creator_all_followings(
creator_info=creator_info, creator_info=creator_info,
crawl_interval=random.random(), crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_followings, callback=bilibili_store.batch_update_bilibili_creator_followings,
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES, max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
) )
@@ -646,7 +664,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...") utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
await self.bili_client.get_creator_all_dynamics( await self.bili_client.get_creator_all_dynamics(
creator_info=creator_info, creator_info=creator_info,
crawl_interval=random.random(), crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_dynamics, callback=bilibili_store.batch_update_bilibili_creator_dynamics,
max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES, max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
) )

View File

@@ -147,6 +147,9 @@ class DouYinCrawler(AbstractCrawler):
 aweme_list.append(aweme_info.get("aweme_id", ""))
 await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
 await self.get_aweme_media(aweme_item=aweme_info)
+# Sleep after each page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
 utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
 await self.batch_get_note_comments(aweme_list)
@@ -165,7 +168,11 @@ class DouYinCrawler(AbstractCrawler):
"""Get note detail""" """Get note detail"""
async with semaphore: async with semaphore:
try: try:
return await self.dy_client.get_video_by_id(aweme_id) result = await self.dy_client.get_video_by_id(aweme_id)
# Sleep after fetching aweme detail
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[DouYinCrawler.get_aweme_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching aweme {aweme_id}")
return result
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}") utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
return None return None
@@ -193,13 +200,18 @@ class DouYinCrawler(AbstractCrawler):
 async with semaphore:
 try:
 # 将关键词列表传递给 get_aweme_all_comments 方法
+# Use fixed crawling interval
+crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
 await self.dy_client.get_aweme_all_comments(
 aweme_id=aweme_id,
-crawl_interval=random.random(),
+crawl_interval=crawl_interval,
 is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
 callback=douyin_store.batch_update_dy_aweme_comments,
 max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
 )
+# Sleep after fetching comments
+await asyncio.sleep(crawl_interval)
+utils.logger.info(f"[DouYinCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for aweme {aweme_id}")
 utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
 except DataFetchError as e:
 utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")

View File

@@ -11,7 +11,7 @@
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 import time
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -159,6 +159,11 @@ class KuaishouCrawler(AbstractCrawler):
 # batch fetch video comments
 page += 1
+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[KuaishouCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
 await self.batch_get_video_comments(video_id_list)
 async def get_specified_videos(self):
@@ -181,6 +186,11 @@ class KuaishouCrawler(AbstractCrawler):
 async with semaphore:
 try:
 result = await self.ks_client.get_video_info(video_id)
+# Sleep after fetching video details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}")
 utils.logger.info(
 f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
 )
@@ -234,9 +244,14 @@ class KuaishouCrawler(AbstractCrawler):
 utils.logger.info(
 f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
 )
+# Sleep before fetching comments
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[KuaishouCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for video {video_id}")
 await self.ks_client.get_video_all_comments(
 photo_id=video_id,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
 callback=kuaishou_store.batch_update_ks_video_comments,
 max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
 )

View File

@@ -11,7 +11,7 @@
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -141,6 +141,11 @@ class TieBaCrawler(AbstractCrawler):
 await self.get_specified_notes(
 note_id_list=[note_detail.note_id for note_detail in notes_list]
 )
+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[TieBaCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page}")
 page += 1
 except Exception as ex:
 utils.logger.error(
@@ -178,6 +183,11 @@ class TieBaCrawler(AbstractCrawler):
f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}" f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
) )
await self.get_specified_notes([note.note_id for note in note_list]) await self.get_specified_notes([note.note_id for note in note_list])
# Sleep after processing notes
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[TieBaCrawler.get_specified_tieba_notes] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after processing notes from page {page_number}")
page_number += tieba_limit_count page_number += tieba_limit_count
async def get_specified_notes( async def get_specified_notes(
@@ -222,6 +232,11 @@ class TieBaCrawler(AbstractCrawler):
f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}" f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
) )
note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id) note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
# Sleep after fetching note details
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[TieBaCrawler.get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
if not note_detail: if not note_detail:
utils.logger.error( utils.logger.error(
f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}" f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
@@ -277,9 +292,14 @@ class TieBaCrawler(AbstractCrawler):
 utils.logger.info(
 f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
 )
+# Sleep before fetching comments
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[TieBaCrawler.get_comments_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_detail.note_id}")
 await self.tieba_client.get_note_all_comments(
 note_detail=note_detail,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
 callback=tieba_store.batch_update_tieba_note_comments,
 max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
 )

View File

@@ -15,7 +15,7 @@
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -160,6 +160,11 @@ class WeiboCrawler(AbstractCrawler):
 await self.get_note_images(mblog)
 page += 1
+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[WeiboCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
 await self.batch_get_notes_comments(note_id_list)
 async def get_specified_notes(self):
@@ -185,6 +190,11 @@ class WeiboCrawler(AbstractCrawler):
 async with semaphore:
 try:
 result = await self.wb_client.get_note_info_by_id(note_id)
+# Sleep after fetching note details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[WeiboCrawler.get_note_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
 return result
 except DataFetchError as ex:
 utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
@@ -221,9 +231,14 @@ class WeiboCrawler(AbstractCrawler):
 async with semaphore:
 try:
 utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
+# Sleep before fetching comments
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[WeiboCrawler.get_note_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_id}")
 await self.wb_client.get_note_all_comments(
 note_id=note_id,
-crawl_interval=random.randint(1, 3), # 微博对API的限流比较严重所以延时提高一些
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC, # Use fixed interval instead of random
 callback=weibo_store.batch_update_weibo_note_comments,
 max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
 )
@@ -250,7 +265,8 @@ class WeiboCrawler(AbstractCrawler):
 if not url:
 continue
 content = await self.wb_client.get_note_image(url)
-await asyncio.sleep(random.random())
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[WeiboCrawler.get_note_images] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching image")
 if content != None:
 extension_file_name = url.split(".")[-1]
 await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)

View File

@@ -11,9 +11,8 @@
 import asyncio
 import os
 import random
-import time
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional
 from playwright.async_api import (
 BrowserContext,
@@ -164,6 +163,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
 page += 1
 utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
 await self.batch_get_note_comments(note_ids, xsec_tokens)
+# Sleep after each page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[XiaoHongShuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
 except DataFetchError:
 utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
 break
@@ -177,11 +180,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
 if createor_info:
 await xhs_store.save_creator(user_id, creator=createor_info)
-# When proxy is not enabled, increase the crawling interval
-if config.ENABLE_IP_PROXY:
-crawl_interval = random.random()
-else:
-crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+# Use fixed crawling interval
+crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
 # Get all note information of the creator
 all_notes_list = await self.xhs_client.get_all_notes_by_creator(
 user_id=user_id,
@@ -280,6 +280,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
 raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
 note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
+# Sleep after fetching note detail
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note {note_id}")
 return note_detail
 except DataFetchError as ex:
@@ -310,11 +315,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
"""Get note comments with keyword filtering and quantity limitation""" """Get note comments with keyword filtering and quantity limitation"""
async with semaphore: async with semaphore:
utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}") utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
# When proxy is not enabled, increase the crawling interval # Use fixed crawling interval
if config.ENABLE_IP_PROXY: crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
crawl_interval = random.random()
else:
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
await self.xhs_client.get_note_all_comments( await self.xhs_client.get_note_all_comments(
note_id=note_id, note_id=note_id,
xsec_token=xsec_token, xsec_token=xsec_token,
@@ -322,6 +324,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
 callback=xhs_store.batch_update_xhs_note_comments,
 max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
 )
+# Sleep after fetching comments
+await asyncio.sleep(crawl_interval)
+utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for note {note_id}")
 async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
 """Create xhs client"""

View File

@@ -12,7 +12,7 @@
 # -*- coding: utf-8 -*-
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple, cast
@@ -170,6 +170,10 @@ class ZhihuCrawler(AbstractCrawler):
utils.logger.info("No more content!") utils.logger.info("No more content!")
break break
# Sleep after page navigation
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[ZhihuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
page += 1 page += 1
for content in content_list: for content in content_list:
await zhihu_store.update_zhihu_content(content) await zhihu_store.update_zhihu_content(content)
@@ -219,9 +223,14 @@ class ZhihuCrawler(AbstractCrawler):
 utils.logger.info(
 f"[ZhihuCrawler.get_comments] Begin get note id comments {content_item.content_id}"
 )
+# Sleep before fetching comments
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[ZhihuCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for content {content_item.content_id}")
 await self.zhihu_client.get_note_all_comments(
 content=content_item,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
 callback=zhihu_store.batch_update_zhihu_note_comments,
 )
@@ -259,21 +268,21 @@ class ZhihuCrawler(AbstractCrawler):
 # Get all anwser information of the creator
 all_content_list = await self.zhihu_client.get_all_anwser_by_creator(
 creator=createor_info,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
 callback=zhihu_store.batch_update_zhihu_contents,
 )
 # Get all articles of the creator's contents
 # all_content_list = await self.zhihu_client.get_all_articles_by_creator(
 # creator=createor_info,
-# crawl_interval=random.random(),
+# crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
 # callback=zhihu_store.batch_update_zhihu_contents
 # )
 # Get all videos of the creator's contents
 # all_content_list = await self.zhihu_client.get_all_videos_by_creator(
 # creator=createor_info,
-# crawl_interval=random.random(),
+# crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
 # callback=zhihu_store.batch_update_zhihu_contents
 # )
@@ -304,21 +313,39 @@ class ZhihuCrawler(AbstractCrawler):
 utils.logger.info(
 f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
 )
-return await self.zhihu_client.get_answer_info(question_id, answer_id)
+result = await self.zhihu_client.get_answer_info(question_id, answer_id)
+# Sleep after fetching answer details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[ZhihuCrawler.get_note_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching answer details {answer_id}")
+return result
 elif note_type == constant.ARTICLE_NAME:
 article_id = full_note_url.split("/")[-1]
 utils.logger.info(
 f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
 )
-return await self.zhihu_client.get_article_info(article_id)
+result = await self.zhihu_client.get_article_info(article_id)
+# Sleep after fetching article details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[ZhihuCrawler.get_note_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching article details {article_id}")
+return result
 elif note_type == constant.VIDEO_NAME:
 video_id = full_note_url.split("/")[-1]
 utils.logger.info(
 f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
 )
-return await self.zhihu_client.get_video_info(video_id)
+result = await self.zhihu_client.get_video_info(video_id)
+# Sleep after fetching video details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[ZhihuCrawler.get_note_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}")
+return result
 async def get_specified_notes(self):
 """