Mirror of https://github.com/NanmiCoder/MediaCrawler.git
Synced 2026-02-06 15:11:12 +08:00
feat: support time delay for all platforms
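Every file in this commit applies the same refactor: the per-request delay that was previously drawn at random (random.random(), random.uniform(...), random.randint(...)) is replaced by the fixed config.CRAWLER_MAX_SLEEP_SEC value, and a log line records each pause. A minimal before/after sketch of the pattern, assuming a config value like CRAWLER_MAX_SLEEP_SEC and a logger similar to the project's utils.logger (illustrative only, not code from this commit):

import asyncio
import logging
import random

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("crawler")

CRAWLER_MAX_SLEEP_SEC = 2  # stands in for config.CRAWLER_MAX_SLEEP_SEC

async def fetch_page_old(fetch, page: int):
    data = await fetch(page)
    await asyncio.sleep(random.uniform(0.5, 1.5))  # old behaviour: random jitter between requests
    return data

async def fetch_page_new(fetch, page: int):
    data = await fetch(page)
    # new behaviour: fixed, configurable delay plus an informational log line
    await asyncio.sleep(CRAWLER_MAX_SLEEP_SEC)
    logger.info(f"Sleeping for {CRAWLER_MAX_SLEEP_SEC} seconds after page {page}")
    return data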
README.md
@@ -317,14 +317,3 @@ Thordata is a global proxy IP solution provider, supporting large-scale collection of public web
## 6. Right of Final Interpretation
The final right of interpretation of this project belongs to the developer. The developer reserves the right to change or update this disclaimer at any time without prior notice.
</div>
-
-
-## 🙏 Acknowledgements
-
-### JetBrains Open Source License Support
-
-Thanks to JetBrains for providing free open-source license support for this project!
-
-<a href="https://www.jetbrains.com/?from=MediaCrawler">
-<img src="https://www.jetbrains.com/company/brand/img/jetbrains_logo.png" width="100" alt="JetBrains" />
-</a>
@@ -15,7 +15,7 @@

import asyncio
import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple, Union
from datetime import datetime, timedelta

@@ -208,6 +208,11 @@ class BilibiliCrawler(AbstractCrawler):
await bilibili_store.update_up_info(video_item)
await self.get_bilibili_video(video_item, semaphore)
page += 1

+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")

await self.batch_get_video_comments(video_id_list)

async def search_by_keywords_in_time_range(self, daily_limit: bool):

@@ -284,6 +289,11 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_bilibili_video(video_item, semaphore)

page += 1

+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")

await self.batch_get_video_comments(video_id_list)

except Exception as e:

@@ -318,10 +328,11 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
-await asyncio.sleep(random.uniform(0.5, 1.5))
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching comments for video {video_id}")
await self.bili_client.get_video_all_comments(
video_id=video_id,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
callback=bilibili_store.batch_update_bilibili_video_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,

@@ -347,7 +358,8 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_specified_videos(video_bvids_list)
if int(result["page"]["count"]) <= pn * ps:
break
-await asyncio.sleep(random.random())
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
pn += 1

async def get_specified_videos(self, bvids_list: List[str]):

@@ -381,6 +393,11 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)

+# Sleep after fetching video details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {bvid or aid}")

return result
except DataFetchError as ex:
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")

@@ -544,7 +561,8 @@ class BilibiliCrawler(AbstractCrawler):
return

content = await self.bili_client.get_video_media(video_url)
-await asyncio.sleep(random.random())
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video {aid}")
if content is None:
return
extension_file_name = f"video.mp4"

@@ -600,7 +618,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
await self.bili_client.get_creator_all_fans(
creator_info=creator_info,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_fans,
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)

@@ -623,7 +641,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
await self.bili_client.get_creator_all_followings(
creator_info=creator_info,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_followings,
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)

@@ -646,7 +664,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
await self.bili_client.get_creator_all_dynamics(
creator_info=creator_info,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_dynamics,
max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
)
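The crawl_interval arguments above are consumed inside the Bilibili client helpers (get_video_all_comments, get_creator_all_fans, get_creator_all_followings, get_creator_all_dynamics), which sleep between paginated requests; switching the argument from random.random() to config.CRAWLER_MAX_SLEEP_SEC therefore paces every page those helpers fetch, not just the call site. A rough sketch of how such a paginated helper typically uses the parameter (an assumption about its internals, not the actual BilibiliClient code):

import asyncio
from typing import Awaitable, Callable, Dict, List

async def paged_fetch_all(
    fetch_page: Callable[[int], Awaitable[Dict]],        # hypothetical single-page fetcher
    callback: Callable[[List[Dict]], Awaitable[None]],   # store-layer batch callback
    crawl_interval: float,
    max_count: int,
) -> int:
    """Page through results, pausing crawl_interval seconds between requests."""
    collected, cursor = 0, 0
    while collected < max_count:
        page = await fetch_page(cursor)
        items = page.get("items", [])
        if not items:
            break
        await callback(items)
        collected += len(items)
        cursor = page.get("next_cursor", cursor + 1)
        await asyncio.sleep(crawl_interval)  # the fixed delay now comes from config
    return collected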
@@ -147,6 +147,9 @@ class DouYinCrawler(AbstractCrawler):
aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
await self.get_aweme_media(aweme_item=aweme_info)
+# Sleep after each page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)

@@ -165,7 +168,11 @@ class DouYinCrawler(AbstractCrawler):
"""Get note detail"""
async with semaphore:
try:
-return await self.dy_client.get_video_by_id(aweme_id)
+result = await self.dy_client.get_video_by_id(aweme_id)
+# Sleep after fetching aweme detail
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[DouYinCrawler.get_aweme_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching aweme {aweme_id}")
+return result
except DataFetchError as ex:
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
return None

@@ -193,13 +200,18 @@ class DouYinCrawler(AbstractCrawler):
async with semaphore:
try:
-# Pass the keyword list to the get_aweme_all_comments method
+# Use fixed crawling interval
+crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id,
-crawl_interval=random.random(),
+crawl_interval=crawl_interval,
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
callback=douyin_store.batch_update_dy_aweme_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
+# Sleep after fetching comments
+await asyncio.sleep(crawl_interval)
+utils.logger.info(f"[DouYinCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for aweme {aweme_id}")
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
except DataFetchError as e:
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
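The get_aweme_detail hunk shows the recurring rewrite for detail fetches: "return await ..." becomes "result = await ...", the fixed sleep and log line run while the semaphore is still held, and only then is the result returned, so the delay also throttles how quickly concurrent tasks can issue their next request. A small sketch of that structure with a stand-in client (assumed names, not project code):

import asyncio

async def get_detail_task(item_id: str, client, semaphore: asyncio.Semaphore, sleep_sec: float):
    """Fetch one detail record, then pause before releasing the concurrency slot."""
    async with semaphore:
        try:
            result = await client.get_detail(item_id)  # hypothetical detail call
            await asyncio.sleep(sleep_sec)              # e.g. config.CRAWLER_MAX_SLEEP_SEC
            return result
        except Exception as ex:
            print(f"get_detail_task failed for {item_id}: {ex}")
            return None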
@@ -11,7 +11,7 @@

import asyncio
import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple

@@ -159,6 +159,11 @@ class KuaishouCrawler(AbstractCrawler):

# batch fetch video comments
page += 1

+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[KuaishouCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")

await self.batch_get_video_comments(video_id_list)

async def get_specified_videos(self):

@@ -181,6 +186,11 @@ class KuaishouCrawler(AbstractCrawler):
async with semaphore:
try:
result = await self.ks_client.get_video_info(video_id)

+# Sleep after fetching video details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}")

utils.logger.info(
f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
)

@@ -234,9 +244,14 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.info(
f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
)

+# Sleep before fetching comments
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[KuaishouCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for video {video_id}")

await self.ks_client.get_video_all_comments(
photo_id=video_id,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=kuaishou_store.batch_update_ks_video_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
@@ -11,7 +11,7 @@

import asyncio
import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple

@@ -141,6 +141,11 @@ class TieBaCrawler(AbstractCrawler):
await self.get_specified_notes(
note_id_list=[note_detail.note_id for note_detail in notes_list]
)

+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[TieBaCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page}")

page += 1
except Exception as ex:
utils.logger.error(

@@ -178,6 +183,11 @@ class TieBaCrawler(AbstractCrawler):
f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
)
await self.get_specified_notes([note.note_id for note in note_list])

+# Sleep after processing notes
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[TieBaCrawler.get_specified_tieba_notes] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after processing notes from page {page_number}")

page_number += tieba_limit_count

async def get_specified_notes(

@@ -222,6 +232,11 @@ class TieBaCrawler(AbstractCrawler):
f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
)
note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)

+# Sleep after fetching note details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[TieBaCrawler.get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")

if not note_detail:
utils.logger.error(
f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"

@@ -277,9 +292,14 @@ class TieBaCrawler(AbstractCrawler):
utils.logger.info(
f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
)

+# Sleep before fetching comments
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[TieBaCrawler.get_comments_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_detail.note_id}")

await self.tieba_client.get_note_all_comments(
note_detail=note_detail,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=tieba_store.batch_update_tieba_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
@@ -15,7 +15,7 @@

import asyncio
import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple

@@ -160,6 +160,11 @@ class WeiboCrawler(AbstractCrawler):
await self.get_note_images(mblog)

page += 1

+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[WeiboCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")

await self.batch_get_notes_comments(note_id_list)

async def get_specified_notes(self):

@@ -185,6 +190,11 @@ class WeiboCrawler(AbstractCrawler):
async with semaphore:
try:
result = await self.wb_client.get_note_info_by_id(note_id)

+# Sleep after fetching note details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[WeiboCrawler.get_note_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")

return result
except DataFetchError as ex:
utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")

@@ -221,9 +231,14 @@ class WeiboCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")

+# Sleep before fetching comments
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[WeiboCrawler.get_note_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_id}")

await self.wb_client.get_note_all_comments(
note_id=note_id,
-crawl_interval=random.randint(1, 3),  # Weibo rate-limits its API quite strictly, so use a longer delay
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,  # Use fixed interval instead of random
callback=weibo_store.batch_update_weibo_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)

@@ -250,7 +265,8 @@ class WeiboCrawler(AbstractCrawler):
if not url:
continue
content = await self.wb_client.get_note_image(url)
-await asyncio.sleep(random.random())
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[WeiboCrawler.get_note_images] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching image")
if content != None:
extension_file_name = url.split(".")[-1]
await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
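Weibo previously used random.randint(1, 3) precisely because its API rate limiting is strict, so the single CRAWLER_MAX_SLEEP_SEC value now has to be chosen with the most heavily limited platform in mind. A hedged sketch of what the setting might look like in the project's configuration (the file name and default value are assumptions, not taken from this diff):

# config/base_config.py (assumed location)
# Fixed delay, in seconds, applied after page navigations, detail fetches and comment batches
# across all platforms. Pick a value that satisfies the strictest platform; Weibo previously
# slept 1-3 seconds per comment request.
CRAWLER_MAX_SLEEP_SEC = 2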
@@ -11,9 +11,8 @@
import asyncio
import os
import random
import time
from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional

from playwright.async_api import (
BrowserContext,

@@ -164,6 +163,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
page += 1
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
await self.batch_get_note_comments(note_ids, xsec_tokens)

+# Sleep after each page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[XiaoHongShuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
except DataFetchError:
utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
break

@@ -177,11 +180,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
if createor_info:
await xhs_store.save_creator(user_id, creator=createor_info)

-# When proxy is not enabled, increase the crawling interval
-if config.ENABLE_IP_PROXY:
-crawl_interval = random.random()
-else:
-crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+# Use fixed crawling interval
+crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
# Get all note information of the creator
all_notes_list = await self.xhs_client.get_all_notes_by_creator(
user_id=user_id,

@@ -280,6 +280,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")

note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})

+# Sleep after fetching note detail
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note {note_id}")

return note_detail

except DataFetchError as ex:

@@ -310,11 +315,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
"""Get note comments with keyword filtering and quantity limitation"""
async with semaphore:
utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
-# When proxy is not enabled, increase the crawling interval
-if config.ENABLE_IP_PROXY:
-crawl_interval = random.random()
-else:
-crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+# Use fixed crawling interval
+crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
await self.xhs_client.get_note_all_comments(
note_id=note_id,
xsec_token=xsec_token,

@@ -323,6 +325,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)

+# Sleep after fetching comments
+await asyncio.sleep(crawl_interval)
+utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for note {note_id}")

async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
"""Create xhs client"""
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
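The XiaoHongShu hunks also delete the proxy-aware branching: crawl_interval used to be random.random() when config.ENABLE_IP_PROXY was on and random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC) otherwise, so CRAWLER_MAX_SLEEP_SEC acted only as an upper bound. After this change the same key is the exact delay regardless of proxy settings. A side-by-side sketch of the two interval policies (illustrative only):

import random

ENABLE_IP_PROXY = False      # stands in for config.ENABLE_IP_PROXY
CRAWLER_MAX_SLEEP_SEC = 2    # stands in for config.CRAWLER_MAX_SLEEP_SEC

def old_interval() -> float:
    # before: shorter jittered delays when rotating proxies absorb the request rate
    if ENABLE_IP_PROXY:
        return random.random()
    return random.uniform(1, CRAWLER_MAX_SLEEP_SEC)

def new_interval() -> float:
    # after: one fixed, predictable delay for every request
    return CRAWLER_MAX_SLEEP_SEC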
@@ -12,7 +12,7 @@
# -*- coding: utf-8 -*-
import asyncio
import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple, cast

@@ -170,6 +170,10 @@ class ZhihuCrawler(AbstractCrawler):
utils.logger.info("No more content!")
break

+# Sleep after page navigation
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[ZhihuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")

page += 1
for content in content_list:
await zhihu_store.update_zhihu_content(content)

@@ -219,9 +223,14 @@ class ZhihuCrawler(AbstractCrawler):
utils.logger.info(
f"[ZhihuCrawler.get_comments] Begin get note id comments {content_item.content_id}"
)

+# Sleep before fetching comments
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[ZhihuCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for content {content_item.content_id}")

await self.zhihu_client.get_note_all_comments(
content=content_item,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=zhihu_store.batch_update_zhihu_note_comments,
)

@@ -259,21 +268,21 @@ class ZhihuCrawler(AbstractCrawler):
# Get all anwser information of the creator
all_content_list = await self.zhihu_client.get_all_anwser_by_creator(
creator=createor_info,
-crawl_interval=random.random(),
+crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=zhihu_store.batch_update_zhihu_contents,
)

# Get all articles of the creator's contents
# all_content_list = await self.zhihu_client.get_all_articles_by_creator(
# creator=createor_info,
-# crawl_interval=random.random(),
+# crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
# callback=zhihu_store.batch_update_zhihu_contents
# )

# Get all videos of the creator's contents
# all_content_list = await self.zhihu_client.get_all_videos_by_creator(
# creator=createor_info,
-# crawl_interval=random.random(),
+# crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
# callback=zhihu_store.batch_update_zhihu_contents
# )

@@ -304,21 +313,39 @@ class ZhihuCrawler(AbstractCrawler):
utils.logger.info(
f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
)
-return await self.zhihu_client.get_answer_info(question_id, answer_id)
+result = await self.zhihu_client.get_answer_info(question_id, answer_id)

+# Sleep after fetching answer details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[ZhihuCrawler.get_note_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching answer details {answer_id}")

+return result

elif note_type == constant.ARTICLE_NAME:
article_id = full_note_url.split("/")[-1]
utils.logger.info(
f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
)
-return await self.zhihu_client.get_article_info(article_id)
+result = await self.zhihu_client.get_article_info(article_id)

+# Sleep after fetching article details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[ZhihuCrawler.get_note_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching article details {article_id}")

+return result

elif note_type == constant.VIDEO_NAME:
video_id = full_note_url.split("/")[-1]
utils.logger.info(
f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
)
-return await self.zhihu_client.get_video_info(video_id)
+result = await self.zhihu_client.get_video_info(video_id)

+# Sleep after fetching video details
+await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+utils.logger.info(f"[ZhihuCrawler.get_note_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}")

+return result

async def get_specified_notes(self):
"""