mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-02-06 15:11:12 +08:00
feat: Enhance Bilibili crawler with retry logic and robustness
This commit introduces several improvements to enhance the stability and functionality of the Bilibili crawler.

- **Add Retry Logic:** Implement a retry mechanism with exponential backoff when fetching video comments. This makes the crawler more resilient to transient network issues or API errors.
- **Improve Error Handling:** Add a `try...except` block to handle potential `JSONDecodeError` in the Bilibili client, preventing crashes when the API returns an invalid response.
- **Ensure Clean Shutdown:** Refactor `main.py` to use a `try...finally` block, guaranteeing that the crawler and database connections are properly closed on exit, error, or `KeyboardInterrupt`.
- **Update Default Config:** Adjust default configuration values to increase concurrency, enable word cloud generation by default, and refine the Bilibili search mode for more practical usage.
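For quick reference, these are the defaults the commit changes, with old values noted in comments; the values are taken from the config diff below.

```python
# Defaults changed by this commit (old values in comments), per the diff below.
MAX_CONCURRENCY_NUM = 5                          # was 1
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 6       # was 1
ENABLE_GET_WORDCLOUD = True                      # was False
END_DAY = "2025-07-12"                           # was "2024-01-01"
BILI_SEARCH_MODE = "daily_limit_in_time_range"   # was "all_in_time_range"
```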
@@ -90,7 +90,7 @@ CRAWLER_MAX_NOTES_COUNT = 200
 MAX_NOTES_PER_DAY = 1
 
 # Number of concurrent crawler tasks
-MAX_CONCURRENCY_NUM = 1
+MAX_CONCURRENCY_NUM = 5
 
 # Whether to enable image crawling mode; disabled by default
 ENABLE_GET_IMAGES = False
@@ -99,7 +99,7 @@ ENABLE_GET_IMAGES = False
 ENABLE_GET_COMMENTS = True
 
 # Limit on the number of first-level comments crawled (per video/post)
-CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 1
+CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 6
 
 # Whether to enable crawling second-level (reply) comments; disabled by default
 # If an older version of the project used the db, add the table fields per schema/tables.sql line 287
@@ -202,7 +202,7 @@ ZHIHU_SPECIFIED_ID_LIST = [
 
 # Word cloud settings
 # Whether to enable generating a word cloud image from comments
-ENABLE_GET_WORDCLOUD = False
+ENABLE_GET_WORDCLOUD = True
 # Custom words and their groups
 # Rule format: xx:yy, where xx is the custom word and yy is the name of the group xx is assigned to.
 CUSTOM_WORDS = {
@@ -220,13 +220,13 @@ FONT_PATH = "./docs/STZHONGS.TTF"
 START_DAY = "2024-01-01"
 
 # End date for crawling; only supported for bilibili keyword search, YYYY-MM-DD format
-END_DAY = "2024-01-01"
+END_DAY = "2025-07-12"
 
 # Bilibili search mode; only takes effect when CRAWLER_TYPE="search"
 # "normal": search without a time range; returns at most about 1000 results.
 # "all_in_time_range": crawl as much data as possible within the range defined by START_DAY and END_DAY; the daily cap is MAX_NOTES_PER_DAY, but the total may exceed CRAWLER_MAX_NOTES_COUNT.
 # "daily_limit_in_time_range": within the specified time range, strictly respect both the MAX_NOTES_PER_DAY daily cap and the CRAWLER_MAX_NOTES_COUNT total cap.
-BILI_SEARCH_MODE = "all_in_time_range"
+BILI_SEARCH_MODE = "daily_limit_in_time_range"
 
 #!!! The settings below only apply to bilibili creator search
 # Whether to crawl comments on the creator's homepage, or crawl the creator's dynamics and relation lists (True for the former)
main.py
@@ -45,25 +45,29 @@ class CrawlerFactory:
         return crawler_class()
 
 async def main():
-    # parse cmd
-    await cmd_arg.parse_cmd()
-
-    # init db
-    if config.SAVE_DATA_OPTION == "db":
-        await db.init_db()
-
-    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
-    await crawler.start()
-
-    if config.SAVE_DATA_OPTION == "db":
-        await db.close()
+    # Init crawler
+    crawler: Optional[AbstractCrawler] = None
+    try:
+        # parse cmd
+        await cmd_arg.parse_cmd()
+
+        # init db
+        if config.SAVE_DATA_OPTION == "db":
+            await db.init_db()
+
+        crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
+        await crawler.start()
+    finally:
+        if crawler:
+            await crawler.close()
+        if config.SAVE_DATA_OPTION == "db":
+            await db.close()
 
 
 if __name__ == '__main__':
     try:
         # asyncio.run(main())
         asyncio.get_event_loop().run_until_complete(main())
     except KeyboardInterrupt:
         print("\n[main] Caught keyboard interrupt, exiting.")
         sys.exit()
@@ -53,7 +53,11 @@ class BilibiliClient(AbstractApiClient):
             method, url, timeout=self.timeout,
             **kwargs
         )
-        data: Dict = response.json()
+        try:
+            data: Dict = response.json()
+        except json.JSONDecodeError:
+            utils.logger.error(f"[BilibiliClient.request] Failed to decode JSON from response. status_code: {response.status_code}, response_text: {response.text}")
+            raise DataFetchError(f"Failed to decode JSON, content: {response.text}")
         if data.get("code") != 0:
             raise DataFetchError(data.get("message", "unkonw error"))
         else:
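The hunk above assumes `json` is imported in the client module (the import itself sits outside this hunk); with an httpx-style client, `Response.json()` uses the standard-library decoder by default, so `json.JSONDecodeError` is the exception to catch. A minimal standalone sketch of the same guard, with a hypothetical function name and a plain `ValueError` standing in for the project's `DataFetchError`:

```python
import json
from typing import Dict

import httpx


def parse_payload(response: httpx.Response) -> Dict:
    """Decode a JSON body, surfacing the raw text when decoding fails."""
    try:
        return response.json()
    except json.JSONDecodeError:
        # Keep the raw body in the error so malformed upstream responses are easy to debug.
        raise ValueError(f"Failed to decode JSON, content: {response.text}")
```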
@@ -235,13 +239,37 @@ class BilibiliClient(AbstractApiClient):
 
         :return:
         """
 
         result = []
         is_end = False
         next_page = 0
+        max_retries = 3
         while not is_end and len(result) < max_count:
-            comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
+            comments_res = None
+            for attempt in range(max_retries):
+                try:
+                    comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
+                    break  # Success
+                except DataFetchError as e:
+                    if attempt < max_retries - 1:
+                        delay = 5 * (2 ** attempt) + random.uniform(0, 1)
+                        utils.logger.warning(
+                            f"[BilibiliClient.get_video_all_comments] Retrying video_id {video_id} in {delay:.2f}s... (Attempt {attempt + 1}/{max_retries})"
+                        )
+                        await asyncio.sleep(delay)
+                    else:
+                        utils.logger.error(
+                            f"[BilibiliClient.get_video_all_comments] Max retries reached for video_id: {video_id}. Skipping comments. Error: {e}"
+                        )
+                        is_end = True
+                        break
+            if not comments_res:
+                break
             cursor_info: Dict = comments_res.get("cursor")
+            if not cursor_info:
+                utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Could not find 'cursor' in response for video_id: {video_id}. Skipping.")
+                break
+
             comment_list: List[Dict] = comments_res.get("replies", [])
             is_end = cursor_info.get("is_end")
             next_page = cursor_info.get("next")
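With `max_retries = 3`, the backoff formula above sleeps roughly 5 s after the first failure and 10 s after the second (plus up to a second of jitter); a third failure gives up and skips that video's comments. A standalone check of the schedule, using a hypothetical helper name but the same formula as the diff:

```python
import random


def backoff_delay(attempt: int) -> float:
    """Exponential backoff with jitter: ~5s for attempt 0, ~10s for attempt 1."""
    return 5 * (2 ** attempt) + random.uniform(0, 1)


# Sleeps only happen after attempts 0 and 1; attempt 2 is the final try.
print([round(backoff_delay(a)) for a in range(2)])  # roughly [5, 10]
```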
@@ -23,6 +23,7 @@ from datetime import datetime, timedelta
 import pandas as pd
 
 from playwright.async_api import (BrowserContext, BrowserType, Page, Playwright, async_playwright)
+from playwright._impl._errors import TargetClosedError
 
 import config
 from base.base_crawler import AbstractCrawler
@@ -95,15 +96,7 @@ class BilibiliCrawler(AbstractCrawler):
 
         crawler_type_var.set(config.CRAWLER_TYPE)
         if config.CRAWLER_TYPE == "search":
-            # Search for video and retrieve their comment information.
-            if config.BILI_SEARCH_MODE == "normal":
-                await self.search_by_keywords()
-            elif config.BILI_SEARCH_MODE == "all_in_time_range":
-                await self.search_by_keywords_in_time_range(daily_limit=False)
-            elif config.BILI_SEARCH_MODE == "daily_limit_in_time_range":
-                await self.search_by_keywords_in_time_range(daily_limit=True)
-            else:
-                utils.logger.warning(f"Unknown BILI_SEARCH_MODE: {config.BILI_SEARCH_MODE}")
+            await self.search()
         elif config.CRAWLER_TYPE == "detail":
             # Get the information and comments of the specified post
             await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
@@ -118,6 +111,20 @@ class BilibiliCrawler(AbstractCrawler):
         utils.logger.info(
             "[BilibiliCrawler.start] Bilibili Crawler finished ...")
 
+    async def search(self):
+        """
+        search bilibili video
+        """
+        # Search for video and retrieve their comment information.
+        if config.BILI_SEARCH_MODE == "normal":
+            await self.search_by_keywords()
+        elif config.BILI_SEARCH_MODE == "all_in_time_range":
+            await self.search_by_keywords_in_time_range(daily_limit=False)
+        elif config.BILI_SEARCH_MODE == "daily_limit_in_time_range":
+            await self.search_by_keywords_in_time_range(daily_limit=True)
+        else:
+            utils.logger.warning(f"Unknown BILI_SEARCH_MODE: {config.BILI_SEARCH_MODE}")
+
     @staticmethod
     async def get_pubtime_datetime(start: str = config.START_DAY, end: str = config.END_DAY) -> Tuple[str, str]:
         """
@@ -259,6 +266,8 @@
                     if video_item:
+                        if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT:
+                            break
                         if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
                             break
                         notes_count_this_day += 1
                         total_notes_crawled += 1
                         video_id_list.append(video_item.get("View").get("aid"))
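In daily-limit mode, the check added above makes the global cap binding alongside the per-day cap. A toy, self-contained simulation (hypothetical per-day result counts, not project code) of how the two caps interact:

```python
# Toy illustration of the two caps added above; the numbers are made up.
CRAWLER_MAX_NOTES_COUNT = 200   # global cap (enforced here only in daily-limit mode)
MAX_NOTES_PER_DAY = 1           # per-day cap

total_notes_crawled = 0
for day in range(365):              # one keyword search per day in the range
    notes_count_this_day = 0
    for _video in range(50):        # pretend each day's search returns 50 hits
        if total_notes_crawled >= CRAWLER_MAX_NOTES_COUNT:
            break
        if notes_count_this_day >= MAX_NOTES_PER_DAY:
            break
        notes_count_this_day += 1
        total_notes_crawled += 1

print(total_notes_crawled)  # 200: the global cap wins once enough days have passed
```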
@@ -305,6 +314,7 @@
         try:
             utils.logger.info(
                 f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
+            await asyncio.sleep(random.uniform(0.5, 1.5))
             await self.bili_client.get_video_all_comments(
                 video_id=video_id,
                 crawl_interval=random.random(),
@@ -509,13 +519,18 @@
 
     async def close(self):
         """Close browser context"""
-        # CDP mode requires special handling
-        if self.cdp_manager:
-            await self.cdp_manager.cleanup()
-            self.cdp_manager = None
-        else:
-            await self.browser_context.close()
-        utils.logger.info("[BilibiliCrawler.close] Browser context closed ...")
+        try:
+            # CDP mode requires special handling
+            if self.cdp_manager:
+                await self.cdp_manager.cleanup()
+                self.cdp_manager = None
+            elif self.browser_context:
+                await self.browser_context.close()
+            utils.logger.info("[BilibiliCrawler.close] Browser context closed ...")
+        except TargetClosedError:
+            utils.logger.warning("[BilibiliCrawler.close] Browser context was already closed.")
+        except Exception as e:
+            utils.logger.error(f"[BilibiliCrawler.close] An error occurred during close: {e}")
 
     async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
         """
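A compact sketch of the defensive close pattern introduced above. `TargetClosedError` comes from the import added earlier in this diff; the function name is hypothetical, and the real method also handles the CDP-manager branch shown in the hunk:

```python
from playwright._impl._errors import TargetClosedError


async def safe_close(browser_context) -> None:
    """Close a Playwright browser context, tolerating an already-closed one."""
    try:
        if browser_context:
            await browser_context.close()
    except TargetClosedError:
        # The context was already torn down (e.g. the browser window was closed manually).
        print("[close] Browser context was already closed.")
    except Exception as exc:
        # Mirror the broad catch in the diff so shutdown never raises.
        print(f"[close] An error occurred during close: {exc}")
```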