feat: Enhance Bilibili crawler with retry logic and robustness

This commit introduces several improvements to enhance the stability and functionality of the Bilibili crawler.

- **Add Retry Logic:** Implement a retry mechanism with exponential backoff when fetching video comments. This makes the crawler more resilient to transient network issues or API errors.
- **Improve Error Handling:** Add a `try...except` block to handle potential `JSONDecodeError` in the Bilibili client, preventing crashes when the API returns an invalid response.
- **Ensure Clean Shutdown:** Refactor `main.py` to use a `try...finally` block, guaranteeing that the crawler and database connections are properly closed on exit, error, or `KeyboardInterrupt`.
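- **Update Default Config:** Adjust default configuration values: raise `MAX_CONCURRENCY_NUM` from 1 to 5 and `CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES` from 1 to 6, enable word cloud generation by default (`ENABLE_GET_WORDCLOUD = True`), extend `END_DAY` to 2025-07-12, and switch `BILI_SEARCH_MODE` from `"all_in_time_range"` to `"daily_limit_in_time_range"` for more practical usage.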
This commit is contained in:
gaoxiaobei
2025-07-13 10:42:15 +08:00
parent d0d7293926
commit e91ec750bb
4 changed files with 83 additions and 36 deletions

View File

@@ -90,7 +90,7 @@ CRAWLER_MAX_NOTES_COUNT = 200
MAX_NOTES_PER_DAY = 1
# 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 1
MAX_CONCURRENCY_NUM = 5
# 是否开启爬图片模式, 默认不开启爬图片
ENABLE_GET_IMAGES = False
@@ -99,7 +99,7 @@ ENABLE_GET_IMAGES = False
ENABLE_GET_COMMENTS = True
# 爬取一级评论的数量控制(单视频/帖子)
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 1
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 6
# 是否开启爬二级评论模式, 默认不开启爬二级评论
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
@@ -202,7 +202,7 @@ ZHIHU_SPECIFIED_ID_LIST = [
# 词云相关
# 是否开启生成评论词云图
ENABLE_GET_WORDCLOUD = False
ENABLE_GET_WORDCLOUD = True
# 自定义词语及其分组
# 添加规则:xx:yy,其中 xx 为自定义添加的词组,yy 为将 xx 该词组分到的组名。
CUSTOM_WORDS = {
@@ -220,13 +220,13 @@ FONT_PATH = "./docs/STZHONGS.TTF"
START_DAY = "2024-01-01"
# 爬取结束的天数,仅支持 bilibili 关键字搜索,YYYY-MM-DD 格式
END_DAY = "2024-01-01"
END_DAY = "2025-07-12"
# Bilibili 搜索模式,仅在 CRAWLER_TYPE="search" 时生效
# "normal": 不指定时间范围进行搜索,最多返回约 1000 条结果。
# "all_in_time_range": 在 START_DAY 和 END_DAY 指定的时间范围内,尽可能多地爬取数据,每日上限受 MAX_NOTES_PER_DAY 影响,但总数可能超过 CRAWLER_MAX_NOTES_COUNT。
# "daily_limit_in_time_range": 在指定时间范围内,严格遵守 MAX_NOTES_PER_DAY 的每日上限和 CRAWLER_MAX_NOTES_COUNT 的总上限。
BILI_SEARCH_MODE = "all_in_time_range"
BILI_SEARCH_MODE = "daily_limit_in_time_range"
#!!! 下面仅支持 bilibili creator搜索
# 爬取评论creator主页还是爬取creator动态和关系列表(True为前者)