From e91ec750bbd1c30b565e0141c86965e691145333 Mon Sep 17 00:00:00 2001
From: gaoxiaobei <99178334+gaoxiaobei@users.noreply.github.com>
Date: Sun, 13 Jul 2025 10:42:15 +0800
Subject: [PATCH] feat: Enhance Bilibili crawler with retry logic and
 robustness

This commit introduces several improvements to the stability and
functionality of the Bilibili crawler.

- **Add Retry Logic:** Implement a retry mechanism with exponential
  backoff when fetching video comments, making the crawler more resilient
  to transient network issues and API errors (see the sketch after this
  list).
- **Improve Error Handling:** Add a `try...except` block in the Bilibili
  client to handle a potential `JSONDecodeError`, preventing crashes when
  the API returns an invalid response.
- **Ensure Clean Shutdown:** Refactor `main.py` to use a `try...finally`
  block, guaranteeing that the crawler and database connections are
  properly closed on exit, error, or `KeyboardInterrupt`.
- **Update Default Config:** Adjust default configuration values to
  increase concurrency, enable word cloud generation by default, and
  refine the Bilibili search mode for more practical usage.
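For reviewers, the retry schedule added to `get_video_all_comments`
reduces to the pattern sketched below. This is a minimal standalone
illustration rather than code from the patch itself: `fetch_page` is a
hypothetical stand-in for `self.get_video_comments`, and the patch logs
and skips the video on final failure instead of re-raising.

```python
import asyncio
import random

async def fetch_with_retry(fetch_page, max_retries: int = 3):
    """Retry an async fetch with exponential backoff plus jitter."""
    for attempt in range(max_retries):
        try:
            return await fetch_page()
        except Exception:
            if attempt == max_retries - 1:
                raise  # out of retries; the patch logs and skips instead
            # Backoff grows as 5s, 10s, 20s, ... with up to 1s of jitter,
            # matching the 5 * (2 ** attempt) + random.uniform(0, 1)
            # schedule used in the patch.
            delay = 5 * (2 ** attempt) + random.uniform(0, 1)
            await asyncio.sleep(delay)
```

With the default `max_retries = 3`, a failing comment page is attempted
three times, waiting roughly 5s and then 10s between attempts.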
---
 config/base_config.py             | 10 +++----
 main.py                           | 28 ++++++++++--------
 media_platform/bilibili/client.py | 34 ++++++++++++++++++++--
 media_platform/bilibili/core.py   | 47 ++++++++++++++++++-----------
 4 files changed, 83 insertions(+), 36 deletions(-)

diff --git a/config/base_config.py b/config/base_config.py
index 26aeaa7..c3acde8 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -90,7 +90,7 @@ CRAWLER_MAX_NOTES_COUNT = 200
 MAX_NOTES_PER_DAY = 1
 
 # Controls the number of concurrent crawler tasks
-MAX_CONCURRENCY_NUM = 1
+MAX_CONCURRENCY_NUM = 5
 
 # Whether to enable image crawling (disabled by default)
 ENABLE_GET_IMAGES = False
@@ -99,7 +99,7 @@ ENABLE_GET_IMAGES = False
 ENABLE_GET_COMMENTS = True
 
 # Limit on the number of top-level comments crawled (per video/post)
-CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 1
+CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 6
 
 # Whether to enable crawling second-level (reply) comments (disabled by default)
 # Projects using the db from an older version must add the table field per schema/tables.sql line 287
@@ -202,7 +202,7 @@ ZHIHU_SPECIFIED_ID_LIST = [
 
 # Word cloud settings
 # Whether to generate a word cloud image from comments
-ENABLE_GET_WORDCLOUD = False
+ENABLE_GET_WORDCLOUD = True
 # Custom words and their groups
 # Rule for adding entries: xx:yy, where xx is the custom word and yy is the name of the group xx is assigned to.
 CUSTOM_WORDS = {
@@ -220,13 +220,13 @@ FONT_PATH = "./docs/STZHONGS.TTF"
 START_DAY = "2024-01-01"
 
 # End date for crawling; only supported for bilibili keyword search, YYYY-MM-DD format
-END_DAY = "2024-01-01"
+END_DAY = "2025-07-12"
 
 # Bilibili search mode; only takes effect when CRAWLER_TYPE="search"
 # "normal": search without a time range; returns at most ~1000 results.
 # "all_in_time_range": within the range given by START_DAY and END_DAY, crawl as much data as possible; the daily cap follows MAX_NOTES_PER_DAY, but the total may exceed CRAWLER_MAX_NOTES_COUNT.
 # "daily_limit_in_time_range": within the given time range, strictly respect both the MAX_NOTES_PER_DAY daily cap and the CRAWLER_MAX_NOTES_COUNT total cap.
-BILI_SEARCH_MODE = "all_in_time_range"
+BILI_SEARCH_MODE = "daily_limit_in_time_range"
 
 #!!! The settings below only apply to bilibili creator search
 # Whether to crawl comments on the creator's homepage, or the creator's dynamics and relation lists (True = the former)
diff --git a/main.py b/main.py
index 7292701..1dc5f71 100644
--- a/main.py
+++ b/main.py
@@ -45,25 +45,29 @@ class CrawlerFactory:
         return crawler_class()
 
 
 async def main():
+    # Init crawler
+    crawler: Optional[AbstractCrawler] = None
+    try:
+        # parse cmd
+        await cmd_arg.parse_cmd()
-    # parse cmd
-    await cmd_arg.parse_cmd()
+        # init db
+        if config.SAVE_DATA_OPTION == "db":
+            await db.init_db()
-    # init db
-    if config.SAVE_DATA_OPTION == "db":
-        await db.init_db()
+        crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
+        await crawler.start()
+    finally:
+        if crawler:
+            await crawler.close()
+        if config.SAVE_DATA_OPTION == "db":
+            await db.close()
-    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
-    await crawler.start()
-
-    if config.SAVE_DATA_OPTION == "db":
-        await db.close()
-
-
 if __name__ == '__main__':
     try:
         # asyncio.run(main())
         asyncio.get_event_loop().run_until_complete(main())
     except KeyboardInterrupt:
+        print("\n[main] Caught keyboard interrupt, exiting.")
         sys.exit()
diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py
index 32af357..9893f31 100644
--- a/media_platform/bilibili/client.py
+++ b/media_platform/bilibili/client.py
@@ -53,7 +53,11 @@ class BilibiliClient(AbstractApiClient):
             method, url, timeout=self.timeout, **kwargs
         )
-        data: Dict = response.json()
+        try:
+            data: Dict = response.json()
+        except json.JSONDecodeError:
+            utils.logger.error(f"[BilibiliClient.request] Failed to decode JSON from response. status_code: {response.status_code}, response_text: {response.text}")
+            raise DataFetchError(f"Failed to decode JSON, content: {response.text}")
         if data.get("code") != 0:
             raise DataFetchError(data.get("message", "unkonw error"))
         else:
@@ -235,13 +239,37 @@ class BilibiliClient(AbstractApiClient):
         :return:
         """
         result = []
         is_end = False
         next_page = 0
+        max_retries = 3
         while not is_end and len(result) < max_count:
-            comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
+            comments_res = None
+            for attempt in range(max_retries):
+                try:
+                    comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
+                    break  # Success
+                except DataFetchError as e:
+                    if attempt < max_retries - 1:
+                        delay = 5 * (2 ** attempt) + random.uniform(0, 1)
+                        utils.logger.warning(
+                            f"[BilibiliClient.get_video_all_comments] Retrying video_id {video_id} in {delay:.2f}s... (Attempt {attempt + 1}/{max_retries})"
+                        )
+                        await asyncio.sleep(delay)
+                    else:
+                        utils.logger.error(
+                            f"[BilibiliClient.get_video_all_comments] Max retries reached for video_id: {video_id}. Skipping comments. Error: {e}"
+                        )
+                        is_end = True
+                        break
+            if not comments_res:
+                break
+            cursor_info: Dict = comments_res.get("cursor")
+            if not cursor_info:
+                utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Could not find 'cursor' in response for video_id: {video_id}. Skipping.")
Skipping.") + break + comment_list: List[Dict] = comments_res.get("replies", []) is_end = cursor_info.get("is_end") next_page = cursor_info.get("next") diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 5fd3e9b..a8c89cb 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -23,6 +23,7 @@ from datetime import datetime, timedelta import pandas as pd from playwright.async_api import (BrowserContext, BrowserType, Page, Playwright, async_playwright) +from playwright._impl._errors import TargetClosedError import config from base.base_crawler import AbstractCrawler @@ -95,15 +96,7 @@ class BilibiliCrawler(AbstractCrawler): crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": - # Search for video and retrieve their comment information. - if config.BILI_SEARCH_MODE == "normal": - await self.search_by_keywords() - elif config.BILI_SEARCH_MODE == "all_in_time_range": - await self.search_by_keywords_in_time_range(daily_limit=False) - elif config.BILI_SEARCH_MODE == "daily_limit_in_time_range": - await self.search_by_keywords_in_time_range(daily_limit=True) - else: - utils.logger.warning(f"Unknown BILI_SEARCH_MODE: {config.BILI_SEARCH_MODE}") + await self.search() elif config.CRAWLER_TYPE == "detail": # Get the information and comments of the specified post await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST) @@ -118,6 +111,20 @@ class BilibiliCrawler(AbstractCrawler): utils.logger.info( "[BilibiliCrawler.start] Bilibili Crawler finished ...") + async def search(self): + """ + search bilibili video + """ + # Search for video and retrieve their comment information. + if config.BILI_SEARCH_MODE == "normal": + await self.search_by_keywords() + elif config.BILI_SEARCH_MODE == "all_in_time_range": + await self.search_by_keywords_in_time_range(daily_limit=False) + elif config.BILI_SEARCH_MODE == "daily_limit_in_time_range": + await self.search_by_keywords_in_time_range(daily_limit=True) + else: + utils.logger.warning(f"Unknown BILI_SEARCH_MODE: {config.BILI_SEARCH_MODE}") + @staticmethod async def get_pubtime_datetime(start: str = config.START_DAY, end: str = config.END_DAY) -> Tuple[str, str]: """ @@ -259,6 +266,8 @@ class BilibiliCrawler(AbstractCrawler): if video_item: if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT: break + if notes_count_this_day >= config.MAX_NOTES_PER_DAY: + break notes_count_this_day += 1 total_notes_crawled += 1 video_id_list.append(video_item.get("View").get("aid")) @@ -305,6 +314,7 @@ class BilibiliCrawler(AbstractCrawler): try: utils.logger.info( f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...") + await asyncio.sleep(random.uniform(0.5, 1.5)) await self.bili_client.get_video_all_comments( video_id=video_id, crawl_interval=random.random(), @@ -509,13 +519,18 @@ class BilibiliCrawler(AbstractCrawler): async def close(self): """Close browser context""" - # 如果使用CDP模式,需要特殊处理 - if self.cdp_manager: - await self.cdp_manager.cleanup() - self.cdp_manager = None - else: - await self.browser_context.close() - utils.logger.info("[BilibiliCrawler.close] Browser context closed ...") + try: + # 如果使用CDP模式,需要特殊处理 + if self.cdp_manager: + await self.cdp_manager.cleanup() + self.cdp_manager = None + elif self.browser_context: + await self.browser_context.close() + utils.logger.info("[BilibiliCrawler.close] Browser context closed ...") + except TargetClosedError: + utils.logger.warning("[BilibiliCrawler.close] Browser context was already closed.") 
+        except Exception as e:
+            utils.logger.error(f"[BilibiliCrawler.close] An error occurred during close: {e}")
 
     async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
         """