feat: Enhance Bilibili crawler with retry logic and robustness

This commit introduces several improvements to enhance the stability and functionality of the Bilibili crawler.

- **Add Retry Logic:** Implement a retry mechanism with exponential backoff when fetching video comments. This makes the crawler more resilient to transient network issues or API errors.
- **Improve Error Handling:** Add a `try...except` block to handle potential `JSONDecodeError` in the Bilibili client, preventing crashes when the API returns an invalid response.
- **Ensure Clean Shutdown:** Refactor `main.py` to use a `try...finally` block, guaranteeing that the crawler and database connections are properly closed on exit, error, or `KeyboardInterrupt`.
- **Update Default Config:** Adjust default configuration values to increase concurrency, enable word cloud generation by default, and refine the Bilibili search mode for more practical usage.
This commit is contained in:
gaoxiaobei
2025-07-13 10:42:15 +08:00
parent d0d7293926
commit e91ec750bb
4 changed files with 83 additions and 36 deletions

View File

@@ -53,7 +53,11 @@ class BilibiliClient(AbstractApiClient):
method, url, timeout=self.timeout,
**kwargs
)
data: Dict = response.json()
try:
data: Dict = response.json()
except json.JSONDecodeError:
utils.logger.error(f"[BilibiliClient.request] Failed to decode JSON from response. status_code: {response.status_code}, response_text: {response.text}")
raise DataFetchError(f"Failed to decode JSON, content: {response.text}")
if data.get("code") != 0:
raise DataFetchError(data.get("message", "unkonw error"))
else:
@@ -235,13 +239,37 @@ class BilibiliClient(AbstractApiClient):
:return:
"""
result = []
is_end = False
next_page = 0
max_retries = 3
while not is_end and len(result) < max_count:
comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
comments_res = None
for attempt in range(max_retries):
try:
comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
break # Success
except DataFetchError as e:
if attempt < max_retries - 1:
delay = 5 * (2 ** attempt) + random.uniform(0, 1)
utils.logger.warning(
f"[BilibiliClient.get_video_all_comments] Retrying video_id {video_id} in {delay:.2f}s... (Attempt {attempt + 1}/{max_retries})"
)
await asyncio.sleep(delay)
else:
utils.logger.error(
f"[BilibiliClient.get_video_all_comments] Max retries reached for video_id: {video_id}. Skipping comments. Error: {e}"
)
is_end = True
break
if not comments_res:
break
cursor_info: Dict = comments_res.get("cursor")
if not cursor_info:
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Could not find 'cursor' in response for video_id: {video_id}. Skipping.")
break
comment_list: List[Dict] = comments_res.get("replies", [])
is_end = cursor_info.get("is_end")
next_page = cursor_info.get("next")