fix(media_platform): handle edge cases and improve error handling for Bilibili client and crawler

- BilibiliClient:
  - Improve wbi_img_urls handling for better compatibility
  - Add error handling for missing or invalid 'is_end' and 'next' in comment cursor

- BilibiliCrawler:
  - Fix daily limit logic for keyword-based searches
  - Improve logging and break conditions for the maximum-notes-count limit
  - Ensure the total number of notes crawled is tracked per keyword
This commit is contained in:
gaoxiaobei
2025-07-17 06:40:56 +08:00
parent 4d743f6c17
commit 9fb396c7d1
2 changed files with 31 additions and 11 deletions

View File

@@ -15,6 +15,7 @@
# @Desc : bilibili 请求客户端
import asyncio
import json
import random
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlencode
@@ -82,8 +83,12 @@ class BilibiliClient(AbstractApiClient):
:return:
"""
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
wbi_img_urls = local_storage.get("wbi_img_urls", "") or local_storage.get(
"wbi_img_url") + "-" + local_storage.get("wbi_sub_url")
wbi_img_urls = local_storage.get("wbi_img_urls", "")
if not wbi_img_urls:
img_url_from_storage = local_storage.get("wbi_img_url")
sub_url_from_storage = local_storage.get("wbi_sub_url")
if img_url_from_storage and sub_url_from_storage:
wbi_img_urls = f"{img_url_from_storage}-{sub_url_from_storage}"
if wbi_img_urls and "-" in wbi_img_urls:
img_url, sub_url = wbi_img_urls.split("-")
else:
@@ -271,8 +276,18 @@ class BilibiliClient(AbstractApiClient):
break
comment_list: List[Dict] = comments_res.get("replies", [])
is_end = cursor_info.get("is_end")
next_page = cursor_info.get("next")
# 检查 is_end 和 next 是否存在
if "is_end" not in cursor_info or "next" not in cursor_info:
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
is_end = True
else:
is_end = cursor_info.get("is_end")
next_page = cursor_info.get("next")
if not isinstance(is_end, bool):
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' is not a boolean for video_id: {video_id}. Assuming end of comments.")
is_end = True
if is_fetch_sub_comments:
for comment in comment_list:
comment_id = comment['rpid']

View File

@@ -219,10 +219,14 @@ class BilibiliCrawler(AbstractCrawler):
for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword)
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
total_notes_crawled = 0
total_notes_crawled_for_keyword = 0
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT:
if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
break
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
break
@@ -234,11 +238,10 @@ class BilibiliCrawler(AbstractCrawler):
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
break
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT:
if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
break
if not daily_limit and (page - start_page + 1) * bili_limit_count > config.CRAWLER_MAX_NOTES_COUNT:
# For non-daily-limit mode, we still respect the total count in a loose way per day.
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
break
try:
@@ -264,12 +267,14 @@ class BilibiliCrawler(AbstractCrawler):
for video_item in video_items:
if video_item:
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT:
if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
break
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
break
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
break
notes_count_this_day += 1
total_notes_crawled += 1
total_notes_crawled_for_keyword += 1
video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item)
await bilibili_store.update_up_info(video_item)