fix(media_platform): handle edge cases and improve error handling for Bilibili client and crawler

- BilibiliClient:
  - Improve wbi_img_urls handling for better compatibility
  - Add error handling for missing or invalid 'is_end' and 'next' in comment cursor

- BilibiliCrawler:
  - Fix daily-limit logic for keyword-based searches
  - Improve logging and break conditions when the maximum note-count limit is reached
  - Track the total number of notes crawled per keyword correctly
This commit is contained in:
gaoxiaobei
2025-07-17 06:40:56 +08:00
parent 4d743f6c17
commit 9fb396c7d1
2 changed files with 31 additions and 11 deletions

View File

@@ -15,6 +15,7 @@
# @Desc : bilibili 请求客户端 # @Desc : bilibili 请求客户端
import asyncio import asyncio
import json import json
import random
from typing import Any, Callable, Dict, List, Optional, Tuple, Union from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlencode from urllib.parse import urlencode
@@ -82,8 +83,12 @@ class BilibiliClient(AbstractApiClient):
:return: :return:
""" """
local_storage = await self.playwright_page.evaluate("() => window.localStorage") local_storage = await self.playwright_page.evaluate("() => window.localStorage")
wbi_img_urls = local_storage.get("wbi_img_urls", "") or local_storage.get( wbi_img_urls = local_storage.get("wbi_img_urls", "")
"wbi_img_url") + "-" + local_storage.get("wbi_sub_url") if not wbi_img_urls:
img_url_from_storage = local_storage.get("wbi_img_url")
sub_url_from_storage = local_storage.get("wbi_sub_url")
if img_url_from_storage and sub_url_from_storage:
wbi_img_urls = f"{img_url_from_storage}-{sub_url_from_storage}"
if wbi_img_urls and "-" in wbi_img_urls: if wbi_img_urls and "-" in wbi_img_urls:
img_url, sub_url = wbi_img_urls.split("-") img_url, sub_url = wbi_img_urls.split("-")
else: else:
@@ -271,8 +276,18 @@ class BilibiliClient(AbstractApiClient):
break break
comment_list: List[Dict] = comments_res.get("replies", []) comment_list: List[Dict] = comments_res.get("replies", [])
is_end = cursor_info.get("is_end")
next_page = cursor_info.get("next") # 检查 is_end 和 next 是否存在
if "is_end" not in cursor_info or "next" not in cursor_info:
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
is_end = True
else:
is_end = cursor_info.get("is_end")
next_page = cursor_info.get("next")
if not isinstance(is_end, bool):
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' is not a boolean for video_id: {video_id}. Assuming end of comments.")
is_end = True
if is_fetch_sub_comments: if is_fetch_sub_comments:
for comment in comment_list: for comment in comment_list:
comment_id = comment['rpid'] comment_id = comment['rpid']

View File

@@ -219,10 +219,14 @@ class BilibiliCrawler(AbstractCrawler):
for keyword in config.KEYWORDS.split(","): for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword) source_keyword_var.set(keyword)
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}") utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
total_notes_crawled = 0 total_notes_crawled_for_keyword = 0
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'): for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT: if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
break
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.") utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
break break
@@ -234,11 +238,10 @@ class BilibiliCrawler(AbstractCrawler):
if notes_count_this_day >= config.MAX_NOTES_PER_DAY: if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.") utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
break break
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT: if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.") utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
break break
if not daily_limit and (page - start_page + 1) * bili_limit_count > config.CRAWLER_MAX_NOTES_COUNT: if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
# For non-daily-limit mode, we still respect the total count in a loose way per day.
break break
try: try:
@@ -264,12 +267,14 @@ class BilibiliCrawler(AbstractCrawler):
for video_item in video_items: for video_item in video_items:
if video_item: if video_item:
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT: if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
break
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
break break
if notes_count_this_day >= config.MAX_NOTES_PER_DAY: if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
break break
notes_count_this_day += 1 notes_count_this_day += 1
total_notes_crawled += 1 total_notes_crawled_for_keyword += 1
video_id_list.append(video_item.get("View").get("aid")) video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item) await bilibili_store.update_bilibili_video(video_item)
await bilibili_store.update_up_info(video_item) await bilibili_store.update_up_info(video_item)