fix(media_platform): handle edge cases and improve error handling for Bilibili client and crawler

- BilibiliClient:
  - Improve wbi_img_urls handling for better compatibility
  - Add error handling for missing or invalid 'is_end' and 'next' in comment cursor

- BilibiliCrawler:
  - Fix daily-limit logic for keyword-based searches
  - Improve logging and break conditions when the maximum note-count limit is reached
  - Track the total number of notes crawled per keyword correctly
This commit is contained in:
gaoxiaobei
2025-07-17 06:40:56 +08:00
parent 4d743f6c17
commit 9fb396c7d1
2 changed files with 31 additions and 11 deletions

View File

@@ -15,6 +15,7 @@
# @Desc : bilibili 请求客户端 # @Desc : bilibili 请求客户端
import asyncio import asyncio
import json import json
import random
from typing import Any, Callable, Dict, List, Optional, Tuple, Union from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlencode from urllib.parse import urlencode
@@ -82,8 +83,12 @@ class BilibiliClient(AbstractApiClient):
:return: :return:
""" """
local_storage = await self.playwright_page.evaluate("() => window.localStorage") local_storage = await self.playwright_page.evaluate("() => window.localStorage")
wbi_img_urls = local_storage.get("wbi_img_urls", "") or local_storage.get( wbi_img_urls = local_storage.get("wbi_img_urls", "")
"wbi_img_url") + "-" + local_storage.get("wbi_sub_url") if not wbi_img_urls:
img_url_from_storage = local_storage.get("wbi_img_url")
sub_url_from_storage = local_storage.get("wbi_sub_url")
if img_url_from_storage and sub_url_from_storage:
wbi_img_urls = f"{img_url_from_storage}-{sub_url_from_storage}"
if wbi_img_urls and "-" in wbi_img_urls: if wbi_img_urls and "-" in wbi_img_urls:
img_url, sub_url = wbi_img_urls.split("-") img_url, sub_url = wbi_img_urls.split("-")
else: else:
@@ -271,8 +276,18 @@ class BilibiliClient(AbstractApiClient):
break break
comment_list: List[Dict] = comments_res.get("replies", []) comment_list: List[Dict] = comments_res.get("replies", [])
is_end = cursor_info.get("is_end")
next_page = cursor_info.get("next") # 检查 is_end 和 next 是否存在
if "is_end" not in cursor_info or "next" not in cursor_info:
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
is_end = True
else:
is_end = cursor_info.get("is_end")
next_page = cursor_info.get("next")
if not isinstance(is_end, bool):
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' is not a boolean for video_id: {video_id}. Assuming end of comments.")
is_end = True
if is_fetch_sub_comments: if is_fetch_sub_comments:
for comment in comment_list: for comment in comment_list:
comment_id = comment['rpid'] comment_id = comment['rpid']

View File

@@ -219,10 +219,14 @@ class BilibiliCrawler(AbstractCrawler):
for keyword in config.KEYWORDS.split(","): for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword) source_keyword_var.set(keyword)
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}") utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
total_notes_crawled = 0 total_notes_crawled_for_keyword = 0
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'): for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT: if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
break
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.") utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
break break
@@ -234,11 +238,10 @@ class BilibiliCrawler(AbstractCrawler):
if notes_count_this_day >= config.MAX_NOTES_PER_DAY: if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.") utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
break break
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT: if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.") utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
break break
if not daily_limit and (page - start_page + 1) * bili_limit_count > config.CRAWLER_MAX_NOTES_COUNT: if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
# For non-daily-limit mode, we still respect the total count in a loose way per day.
break break
try: try:
@@ -264,12 +267,14 @@ class BilibiliCrawler(AbstractCrawler):
for video_item in video_items: for video_item in video_items:
if video_item: if video_item:
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT: if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
break
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
break break
if notes_count_this_day >= config.MAX_NOTES_PER_DAY: if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
break break
notes_count_this_day += 1 notes_count_this_day += 1
total_notes_crawled += 1 total_notes_crawled_for_keyword += 1
video_id_list.append(video_item.get("View").get("aid")) video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item) await bilibili_store.update_bilibili_video(video_item)
await bilibili_store.update_up_info(video_item) await bilibili_store.update_up_info(video_item)