mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-04-21 19:27:40 +08:00
fix(media_platform): handle edge cases and improve error handling for Bilibili client and crawler
- BilibiliClient:
  - Improve wbi_img_urls handling for better compatibility
  - Add error handling for missing or invalid 'is_end' and 'next' in comment cursor
- BilibiliCrawler:
  - Fix daily limit logic for keyword-based searches
  - Improve logging and break conditions for max notes count limits
  - Ensure proper tracking of total notes crawled for each keyword
This commit is contained in:
@@ -15,6 +15,7 @@
|
|||||||
# @Desc : bilibili 请求客户端
|
# @Desc : bilibili 请求客户端
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import random
|
||||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
@@ -82,8 +83,12 @@ class BilibiliClient(AbstractApiClient):
|
|||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
||||||
wbi_img_urls = local_storage.get("wbi_img_urls", "") or local_storage.get(
|
wbi_img_urls = local_storage.get("wbi_img_urls", "")
|
||||||
"wbi_img_url") + "-" + local_storage.get("wbi_sub_url")
|
if not wbi_img_urls:
|
||||||
|
img_url_from_storage = local_storage.get("wbi_img_url")
|
||||||
|
sub_url_from_storage = local_storage.get("wbi_sub_url")
|
||||||
|
if img_url_from_storage and sub_url_from_storage:
|
||||||
|
wbi_img_urls = f"{img_url_from_storage}-{sub_url_from_storage}"
|
||||||
if wbi_img_urls and "-" in wbi_img_urls:
|
if wbi_img_urls and "-" in wbi_img_urls:
|
||||||
img_url, sub_url = wbi_img_urls.split("-")
|
img_url, sub_url = wbi_img_urls.split("-")
|
||||||
else:
|
else:
|
||||||
@@ -271,8 +276,18 @@ class BilibiliClient(AbstractApiClient):
|
|||||||
break
|
break
|
||||||
|
|
||||||
comment_list: List[Dict] = comments_res.get("replies", [])
|
comment_list: List[Dict] = comments_res.get("replies", [])
|
||||||
is_end = cursor_info.get("is_end")
|
|
||||||
next_page = cursor_info.get("next")
|
# 检查 is_end 和 next 是否存在
|
||||||
|
if "is_end" not in cursor_info or "next" not in cursor_info:
|
||||||
|
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
|
||||||
|
is_end = True
|
||||||
|
else:
|
||||||
|
is_end = cursor_info.get("is_end")
|
||||||
|
next_page = cursor_info.get("next")
|
||||||
|
|
||||||
|
if not isinstance(is_end, bool):
|
||||||
|
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' is not a boolean for video_id: {video_id}. Assuming end of comments.")
|
||||||
|
is_end = True
|
||||||
if is_fetch_sub_comments:
|
if is_fetch_sub_comments:
|
||||||
for comment in comment_list:
|
for comment in comment_list:
|
||||||
comment_id = comment['rpid']
|
comment_id = comment['rpid']
|
||||||
|
|||||||
@@ -219,10 +219,14 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
for keyword in config.KEYWORDS.split(","):
|
for keyword in config.KEYWORDS.split(","):
|
||||||
source_keyword_var.set(keyword)
|
source_keyword_var.set(keyword)
|
||||||
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
|
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
|
||||||
total_notes_crawled = 0
|
total_notes_crawled_for_keyword = 0
|
||||||
|
|
||||||
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
|
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
|
||||||
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT:
|
if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
|
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
|
||||||
|
break
|
||||||
|
|
||||||
|
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
|
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -234,11 +238,10 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
|
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
|
||||||
utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
|
utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
|
||||||
break
|
break
|
||||||
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT:
|
if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
|
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
|
||||||
break
|
break
|
||||||
if not daily_limit and (page - start_page + 1) * bili_limit_count > config.CRAWLER_MAX_NOTES_COUNT:
|
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
# For non-daily-limit mode, we still respect the total count in a loose way per day.
|
|
||||||
break
|
break
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -264,12 +267,14 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
|
|
||||||
for video_item in video_items:
|
for video_item in video_items:
|
||||||
if video_item:
|
if video_item:
|
||||||
if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT:
|
if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
|
break
|
||||||
|
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
break
|
break
|
||||||
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
|
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
|
||||||
break
|
break
|
||||||
notes_count_this_day += 1
|
notes_count_this_day += 1
|
||||||
total_notes_crawled += 1
|
total_notes_crawled_for_keyword += 1
|
||||||
video_id_list.append(video_item.get("View").get("aid"))
|
video_id_list.append(video_item.get("View").get("aid"))
|
||||||
await bilibili_store.update_bilibili_video(video_item)
|
await bilibili_store.update_bilibili_video(video_item)
|
||||||
await bilibili_store.update_up_info(video_item)
|
await bilibili_store.update_up_info(video_item)
|
||||||
|
|||||||
Reference in New Issue
Block a user