mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-09 11:27:26 +08:00
i18n: translate all Chinese comments, docstrings, and logger messages to English
Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: Chinese disclaimer header (lines 10-18) for legal compliance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : bilibili 请求客户端
|
||||
# @Desc : bilibili request client
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
@@ -47,7 +47,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,b 站的长视频需要更久的超时时间
|
||||
timeout=60, # For media crawling, Bilibili long videos need a longer timeout
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
@@ -61,11 +61,11 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self._host = "https://api.bilibili.com"
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Any:
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check if proxy has expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
@@ -82,8 +82,8 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def pre_request_data(self, req_data: Dict) -> Dict:
|
||||
"""
|
||||
发送请求进行请求参数签名
|
||||
需要从 localStorage 拿 wbi_img_urls 这参数,值如下:
|
||||
Send request to sign request parameters
|
||||
Need to get wbi_img_urls parameter from localStorage, value as follows:
|
||||
https://i0.hdslb.com/bfs/wbi/7cd084941338484aae1ad9425b84077c.png-https://i0.hdslb.com/bfs/wbi/4932caff0ff746eab6f01bf08b70ac45.png
|
||||
:param req_data:
|
||||
:return:
|
||||
@@ -95,7 +95,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_wbi_keys(self) -> Tuple[str, str]:
|
||||
"""
|
||||
获取最新的 img_key 和 sub_key
|
||||
Get the latest img_key and sub_key
|
||||
:return:
|
||||
"""
|
||||
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
||||
@@ -160,12 +160,12 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
Bilibili web search api
|
||||
:param keyword: 搜索关键词
|
||||
:param page: 分页参数具体第几页
|
||||
:param page_size: 每一页参数的数量
|
||||
:param order: 搜索结果排序,默认位综合排序
|
||||
:param pubtime_begin_s: 发布时间开始时间戳
|
||||
:param pubtime_end_s: 发布时间结束时间戳
|
||||
:param keyword: Search keyword
|
||||
:param page: Page number for pagination
|
||||
:param page_size: Number of items per page
|
||||
:param order: Sort order for search results, default is comprehensive sorting
|
||||
:param pubtime_begin_s: Publish time start timestamp
|
||||
:param pubtime_end_s: Publish time end timestamp
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/web-interface/wbi/search/type"
|
||||
@@ -182,13 +182,13 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict:
|
||||
"""
|
||||
Bilibli web video detail api, aid 和 bvid任选一个参数
|
||||
:param aid: 稿件avid
|
||||
:param bvid: 稿件bvid
|
||||
Bilibili web video detail api, provide either aid or bvid
|
||||
:param aid: Video aid
|
||||
:param bvid: Video bvid
|
||||
:return:
|
||||
"""
|
||||
if not aid and not bvid:
|
||||
raise ValueError("请提供 aid 或 bvid 中的至少一个参数")
|
||||
raise ValueError("Please provide at least one parameter: aid or bvid")
|
||||
|
||||
uri = "/x/web-interface/view/detail"
|
||||
params = dict()
|
||||
@@ -201,12 +201,12 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
async def get_video_play_url(self, aid: int, cid: int) -> Dict:
|
||||
"""
|
||||
Bilibili web video play url api
|
||||
:param aid: 稿件avid
|
||||
:param aid: Video aid
|
||||
:param cid: cid
|
||||
:return:
|
||||
"""
|
||||
if not aid or not cid or aid <= 0 or cid <= 0:
|
||||
raise ValueError("aid 和 cid 必须存在")
|
||||
raise ValueError("aid and cid must exist")
|
||||
uri = "/x/player/wbi/playurl"
|
||||
qn_value = getattr(config, "BILI_QN", 80)
|
||||
params = {
|
||||
@@ -233,7 +233,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
)
|
||||
return None
|
||||
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # Keep original exception type name for developer debugging
|
||||
return None
|
||||
|
||||
async def get_video_comments(
|
||||
@@ -243,9 +243,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
next: int = 0,
|
||||
) -> Dict:
|
||||
"""get video comments
|
||||
:param video_id: 视频 ID
|
||||
:param order_mode: 排序方式
|
||||
:param next: 评论页选择
|
||||
:param video_id: Video ID
|
||||
:param order_mode: Sort order
|
||||
:param next: Comment page selection
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/v2/reply/wbi/main"
|
||||
@@ -266,7 +266,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
:param crawl_interval:
|
||||
:param is_fetch_sub_comments:
|
||||
:param callback:
|
||||
max_count: 一次笔记爬取的最大评论数量
|
||||
:param max_count: Maximum number of comments to crawl per video
|
||||
|
||||
:return:
|
||||
"""
|
||||
@@ -299,7 +299,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
comment_list: List[Dict] = comments_res.get("replies", [])
|
||||
|
||||
# 检查 is_end 和 next 是否存在
|
||||
# Check if is_end and next exist
|
||||
if "is_end" not in cursor_info or "next" not in cursor_info:
|
||||
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
|
||||
is_end = True
|
||||
@@ -317,7 +317,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
{await self.get_video_all_level_two_comments(video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)}
|
||||
if len(result) + len(comment_list) > max_count:
|
||||
comment_list = comment_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute it
|
||||
await callback(video_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not is_fetch_sub_comments:
|
||||
@@ -336,10 +336,10 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
get video all level two comments for a level one comment
|
||||
:param video_id: 视频 ID
|
||||
:param level_one_comment_id: 一级评论 ID
|
||||
:param video_id: Video ID
|
||||
:param level_one_comment_id: Level one comment ID
|
||||
:param order_mode:
|
||||
:param ps: 一页评论数
|
||||
:param ps: Number of comments per page
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:return:
|
||||
@@ -349,7 +349,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
while True:
|
||||
result = await self.get_video_level_two_comments(video_id, level_one_comment_id, pn, ps, order_mode)
|
||||
comment_list: List[Dict] = result.get("replies", [])
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute it
|
||||
await callback(video_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if (int(result["page"]["count"]) <= pn * ps):
|
||||
@@ -366,9 +366,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
order_mode: CommentOrderType,
|
||||
) -> Dict:
|
||||
"""get video level two comments
|
||||
:param video_id: 视频 ID
|
||||
:param level_one_comment_id: 一级评论 ID
|
||||
:param order_mode: 排序方式
|
||||
:param video_id: Video ID
|
||||
:param level_one_comment_id: Level one comment ID
|
||||
:param order_mode: Sort order
|
||||
|
||||
:return:
|
||||
"""
|
||||
@@ -386,10 +386,10 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
|
||||
"""get all videos for a creator
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 页数
|
||||
:param ps: 一页视频数
|
||||
:param order_mode: 排序方式
|
||||
:param creator_id: Creator ID
|
||||
:param pn: Page number
|
||||
:param ps: Number of videos per page
|
||||
:param order_mode: Sort order
|
||||
|
||||
:return:
|
||||
"""
|
||||
@@ -405,7 +405,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
async def get_creator_info(self, creator_id: int) -> Dict:
|
||||
"""
|
||||
get creator info
|
||||
:param creator_id: 作者 ID
|
||||
:param creator_id: Creator ID
|
||||
"""
|
||||
uri = "/x/space/wbi/acc/info"
|
||||
post_data = {
|
||||
@@ -421,9 +421,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
get creator fans
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 开始页数
|
||||
:param ps: 每页数量
|
||||
:param creator_id: Creator ID
|
||||
:param pn: Start page number
|
||||
:param ps: Number of items per page
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/relation/fans"
|
||||
@@ -443,9 +443,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
get creator followings
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 开始页数
|
||||
:param ps: 每页数量
|
||||
:param creator_id: Creator ID
|
||||
:param pn: Start page number
|
||||
:param ps: Number of items per page
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/relation/followings"
|
||||
@@ -460,8 +460,8 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
async def get_creator_dynamics(self, creator_id: int, offset: str = ""):
|
||||
"""
|
||||
get creator dynamics
|
||||
:param creator_id: 创作者 ID
|
||||
:param offset: 发送请求所需参数
|
||||
:param creator_id: Creator ID
|
||||
:param offset: Parameter required for sending request
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/polymer/web-dynamic/v1/feed/space"
|
||||
@@ -485,9 +485,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count: 一个up主爬取的最大粉丝数量
|
||||
:param max_count: Maximum number of fans to crawl for a creator
|
||||
|
||||
:return: up主粉丝数列表
|
||||
:return: List of creator fans
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
result = []
|
||||
@@ -499,7 +499,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
pn += 1
|
||||
if len(result) + len(fans_list) > max_count:
|
||||
fans_list = fans_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute it
|
||||
await callback(creator_info, fans_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not fans_list:
|
||||
@@ -519,9 +519,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count: 一个up主爬取的最大关注者数量
|
||||
:param max_count: Maximum number of followings to crawl for a creator
|
||||
|
||||
:return: up主关注者列表
|
||||
:return: List of creator followings
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
result = []
|
||||
@@ -533,7 +533,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
pn += 1
|
||||
if len(result) + len(followings_list) > max_count:
|
||||
followings_list = followings_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute it
|
||||
await callback(creator_info, followings_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not followings_list:
|
||||
@@ -553,9 +553,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count: 一个up主爬取的最大动态数量
|
||||
:param max_count: Maximum number of dynamics to crawl for a creator
|
||||
|
||||
:return: up主关注者列表
|
||||
:return: List of creator dynamics
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
result = []
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : B站爬虫
|
||||
# @Desc : Bilibili Crawler
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
@@ -64,7 +64,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
self.index_url = "https://www.bilibili.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
@@ -74,9 +74,9 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Choose launch mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[BilibiliCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[BilibiliCrawler] Launching browser using CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -84,7 +84,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[BilibiliCrawler] Launching browser using standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS)
|
||||
@@ -149,31 +149,31 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
end: str = config.END_DAY,
|
||||
) -> Tuple[str, str]:
|
||||
"""
|
||||
获取 bilibili 作品发布日期起始时间戳 pubtime_begin_s 与发布日期结束时间戳 pubtime_end_s
|
||||
Get bilibili publish start timestamp pubtime_begin_s and publish end timestamp pubtime_end_s
|
||||
---
|
||||
:param start: 发布日期起始时间,YYYY-MM-DD
|
||||
:param end: 发布日期结束时间,YYYY-MM-DD
|
||||
:param start: Publish date start time, YYYY-MM-DD
|
||||
:param end: Publish date end time, YYYY-MM-DD
|
||||
|
||||
Note
|
||||
---
|
||||
- 搜索的时间范围为 start 至 end,包含 start 和 end
|
||||
- 若要搜索同一天的内容,为了包含 start 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_begin_s 的值加上一天再减去一秒,即 start 当天的最后一秒
|
||||
- 如仅搜索 2024-01-05 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704470399
|
||||
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
|
||||
- 若要搜索 start 至 end 的内容,为了包含 end 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_end_s 的值加上一天再减去一秒,即 end 当天的最后一秒
|
||||
- 如搜索 2024-01-05 - 2024-01-06 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704556799
|
||||
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
|
||||
- Search time range is from start to end, including both start and end
|
||||
- To search content from a single day (start == end), pubtime_end_s should be pubtime_begin_s plus one day minus one second, i.e., the last second of the start day, so that the whole day is included
|
||||
- For example, searching only 2024-01-05 content, pubtime_begin_s = 1704384000, pubtime_end_s = 1704470399
|
||||
Converted to readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
|
||||
- To search content from start to end, pubtime_end_s should be the timestamp of the end date plus one day minus one second, i.e., the last second of the end day, so that the end day is included
|
||||
- For example, searching 2024-01-05 - 2024-01-06 content, pubtime_begin_s = 1704384000, pubtime_end_s = 1704556799
|
||||
Converted to readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
|
||||
"""
|
||||
# 转换 start 与 end 为 datetime 对象
|
||||
# Convert start and end to datetime objects
|
||||
start_day: datetime = datetime.strptime(start, "%Y-%m-%d")
|
||||
end_day: datetime = datetime.strptime(end, "%Y-%m-%d")
|
||||
if start_day > end_day:
|
||||
raise ValueError("Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end")
|
||||
elif start_day == end_day: # 搜索同一天的内容
|
||||
end_day = (start_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 start_day + 1 day - 1 second
|
||||
else: # 搜索 start 至 end
|
||||
end_day = (end_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 end_day + 1 day - 1 second
|
||||
# 将其重新转换为时间戳
|
||||
elif start_day == end_day: # Searching content from the same day
|
||||
end_day = (start_day + timedelta(days=1) - timedelta(seconds=1)) # Set end_day to start_day + 1 day - 1 second
|
||||
else: # Searching from start to end
|
||||
end_day = (end_day + timedelta(days=1) - timedelta(seconds=1)) # Set end_day to end_day + 1 day - 1 second
|
||||
# Convert back to timestamps
|
||||
return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
|
||||
|
||||
async def search_by_keywords(self):
|
||||
@@ -203,8 +203,8 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
page=page,
|
||||
page_size=bili_limit_count,
|
||||
order=SearchOrderType.DEFAULT,
|
||||
pubtime_begin_s=0, # 作品发布日期起始时间戳
|
||||
pubtime_end_s=0, # 作品发布日期结束日期时间戳
|
||||
pubtime_begin_s=0, # Publish date start timestamp
|
||||
pubtime_end_s=0, # Publish date end timestamp
|
||||
)
|
||||
video_list: List[Dict] = videos_res.get("result")
|
||||
|
||||
@@ -508,7 +508,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
channel="chrome", # 使用系统的Chrome稳定版
|
||||
channel="chrome", # Use system's stable Chrome version
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
@@ -525,7 +525,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
Launch browser using CDP mode
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
@@ -536,22 +536,22 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[BilibiliCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[BilibiliCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
utils.logger.error(f"[BilibiliCrawler] CDP mode launch failed, fallback to standard mode: {e}")
|
||||
# Fallback to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# If using CDP mode, special handling is required
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
|
||||
@@ -27,28 +27,28 @@ from enum import Enum
|
||||
|
||||
|
||||
class SearchOrderType(Enum):
|
||||
# 综合排序
|
||||
# Comprehensive sorting
|
||||
DEFAULT = ""
|
||||
|
||||
# 最多点击
|
||||
# Most clicks
|
||||
MOST_CLICK = "click"
|
||||
|
||||
# 最新发布
|
||||
# Latest published
|
||||
LAST_PUBLISH = "pubdate"
|
||||
|
||||
# 最多弹幕
|
||||
# Most danmu (comments)
|
||||
MOST_DANMU = "dm"
|
||||
|
||||
# 最多收藏
|
||||
# Most bookmarks
|
||||
MOST_MARK = "stow"
|
||||
|
||||
|
||||
class CommentOrderType(Enum):
|
||||
# 仅按热度
|
||||
# By popularity only
|
||||
DEFAULT = 0
|
||||
|
||||
# 按热度+按时间
|
||||
# By popularity + time
|
||||
MIXED = 1
|
||||
|
||||
# 按时间
|
||||
# By time
|
||||
TIME = 2
|
||||
|
||||
@@ -21,8 +21,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 23:26
|
||||
# @Desc : bilibili 请求参数签名
|
||||
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
|
||||
# @Desc : bilibili request parameter signing
|
||||
# Reverse engineering implementation reference: https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
|
||||
import re
|
||||
import urllib.parse
|
||||
from hashlib import md5
|
||||
@@ -45,7 +45,7 @@ class BilibiliSign:
|
||||
|
||||
def get_salt(self) -> str:
|
||||
"""
|
||||
获取加盐的 key
|
||||
Get the salted key
|
||||
:return:
|
||||
"""
|
||||
salt = ""
|
||||
@@ -56,8 +56,8 @@ class BilibiliSign:
|
||||
|
||||
def sign(self, req_data: Dict) -> Dict:
|
||||
"""
|
||||
请求参数中加上当前时间戳对请求参数中的key进行字典序排序
|
||||
再将请求参数进行 url 编码集合 salt 进行 md5 就可以生成w_rid参数了
|
||||
Add current timestamp to request parameters, sort keys in dictionary order,
|
||||
then URL encode the parameters and combine with salt to generate md5 for w_rid parameter
|
||||
:param req_data:
|
||||
:return:
|
||||
"""
|
||||
@@ -65,35 +65,35 @@ class BilibiliSign:
|
||||
req_data.update({"wts": current_ts})
|
||||
req_data = dict(sorted(req_data.items()))
|
||||
req_data = {
|
||||
# 过滤 value 中的 "!'()*" 字符
|
||||
# Filter "!'()*" characters from values
|
||||
k: ''.join(filter(lambda ch: ch not in "!'()*", str(v)))
|
||||
for k, v
|
||||
in req_data.items()
|
||||
}
|
||||
query = urllib.parse.urlencode(req_data)
|
||||
salt = self.get_salt()
|
||||
wbi_sign = md5((query + salt).encode()).hexdigest() # 计算 w_rid
|
||||
wbi_sign = md5((query + salt).encode()).hexdigest() # Calculate w_rid
|
||||
req_data['w_rid'] = wbi_sign
|
||||
return req_data
|
||||
|
||||
|
||||
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
|
||||
"""
|
||||
从B站视频URL中解析出视频ID
|
||||
Parse video ID from Bilibili video URL
|
||||
Args:
|
||||
url: B站视频链接
|
||||
url: Bilibili video link
|
||||
- https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click
|
||||
- https://www.bilibili.com/video/BV1d54y1g7db
|
||||
- BV1d54y1g7db (直接传入BV号)
|
||||
- BV1d54y1g7db (directly pass BV number)
|
||||
Returns:
|
||||
VideoUrlInfo: 包含视频ID的对象
|
||||
VideoUrlInfo: Object containing video ID
|
||||
"""
|
||||
# 如果传入的已经是BV号,直接返回
|
||||
# If the input is already a BV number, return directly
|
||||
if url.startswith("BV"):
|
||||
return VideoUrlInfo(video_id=url)
|
||||
|
||||
# 使用正则表达式提取BV号
|
||||
# 匹配 /video/BV... 或 /video/av... 格式
|
||||
# Use regex to extract BV number
|
||||
# Match /video/BV... format (only BV IDs are captured by the pattern below)
|
||||
bv_pattern = r'/video/(BV[a-zA-Z0-9]+)'
|
||||
match = re.search(bv_pattern, url)
|
||||
|
||||
@@ -101,26 +101,26 @@ def parse_video_info_from_url(url: str) -> VideoUrlInfo:
|
||||
video_id = match.group(1)
|
||||
return VideoUrlInfo(video_id=video_id)
|
||||
|
||||
raise ValueError(f"无法从URL中解析出视频ID: {url}")
|
||||
raise ValueError(f"Unable to parse video ID from URL: {url}")
|
||||
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
|
||||
"""
|
||||
从B站创作者空间URL中解析出创作者ID
|
||||
Parse creator ID from Bilibili creator space URL
|
||||
Args:
|
||||
url: B站创作者空间链接
|
||||
url: Bilibili creator space link
|
||||
- https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0
|
||||
- https://space.bilibili.com/20813884
|
||||
- 434377496 (直接传入UID)
|
||||
- 434377496 (directly pass UID)
|
||||
Returns:
|
||||
CreatorUrlInfo: 包含创作者ID的对象
|
||||
CreatorUrlInfo: Object containing creator ID
|
||||
"""
|
||||
# 如果传入的已经是纯数字ID,直接返回
|
||||
# If the input is already a numeric ID, return directly
|
||||
if url.isdigit():
|
||||
return CreatorUrlInfo(creator_id=url)
|
||||
|
||||
# 使用正则表达式提取UID
|
||||
# 匹配 /space.bilibili.com/数字 格式
|
||||
# Use regex to extract UID
|
||||
# Match /space.bilibili.com/number format
|
||||
uid_pattern = r'space\.bilibili\.com/(\d+)'
|
||||
match = re.search(uid_pattern, url)
|
||||
|
||||
@@ -128,20 +128,20 @@ def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
|
||||
creator_id = match.group(1)
|
||||
return CreatorUrlInfo(creator_id=creator_id)
|
||||
|
||||
raise ValueError(f"无法从URL中解析出创作者ID: {url}")
|
||||
raise ValueError(f"Unable to parse creator ID from URL: {url}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 测试视频URL解析
|
||||
# Test video URL parsing
|
||||
video_url1 = "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
|
||||
video_url2 = "BV1d54y1g7db"
|
||||
print("视频URL解析测试:")
|
||||
print("Video URL parsing test:")
|
||||
print(f"URL1: {video_url1} -> {parse_video_info_from_url(video_url1)}")
|
||||
print(f"URL2: {video_url2} -> {parse_video_info_from_url(video_url2)}")
|
||||
|
||||
# 测试创作者URL解析
|
||||
# Test creator URL parsing
|
||||
creator_url1 = "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
|
||||
creator_url2 = "20813884"
|
||||
print("\n创作者URL解析测试:")
|
||||
print("\nCreator URL parsing test:")
|
||||
print(f"URL1: {creator_url1} -> {parse_creator_info_from_url(creator_url1)}")
|
||||
print(f"URL2: {creator_url2} -> {parse_creator_info_from_url(creator_url2)}")
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : bilibli登录实现类
|
||||
# @Desc : bilibili login implementation class
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
|
||||
Reference in New Issue
Block a user