mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-08 02:47:26 +08:00
i18n: translate all Chinese comments, docstrings, and logger messages to English
Comprehensive translation of Chinese text to English across the entire codebase:
- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: Chinese disclaimer header (lines 10-18) for legal compliance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : B站爬虫
|
||||
# @Desc : Bilibili Crawler
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
@@ -64,7 +64,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
self.index_url = "https://www.bilibili.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
@@ -74,9 +74,9 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Choose launch mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[BilibiliCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[BilibiliCrawler] Launching browser using CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -84,7 +84,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[BilibiliCrawler] Launching browser using standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS)
|
||||
@@ -149,31 +149,31 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
end: str = config.END_DAY,
|
||||
) -> Tuple[str, str]:
|
||||
"""
|
||||
获取 bilibili 作品发布日期起始时间戳 pubtime_begin_s 与发布日期结束时间戳 pubtime_end_s
|
||||
Get bilibili publish start timestamp pubtime_begin_s and publish end timestamp pubtime_end_s
|
||||
---
|
||||
:param start: 发布日期起始时间,YYYY-MM-DD
|
||||
:param end: 发布日期结束时间,YYYY-MM-DD
|
||||
:param start: Publish date start time, YYYY-MM-DD
|
||||
:param end: Publish date end time, YYYY-MM-DD
|
||||
|
||||
Note
|
||||
---
|
||||
- 搜索的时间范围为 start 至 end,包含 start 和 end
|
||||
- 若要搜索同一天的内容,为了包含 start 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_begin_s 的值加上一天再减去一秒,即 start 当天的最后一秒
|
||||
- 如仅搜索 2024-01-05 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704470399
|
||||
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
|
||||
- 若要搜索 start 至 end 的内容,为了包含 end 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_end_s 的值加上一天再减去一秒,即 end 当天的最后一秒
|
||||
- 如搜索 2024-01-05 - 2024-01-06 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704556799
|
||||
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
|
||||
- Search time range is from start to end, including both start and end
|
||||
- To search content from a single day (start equals end) and include that whole day, pubtime_end_s should be pubtime_begin_s plus one day minus one second, i.e., the last second of the start day
|
||||
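- For example, when searching only 2024-01-05 content, pubtime_begin_s = 1704384000, pubtime_end_s = 1704470399 (these example timestamps assume a UTC+8 local timezone, since the dates are parsed as naive local datetimes)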
|
||||
Converted to readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
|
||||
- To search content from start to end and include the whole end day, pubtime_end_s should be the midnight timestamp of end plus one day minus one second, i.e., the last second of the end day
|
||||
- For example, searching 2024-01-05 - 2024-01-06 content, pubtime_begin_s = 1704384000, pubtime_end_s = 1704556799
|
||||
Converted to readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
|
||||
"""
|
||||
# 转换 start 与 end 为 datetime 对象
|
||||
# Convert start and end to datetime objects
|
||||
start_day: datetime = datetime.strptime(start, "%Y-%m-%d")
|
||||
end_day: datetime = datetime.strptime(end, "%Y-%m-%d")
|
||||
if start_day > end_day:
|
||||
raise ValueError("Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end")
|
||||
elif start_day == end_day: # 搜索同一天的内容
|
||||
end_day = (start_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 start_day + 1 day - 1 second
|
||||
else: # 搜索 start 至 end
|
||||
end_day = (end_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 end_day + 1 day - 1 second
|
||||
# 将其重新转换为时间戳
|
||||
elif start_day == end_day: # Searching content from the same day
|
||||
end_day = (start_day + timedelta(days=1) - timedelta(seconds=1)) # Set end_day to start_day + 1 day - 1 second
|
||||
else: # Searching from start to end
|
||||
end_day = (end_day + timedelta(days=1) - timedelta(seconds=1)) # Set end_day to end_day + 1 day - 1 second
|
||||
# Convert back to timestamps
|
||||
return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
|
||||
|
||||
async def search_by_keywords(self):
|
||||
@@ -203,8 +203,8 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
page=page,
|
||||
page_size=bili_limit_count,
|
||||
order=SearchOrderType.DEFAULT,
|
||||
pubtime_begin_s=0, # 作品发布日期起始时间戳
|
||||
pubtime_end_s=0, # 作品发布日期结束日期时间戳
|
||||
pubtime_begin_s=0, # Publish date start timestamp
|
||||
pubtime_end_s=0, # Publish date end timestamp
|
||||
)
|
||||
video_list: List[Dict] = videos_res.get("result")
|
||||
|
||||
@@ -508,7 +508,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
channel="chrome", # 使用系统的Chrome稳定版
|
||||
channel="chrome", # Use system's stable Chrome version
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
@@ -525,7 +525,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
Launch browser using CDP mode
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
@@ -536,22 +536,22 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[BilibiliCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[BilibiliCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
utils.logger.error(f"[BilibiliCrawler] CDP mode launch failed, fallback to standard mode: {e}")
|
||||
# Fallback to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# If using CDP mode, special handling is required
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
|
||||
Reference in New Issue
Block a user