mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-07 02:17:25 +08:00
i18n: translate all Chinese comments, docstrings, and logger messages to English
Comprehensive translation of Chinese text to English across the entire codebase: - api/: FastAPI server documentation and logger messages - cache/: Cache abstraction layer comments and docstrings - database/: Database models and MongoDB store documentation - media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu) - model/: Data model documentation - proxy/: Proxy pool and provider documentation - store/: Data storage layer comments - tools/: Utility functions and browser automation - test/: Test file documentation Preserved: Chinese disclaimer header (lines 10-18) for legal compliance 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -60,14 +60,14 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.default_headers = headers
|
||||
self.cookie_dict = cookie_dict
|
||||
self._extractor = ZhihuExtractor()
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
async def _pre_headers(self, url: str) -> Dict:
|
||||
"""
|
||||
请求头参数签名
|
||||
Sign request headers
|
||||
Args:
|
||||
url: 请求的URL需要包含请求的参数
|
||||
url: Request URL with query parameters
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -83,16 +83,16 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
async def request(self, method, url, **kwargs) -> Union[str, Any]:
|
||||
"""
|
||||
封装httpx的公共请求方法,对请求响应做一些处理
|
||||
Wrapper for httpx common request method with response handling
|
||||
Args:
|
||||
method: 请求方法
|
||||
url: 请求的URL
|
||||
**kwargs: 其他请求参数,例如请求头、请求体等
|
||||
method: Request method
|
||||
url: Request URL
|
||||
**kwargs: Other request parameters such as headers, body, etc.
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check if proxy is expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
# return response.text
|
||||
@@ -105,7 +105,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
utils.logger.error(f"[ZhiHuClient.request] Requset Url: {url}, Request error: {response.text}")
|
||||
if response.status_code == 403:
|
||||
raise ForbiddenError(response.text)
|
||||
elif response.status_code == 404: # 如果一个content没有评论也是404
|
||||
elif response.status_code == 404: # Content without comments also returns 404
|
||||
return {}
|
||||
|
||||
raise DataFetchError(response.text)
|
||||
@@ -124,10 +124,10 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, str]:
|
||||
"""
|
||||
GET请求,对请求头签名
|
||||
GET request with header signing
|
||||
Args:
|
||||
uri: 请求路由
|
||||
params: 请求参数
|
||||
uri: Request URI
|
||||
params: Request parameters
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -141,7 +141,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""
|
||||
用于检查登录态是否失效了
|
||||
Check if login status is still valid
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -161,9 +161,9 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
"""
|
||||
API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法
|
||||
Update cookies method provided by API client, typically called after successful login
|
||||
Args:
|
||||
browser_context: 浏览器上下文对象
|
||||
browser_context: Browser context object
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -174,7 +174,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_current_user_info(self) -> Dict:
|
||||
"""
|
||||
获取当前登录用户信息
|
||||
Get current logged-in user information
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -191,14 +191,14 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
search_time: SearchTime = SearchTime.DEFAULT,
|
||||
) -> List[ZhihuContent]:
|
||||
"""
|
||||
根据关键词搜索
|
||||
Search by keyword
|
||||
Args:
|
||||
keyword: 关键词
|
||||
page: 第几页
|
||||
page_size: 分页size
|
||||
sort: 排序
|
||||
note_type: 搜索结果类型
|
||||
search_time: 搜索多久时间的结果
|
||||
keyword: Search keyword
|
||||
page: Page number
|
||||
page_size: Page size
|
||||
sort: Sorting method
|
||||
note_type: Search result type
|
||||
search_time: Time range for search results
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -232,10 +232,10 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
order_by: str = "score",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取内容的一级评论
|
||||
Get root-level comments for content
|
||||
Args:
|
||||
content_id: 内容ID
|
||||
content_type: 内容类型(answer, article, zvideo)
|
||||
content_id: Content ID
|
||||
content_type: Content type (answer, article, zvideo)
|
||||
offset:
|
||||
limit:
|
||||
order_by:
|
||||
@@ -262,7 +262,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
order_by: str = "sort",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取一级评论下的子评论
|
||||
Get child comments under a root comment
|
||||
Args:
|
||||
root_comment_id:
|
||||
offset:
|
||||
@@ -287,11 +287,11 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuComment]:
|
||||
"""
|
||||
获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
||||
Get all root-level comments for a specified post, this method will retrieve all comment information under a post
|
||||
Args:
|
||||
content: 内容详情对象(问题|文章|视频)
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
content: Content detail object (question|article|video)
|
||||
crawl_interval: Crawl delay interval in seconds
|
||||
callback: Callback after completing one crawl
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -328,12 +328,12 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuComment]:
|
||||
"""
|
||||
获取指定评论下的所有子评论
|
||||
Get all sub-comments under specified comments
|
||||
Args:
|
||||
content: 内容详情对象(问题|文章|视频)
|
||||
comments: 评论列表
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
content: Content detail object (question|article|video)
|
||||
comments: Comment list
|
||||
crawl_interval: Crawl delay interval in seconds
|
||||
callback: Callback after completing one crawl
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -370,7 +370,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_info(self, url_token: str) -> Optional[ZhihuCreator]:
|
||||
"""
|
||||
获取创作者信息
|
||||
Get creator information
|
||||
Args:
|
||||
url_token:
|
||||
|
||||
@@ -383,7 +383,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_answers(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
|
||||
"""
|
||||
获取创作者的回答
|
||||
Get creator's answers
|
||||
Args:
|
||||
url_token:
|
||||
offset:
|
||||
@@ -405,7 +405,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_articles(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
|
||||
"""
|
||||
获取创作者的文章
|
||||
Get creator's articles
|
||||
Args:
|
||||
url_token:
|
||||
offset:
|
||||
@@ -426,7 +426,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_videos(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
|
||||
"""
|
||||
获取创作者的视频
|
||||
Get creator's videos
|
||||
Args:
|
||||
url_token:
|
||||
offset:
|
||||
@@ -446,11 +446,11 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_all_anwser_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[ZhihuContent]:
|
||||
"""
|
||||
获取创作者的所有回答
|
||||
Get all answers by creator
|
||||
Args:
|
||||
creator: 创作者信息
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
creator: Creator information
|
||||
crawl_interval: Crawl delay interval in seconds
|
||||
callback: Callback after completing one crawl
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -481,7 +481,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuContent]:
|
||||
"""
|
||||
获取创作者的所有文章
|
||||
Get all articles by creator
|
||||
Args:
|
||||
creator:
|
||||
crawl_interval:
|
||||
@@ -515,7 +515,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuContent]:
|
||||
"""
|
||||
获取创作者的所有视频
|
||||
Get all videos by creator
|
||||
Args:
|
||||
creator:
|
||||
crawl_interval:
|
||||
@@ -548,7 +548,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
answer_id: str,
|
||||
) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
获取回答信息
|
||||
Get answer information
|
||||
Args:
|
||||
question_id:
|
||||
answer_id:
|
||||
@@ -562,7 +562,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
获取文章信息
|
||||
Get article information
|
||||
Args:
|
||||
article_id:
|
||||
|
||||
@@ -575,7 +575,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
获取视频信息
|
||||
Get video information
|
||||
Args:
|
||||
video_id:
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
|
||||
self._extractor = ZhihuExtractor()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
async def start(self) -> None:
|
||||
"""
|
||||
@@ -80,9 +80,9 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Choose launch mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[ZhihuCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[ZhihuCrawler] Launching browser in CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -90,7 +90,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[ZhihuCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[ZhihuCrawler] Launching browser in standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
@@ -117,9 +117,9 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
browser_context=self.browser_context
|
||||
)
|
||||
|
||||
# 知乎的搜索接口需要打开搜索页面之后cookies才能访问API,单独的首页不行
|
||||
# Zhihu's search API requires opening the search page first to access cookies, homepage alone won't work
|
||||
utils.logger.info(
|
||||
"[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies,该过程需要5秒左右"
|
||||
"[ZhihuCrawler.start] Zhihu navigating to search page to get search page cookies, this process takes about 5 seconds"
|
||||
)
|
||||
await self.context_page.goto(
|
||||
f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
|
||||
@@ -273,7 +273,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
)
|
||||
await zhihu_store.save_creator(creator=createor_info)
|
||||
|
||||
# 默认只提取回答信息,如果需要文章和视频,把下面的注释打开即可
|
||||
# By default, only answer information is extracted, uncomment below if articles and videos are needed
|
||||
|
||||
# Get all anwser information of the creator
|
||||
all_content_list = await self.zhihu_client.get_all_anwser_by_creator(
|
||||
@@ -315,7 +315,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
|
||||
)
|
||||
# judge note type
|
||||
# Judge note type
|
||||
note_type: str = judge_zhihu_url(full_note_url)
|
||||
if note_type == constant.ANSWER_NAME:
|
||||
question_id = full_note_url.split("/")[-3]
|
||||
@@ -412,7 +412,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
|
||||
proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh
|
||||
)
|
||||
return zhihu_client_obj
|
||||
|
||||
@@ -440,7 +440,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=user_agent,
|
||||
channel="chrome", # 使用系统的Chrome稳定版
|
||||
channel="chrome", # Use system Chrome stable version
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
@@ -458,7 +458,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
Launch browser using CDP mode
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
@@ -469,15 +469,15 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[ZhihuCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[ZhihuCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhihuCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
utils.logger.error(f"[ZhihuCrawler] CDP mode launch failed, falling back to standard mode: {e}")
|
||||
# Fall back to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(
|
||||
chromium, playwright_proxy, user_agent, headless
|
||||
@@ -485,7 +485,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# Special handling if using CDP mode
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
|
||||
@@ -26,31 +26,31 @@ from constant import zhihu as zhihu_constant
|
||||
|
||||
class SearchTime(Enum):
|
||||
"""
|
||||
搜索时间范围
|
||||
Search time range
|
||||
"""
|
||||
DEFAULT = "" # 不限时间
|
||||
ONE_DAY = "a_day" # 一天内
|
||||
ONE_WEEK = "a_week" # 一周内
|
||||
ONE_MONTH = "a_month" # 一个月内
|
||||
THREE_MONTH = "three_months" # 三个月内
|
||||
HALF_YEAR = "half_a_year" # 半年内
|
||||
ONE_YEAR = "a_year" # 一年内
|
||||
DEFAULT = "" # No time limit
|
||||
ONE_DAY = "a_day" # Within one day
|
||||
ONE_WEEK = "a_week" # Within one week
|
||||
ONE_MONTH = "a_month" # Within one month
|
||||
THREE_MONTH = "three_months" # Within three months
|
||||
HALF_YEAR = "half_a_year" # Within half a year
|
||||
ONE_YEAR = "a_year" # Within one year
|
||||
|
||||
|
||||
class SearchType(Enum):
|
||||
"""
|
||||
搜索结果类型
|
||||
Search result type
|
||||
"""
|
||||
DEFAULT = "" # 不限类型
|
||||
ANSWER = zhihu_constant.ANSWER_NAME # 只看回答
|
||||
ARTICLE = zhihu_constant.ARTICLE_NAME # 只看文章
|
||||
VIDEO = zhihu_constant.VIDEO_NAME # 只看视频
|
||||
DEFAULT = "" # No type limit
|
||||
ANSWER = zhihu_constant.ANSWER_NAME # Answers only
|
||||
ARTICLE = zhihu_constant.ARTICLE_NAME # Articles only
|
||||
VIDEO = zhihu_constant.VIDEO_NAME # Videos only
|
||||
|
||||
|
||||
class SearchSort(Enum):
|
||||
"""
|
||||
搜索结果排序
|
||||
Search result sorting
|
||||
"""
|
||||
DEFAULT = "" # 综合排序
|
||||
UPVOTED_COUNT = "upvoted_count" # 最多赞同
|
||||
CREATE_TIME = "created_time" # 最新发布
|
||||
DEFAULT = "" # Default sorting
|
||||
UPVOTED_COUNT = "upvoted_count" # Most upvoted
|
||||
CREATE_TIME = "created_time" # Latest published
|
||||
|
||||
@@ -168,7 +168,7 @@ class ZhihuExtractor:
|
||||
"""
|
||||
res = ZhihuContent()
|
||||
|
||||
if "video" in zvideo and isinstance(zvideo.get("video"), dict): # 说明是从创作者主页的视频列表接口来的
|
||||
if "video" in zvideo and isinstance(zvideo.get("video"), dict): # This indicates data from the creator's homepage video list API
|
||||
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
|
||||
res.created_time = zvideo.get("published_at")
|
||||
res.updated_time = zvideo.get("updated_at")
|
||||
@@ -318,11 +318,11 @@ class ZhihuExtractor:
|
||||
|
||||
"""
|
||||
if gender == 1:
|
||||
return "男"
|
||||
return "Male"
|
||||
elif gender == 0:
|
||||
return "女"
|
||||
return "Female"
|
||||
else:
|
||||
return "未知"
|
||||
return "Unknown"
|
||||
|
||||
|
||||
def extract_creator(self, user_url_token: str, html_content: str) -> Optional[ZhihuCreator]:
|
||||
|
||||
Reference in New Issue
Block a user