i18n: translate all Chinese comments, docstrings, and logger messages to English

Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: Chinese disclaimer header (lines 10-18) for legal compliance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
程序员阿江(Relakkes)
2025-12-26 23:27:19 +08:00
parent 1544d13dd5
commit 157ddfb21b
93 changed files with 1971 additions and 1955 deletions

View File

@@ -60,14 +60,14 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
self.default_headers = headers
self.cookie_dict = cookie_dict
self._extractor = ZhihuExtractor()
# 初始化代理池(来自 ProxyRefreshMixin
# Initialize proxy pool (from ProxyRefreshMixin)
self.init_proxy_pool(proxy_ip_pool)
async def _pre_headers(self, url: str) -> Dict:
"""
请求头参数签名
Sign request headers
Args:
url: 请求的URL需要包含请求的参数
url: Request URL; it must include the query parameters
Returns:
"""
@@ -83,16 +83,16 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def request(self, method, url, **kwargs) -> Union[str, Any]:
"""
封装httpx的公共请求方法对请求响应做一些处理
Common request method wrapping httpx, with some handling of the response
Args:
method: 请求方法
url: 请求的URL
**kwargs: 其他请求参数,例如请求头、请求体等
method: Request method
url: Request URL
**kwargs: Other request parameters such as headers, body, etc.
Returns:
"""
# 每次请求前检测代理是否过期
# Check if proxy is expired before each request
await self._refresh_proxy_if_expired()
# return response.text
@@ -105,7 +105,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
utils.logger.error(f"[ZhiHuClient.request] Requset Url: {url}, Request error: {response.text}")
if response.status_code == 403:
raise ForbiddenError(response.text)
elif response.status_code == 404: # 如果一个content没有评论也是404
elif response.status_code == 404: # Content without comments also returns 404
return {}
raise DataFetchError(response.text)
@@ -124,10 +124,10 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, str]:
"""
GET请求,对请求头签名
GET request with header signing
Args:
uri: 请求路由
params: 请求参数
uri: Request URI
params: Request parameters
Returns:
@@ -141,7 +141,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def pong(self) -> bool:
"""
用于检查登录态是否失效了
Check if login status is still valid
Returns:
"""
@@ -161,9 +161,9 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def update_cookies(self, browser_context: BrowserContext):
"""
API客户端提供的更新cookies方法一般情况下登录成功后会调用此方法
Update cookies method provided by API client, typically called after successful login
Args:
browser_context: 浏览器上下文对象
browser_context: Browser context object
Returns:
@@ -174,7 +174,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def get_current_user_info(self) -> Dict:
"""
获取当前登录用户信息
Get current logged-in user information
Returns:
"""
@@ -191,14 +191,14 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
search_time: SearchTime = SearchTime.DEFAULT,
) -> List[ZhihuContent]:
"""
根据关键词搜索
Search by keyword
Args:
keyword: 关键词
page: 第几页
page_size: 分页size
sort: 排序
note_type: 搜索结果类型
search_time: 搜索多久时间的结果
keyword: Search keyword
page: Page number
page_size: Page size
sort: Sorting method
note_type: Search result type
search_time: Time range for search results
Returns:
@@ -232,10 +232,10 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
order_by: str = "score",
) -> Dict:
"""
获取内容的一级评论
Get root-level comments for content
Args:
content_id: 内容ID
content_type: 内容类型(answer, article, zvideo)
content_id: Content ID
content_type: Content type (answer, article, zvideo)
offset:
limit:
order_by:
@@ -262,7 +262,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
order_by: str = "sort",
) -> Dict:
"""
获取一级评论下的子评论
Get child comments under a root comment
Args:
root_comment_id:
offset:
@@ -287,11 +287,11 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
callback: Optional[Callable] = None,
) -> List[ZhihuComment]:
"""
获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
Get all root-level comments for a specified post. This method keeps fetching until all comment information under the post has been retrieved
Args:
content: 内容详情对象(问题|文章|视频)
crawl_interval: 爬取一次笔记的延迟单位(秒)
callback: 一次笔记爬取结束后
content: Content detail object (question|article|video)
crawl_interval: Crawl delay interval in seconds
callback: Callback after completing one crawl
Returns:
@@ -328,12 +328,12 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
callback: Optional[Callable] = None,
) -> List[ZhihuComment]:
"""
获取指定评论下的所有子评论
Get all sub-comments under specified comments
Args:
content: 内容详情对象(问题|文章|视频)
comments: 评论列表
crawl_interval: 爬取一次笔记的延迟单位(秒)
callback: 一次笔记爬取结束后
content: Content detail object (question|article|video)
comments: Comment list
crawl_interval: Crawl delay interval in seconds
callback: Callback after completing one crawl
Returns:
@@ -370,7 +370,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def get_creator_info(self, url_token: str) -> Optional[ZhihuCreator]:
"""
获取创作者信息
Get creator information
Args:
url_token:
@@ -383,7 +383,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def get_creator_answers(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
"""
获取创作者的回答
Get creator's answers
Args:
url_token:
offset:
@@ -405,7 +405,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def get_creator_articles(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
"""
获取创作者的文章
Get creator's articles
Args:
url_token:
offset:
@@ -426,7 +426,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def get_creator_videos(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
"""
获取创作者的视频
Get creator's videos
Args:
url_token:
offset:
@@ -446,11 +446,11 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def get_all_anwser_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[ZhihuContent]:
"""
获取创作者的所有回答
Get all answers by creator
Args:
creator: 创作者信息
crawl_interval: 爬取一次笔记的延迟单位(秒)
callback: 一次笔记爬取结束后
creator: Creator information
crawl_interval: Crawl delay interval in seconds
callback: Callback after completing one crawl
Returns:
@@ -481,7 +481,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
callback: Optional[Callable] = None,
) -> List[ZhihuContent]:
"""
获取创作者的所有文章
Get all articles by creator
Args:
creator:
crawl_interval:
@@ -515,7 +515,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
callback: Optional[Callable] = None,
) -> List[ZhihuContent]:
"""
获取创作者的所有视频
Get all videos by creator
Args:
creator:
crawl_interval:
@@ -548,7 +548,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
answer_id: str,
) -> Optional[ZhihuContent]:
"""
获取回答信息
Get answer information
Args:
question_id:
answer_id:
@@ -562,7 +562,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
"""
获取文章信息
Get article information
Args:
article_id:
@@ -575,7 +575,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
"""
获取视频信息
Get video information
Args:
video_id:

View File

@@ -61,7 +61,7 @@ class ZhihuCrawler(AbstractCrawler):
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
self._extractor = ZhihuExtractor()
self.cdp_manager = None
self.ip_proxy_pool = None # 代理IP池用于代理自动刷新
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
async def start(self) -> None:
"""
@@ -80,9 +80,9 @@ class ZhihuCrawler(AbstractCrawler):
)
async with async_playwright() as playwright:
# 根据配置选择启动模式
# Choose launch mode based on configuration
if config.ENABLE_CDP_MODE:
utils.logger.info("[ZhihuCrawler] 使用CDP模式启动浏览器")
utils.logger.info("[ZhihuCrawler] Launching browser in CDP mode")
self.browser_context = await self.launch_browser_with_cdp(
playwright,
playwright_proxy_format,
@@ -90,7 +90,7 @@ class ZhihuCrawler(AbstractCrawler):
headless=config.CDP_HEADLESS,
)
else:
utils.logger.info("[ZhihuCrawler] 使用标准模式启动浏览器")
utils.logger.info("[ZhihuCrawler] Launching browser in standard mode")
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(
@@ -117,9 +117,9 @@ class ZhihuCrawler(AbstractCrawler):
browser_context=self.browser_context
)
# 知乎的搜索接口需要打开搜索页面之后cookies才能访问API单独的首页不行
# Zhihu's search API is only accessible with cookies obtained after opening the search page; the homepage alone is not enough
utils.logger.info(
"[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies该过程需要5秒左右"
"[ZhihuCrawler.start] Zhihu navigating to search page to get search page cookies, this process takes about 5 seconds"
)
await self.context_page.goto(
f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
@@ -273,7 +273,7 @@ class ZhihuCrawler(AbstractCrawler):
)
await zhihu_store.save_creator(creator=createor_info)
# 默认只提取回答信息,如果需要文章和视频,把下面的注释打开即可
# By default, only answer information is extracted; uncomment the code below if articles and videos are also needed
# Get all answer information of the creator
all_content_list = await self.zhihu_client.get_all_anwser_by_creator(
@@ -315,7 +315,7 @@ class ZhihuCrawler(AbstractCrawler):
utils.logger.info(
f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
)
# judge note type
# Judge note type
note_type: str = judge_zhihu_url(full_note_url)
if note_type == constant.ANSWER_NAME:
question_id = full_note_url.split("/")[-3]
@@ -412,7 +412,7 @@ class ZhihuCrawler(AbstractCrawler):
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh
)
return zhihu_client_obj
@@ -440,7 +440,7 @@ class ZhihuCrawler(AbstractCrawler):
proxy=playwright_proxy, # type: ignore
viewport={"width": 1920, "height": 1080},
user_agent=user_agent,
channel="chrome", # 使用系统的Chrome稳定版
channel="chrome", # Use system Chrome stable version
)
return browser_context
else:
@@ -458,7 +458,7 @@ class ZhihuCrawler(AbstractCrawler):
headless: bool = True,
) -> BrowserContext:
"""
使用CDP模式启动浏览器
Launch browser using CDP mode
"""
try:
self.cdp_manager = CDPBrowserManager()
@@ -469,15 +469,15 @@ class ZhihuCrawler(AbstractCrawler):
headless=headless,
)
# 显示浏览器信息
# Display browser information
browser_info = await self.cdp_manager.get_browser_info()
utils.logger.info(f"[ZhihuCrawler] CDP浏览器信息: {browser_info}")
utils.logger.info(f"[ZhihuCrawler] CDP browser info: {browser_info}")
return browser_context
except Exception as e:
utils.logger.error(f"[ZhihuCrawler] CDP模式启动失败,回退到标准模式: {e}")
# 回退到标准模式
utils.logger.error(f"[ZhihuCrawler] CDP mode launch failed, falling back to standard mode: {e}")
# Fall back to standard mode
chromium = playwright.chromium
return await self.launch_browser(
chromium, playwright_proxy, user_agent, headless
@@ -485,7 +485,7 @@ class ZhihuCrawler(AbstractCrawler):
async def close(self):
"""Close browser context"""
# 如果使用CDP模式需要特殊处理
# Special handling if using CDP mode
if self.cdp_manager:
await self.cdp_manager.cleanup()
self.cdp_manager = None

View File

@@ -26,31 +26,31 @@ from constant import zhihu as zhihu_constant
class SearchTime(Enum):
"""
搜索时间范围
Search time range
"""
DEFAULT = "" # 不限时间
ONE_DAY = "a_day" # 一天内
ONE_WEEK = "a_week" # 一周内
ONE_MONTH = "a_month" # 一个月内
THREE_MONTH = "three_months" # 三个月内
HALF_YEAR = "half_a_year" # 半年内
ONE_YEAR = "a_year" # 一年内
DEFAULT = "" # No time limit
ONE_DAY = "a_day" # Within one day
ONE_WEEK = "a_week" # Within one week
ONE_MONTH = "a_month" # Within one month
THREE_MONTH = "three_months" # Within three months
HALF_YEAR = "half_a_year" # Within half a year
ONE_YEAR = "a_year" # Within one year
class SearchType(Enum):
"""
搜索结果类型
Search result type
"""
DEFAULT = "" # 不限类型
ANSWER = zhihu_constant.ANSWER_NAME # 只看回答
ARTICLE = zhihu_constant.ARTICLE_NAME # 只看文章
VIDEO = zhihu_constant.VIDEO_NAME # 只看视频
DEFAULT = "" # No type limit
ANSWER = zhihu_constant.ANSWER_NAME # Answers only
ARTICLE = zhihu_constant.ARTICLE_NAME # Articles only
VIDEO = zhihu_constant.VIDEO_NAME # Videos only
class SearchSort(Enum):
"""
搜索结果排序
Search result sorting
"""
DEFAULT = "" # 综合排序
UPVOTED_COUNT = "upvoted_count" # 最多赞同
CREATE_TIME = "created_time" # 最新发布
DEFAULT = "" # Default sorting
UPVOTED_COUNT = "upvoted_count" # Most upvoted
CREATE_TIME = "created_time" # Latest published

View File

@@ -168,7 +168,7 @@ class ZhihuExtractor:
"""
res = ZhihuContent()
if "video" in zvideo and isinstance(zvideo.get("video"), dict): # 说明是从创作者主页的视频列表接口来的
if "video" in zvideo and isinstance(zvideo.get("video"), dict): # This indicates data from the creator's homepage video list API
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
res.created_time = zvideo.get("published_at")
res.updated_time = zvideo.get("updated_at")
@@ -318,11 +318,11 @@ class ZhihuExtractor:
"""
if gender == 1:
return ""
return "Male"
elif gender == 0:
return ""
return "Female"
else:
return "未知"
return "Unknown"
def extract_creator(self, user_url_token: str, html_content: str) -> Optional[ZhihuCreator]: