i18n: translate all Chinese comments, docstrings, and logger messages to English

Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: the Chinese disclaimer header (lines 10-18) is intentionally left untranslated for legal compliance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
程序员阿江(Relakkes)
2025-12-26 23:27:19 +08:00
parent 1544d13dd5
commit 157ddfb21b
93 changed files with 1971 additions and 1955 deletions

View File

@@ -45,7 +45,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
def __init__(
self,
timeout=60, # 若开启爬取媒体选项xhs 的长视频需要更久的超时时间
timeout=60, # If media crawling is enabled, Xiaohongshu long videos need longer timeout
proxy=None,
*,
headers: Dict[str, str],
@@ -58,30 +58,30 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
self.headers = headers
self._host = "https://edith.xiaohongshu.com"
self._domain = "https://www.xiaohongshu.com"
self.IP_ERROR_STR = "网络连接异常,请检查网络设置或重启试试"
self.IP_ERROR_STR = "Network connection error, please check network settings or restart"
self.IP_ERROR_CODE = 300012
self.NOTE_ABNORMAL_STR = "笔记状态异常,请稍后查看"
self.NOTE_ABNORMAL_STR = "Note status abnormal, please check later"
self.NOTE_ABNORMAL_CODE = -510001
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
self._extractor = XiaoHongShuExtractor()
# 初始化代理池(来自 ProxyRefreshMixin
# Initialize proxy pool (from ProxyRefreshMixin)
self.init_proxy_pool(proxy_ip_pool)
async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: Optional[Dict] = None) -> Dict:
"""请求头参数签名(使用 playwright 注入方式)
"""Request header parameter signing (using playwright injection method)
Args:
url: 请求的URL
params: GET请求的参数
payload: POST请求的参数
url: Request URL
params: GET request parameters
payload: POST request parameters
Returns:
Dict: 请求头参数签名
Dict: Signed request header parameters
"""
a1_value = self.cookie_dict.get("a1", "")
# 确定请求数据、方法和 URI
# Determine request data, method and URI
if params is not None:
data = params
method = "GET"
@@ -91,7 +91,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
else:
raise ValueError("params or payload is required")
# 使用 playwright 注入方式生成签名
# Generate signature using playwright injection method
signs = await sign_with_playwright(
page=self.playwright_page,
uri=url,
@@ -112,16 +112,16 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def request(self, method, url, **kwargs) -> Union[str, Any]:
"""
封装httpx的公共请求方法对请求响应做一些处理
Wrapper for httpx common request method, processes request response
Args:
method: 请求方法
url: 请求的URL
**kwargs: 其他请求参数,例如请求头、请求体等
method: Request method
url: Request URL
**kwargs: Other request parameters, such as headers, body, etc.
Returns:
"""
# 每次请求前检测代理是否过期
# Check if proxy is expired before each request
await self._refresh_proxy_if_expired()
# return response.text
@@ -133,7 +133,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
# someday someone may be able to bypass the captcha
verify_type = response.headers["Verifytype"]
verify_uuid = response.headers["Verifyuuid"]
msg = f"出现验证码,请求失败,Verifytype: {verify_type}Verifyuuid: {verify_uuid}, Response: {response}"
msg = f"CAPTCHA appeared, request failed, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}, Response: {response}"
utils.logger.error(msg)
raise Exception(msg)
@@ -150,10 +150,10 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
async def get(self, uri: str, params: Optional[Dict] = None) -> Dict:
"""
GET请求,对请求头签名
GET request, signs request headers
Args:
uri: 请求路由
params: 请求参数
uri: Request route
params: Request parameters
Returns:
@@ -167,10 +167,10 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
"""
POST请求,对请求头签名
POST request, signs request headers
Args:
uri: 请求路由
data: 请求体参数
uri: Request route
data: Request body parameters
Returns:
@@ -186,7 +186,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
)
async def get_note_media(self, url: str) -> Union[bytes, None]:
# 请求前检测代理是否过期
# Check if proxy is expired before request
await self._refresh_proxy_if_expired()
async with httpx.AsyncClient(proxy=self.proxy) as client:
@@ -205,12 +205,12 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
) as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
utils.logger.error(
f"[XiaoHongShuClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}"
) # 保留原始异常类型名称,以便开发者调试
) # Keep original exception type name for developer debugging
return None
async def pong(self) -> bool:
"""
用于检查登录态是否失效了
Check if login state is still valid
Returns:
"""
@@ -218,7 +218,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
utils.logger.info("[XiaoHongShuClient.pong] Begin to pong xhs...")
ping_flag = False
try:
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
note_card: Dict = await self.get_note_by_keyword(keyword="Xiaohongshu")
if note_card.get("items"):
ping_flag = True
except Exception as e:
@@ -230,9 +230,9 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
async def update_cookies(self, browser_context: BrowserContext):
"""
API客户端提供的更新cookies方法一般情况下登录成功后会调用此方法
Update cookies method provided by API client, usually called after successful login
Args:
browser_context: 浏览器上下文对象
browser_context: Browser context object
Returns:
@@ -251,13 +251,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
note_type: SearchNoteType = SearchNoteType.ALL,
) -> Dict:
"""
根据关键词搜索笔记
Search notes by keyword
Args:
keyword: 关键词参数
page: 分页第几页
page_size: 分页数据长度
sort: 搜索结果排序指定
note_type: 搜索的笔记类型
keyword: Keyword parameter
page: Page number
page_size: Page data length
sort: Search result sorting specification
note_type: Type of note to search
Returns:
@@ -280,11 +280,11 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
xsec_token: str,
) -> Dict:
"""
获取笔记详情API
Get note detail API
Args:
note_id:笔记ID
xsec_source: 渠道来源
xsec_token: 搜索关键字之后返回的比较列表中返回的token
note_id: Note ID
xsec_source: Channel source
xsec_token: Token returned from search keyword result list
Returns:
@@ -304,7 +304,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
if res and res.get("items"):
res_dict: Dict = res["items"][0]["note_card"]
return res_dict
# 爬取频繁了可能会出现有的笔记能有结果有的没有
# When crawling frequently, some notes may have results while others don't
utils.logger.error(
f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}"
)
@@ -317,11 +317,11 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
cursor: str = "",
) -> Dict:
"""
获取一级评论的API
Get first-level comments API
Args:
note_id: 笔记ID
xsec_token: 验证token
cursor: 分页游标
note_id: Note ID
xsec_token: Verification token
cursor: Pagination cursor
Returns:
@@ -345,13 +345,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
cursor: str = "",
):
"""
获取指定父评论下的子评论的API
Get sub-comments under specified parent comment API
Args:
note_id: 子评论的帖子ID
root_comment_id: 根评论ID
xsec_token: 验证token
num: 分页数量
cursor: 分页游标
note_id: Post ID of sub-comments
root_comment_id: Root comment ID
xsec_token: Verification token
num: Pagination quantity
cursor: Pagination cursor
Returns:
@@ -377,13 +377,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
max_count: int = 10,
) -> List[Dict]:
"""
获取指定笔记下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
Get all first-level comments under the specified note. This method keeps fetching until it has found all comment information under the post.
Args:
note_id: 笔记ID
xsec_token: 验证token
crawl_interval: 爬取一次笔记的延迟单位(秒)
callback: 一次笔记爬取结束后
max_count: 一次笔记爬取的最大评论数量
note_id: Note ID
xsec_token: Verification token
crawl_interval: Crawl delay per note (seconds)
callback: Callback after one note crawl ends
max_count: Maximum number of comments to crawl per note
Returns:
"""
@@ -425,12 +425,12 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
callback: Optional[Callable] = None,
) -> List[Dict]:
"""
获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
Get all second-level comments under the specified first-level comments. This method keeps fetching until it has found all second-level comment information under each first-level comment.
Args:
comments: 评论列表
xsec_token: 验证token
crawl_interval: 爬取一次评论的延迟单位(秒)
callback: 一次评论爬取结束后
comments: Comment list
xsec_token: Verification token
crawl_interval: Crawl delay per comment (seconds)
callback: Callback after one comment crawl ends
Returns:
@@ -487,18 +487,18 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
self, user_id: str, xsec_token: str = "", xsec_source: str = ""
) -> Dict:
"""
通过解析网页版的用户主页HTML获取用户个人简要信息
PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的解析它即可
Get user profile brief information by parsing user homepage HTML
The PC version of the user homepage exposes its data in the window.__INITIAL_STATE__ variable, so parsing that variable is sufficient
Args:
user_id: 用户ID
xsec_token: 验证token (可选,如果URL中包含此参数则传入)
xsec_source: 渠道来源 (可选,如果URL中包含此参数则传入)
user_id: User ID
xsec_token: Verification token (optional, pass if included in URL)
xsec_source: Channel source (optional, pass if included in URL)
Returns:
Dict: 创作者信息
Dict: Creator information
"""
# 构建URI,如果有xsec参数则添加到URL中
# Build URI, add xsec parameters to URL if available
uri = f"/user/profile/{user_id}"
if xsec_token and xsec_source:
uri = f"{uri}?xsec_token={xsec_token}&xsec_source={xsec_source}"
@@ -517,13 +517,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
xsec_source: str = "pc_feed",
) -> Dict:
"""
获取博主的笔记
Get creator's notes
Args:
creator: 博主ID
cursor: 上一页最后一条笔记的ID
page_size: 分页数据长度
xsec_token: 验证token
xsec_source: 渠道来源
creator: Creator ID
cursor: Last note ID from previous page
page_size: Page data length
xsec_token: Verification token
xsec_source: Channel source
Returns:
@@ -547,13 +547,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
xsec_source: str = "pc_feed",
) -> List[Dict]:
"""
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
Get all posts published by the specified user. This method keeps fetching until it has found all post information for that user.
Args:
user_id: 用户ID
crawl_interval: 爬取一次的延迟单位(秒)
callback: 一次分页爬取结束后的更新回调函数
xsec_token: 验证token
xsec_source: 渠道来源
user_id: User ID
crawl_interval: Crawl delay (seconds)
callback: Update callback function after one pagination crawl ends
xsec_token: Verification token
xsec_source: Channel source
Returns:
@@ -602,9 +602,9 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
async def get_note_short_url(self, note_id: str) -> Dict:
"""
获取笔记的短链接
Get note short URL
Args:
note_id: 笔记ID
note_id: Note ID
Returns:
@@ -622,7 +622,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
enable_cookie: bool = False,
) -> Optional[Dict]:
"""
通过解析网页版的笔记详情页HTML获取笔记详情, 该接口可能会出现失败的情况这里尝试重试3次
Get note details by parsing the note detail page HTML. This interface may fail, so 3 retries are attempted here.
copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
thanks to ReaJason
Args: