mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-07 02:17:25 +08:00
i18n: translate all Chinese comments, docstrings, and logger messages to English
Comprehensive translation of Chinese text to English across the entire codebase:
- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: Chinese disclaimer header (lines 10-18) for legal compliance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:40
|
||||
# @Desc : 微博爬虫 API 请求 client
|
||||
# @Desc : Weibo crawler API request client
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
@@ -49,7 +49,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间
|
||||
timeout=60, # If media crawling is enabled, Weibo images need a longer timeout
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
@@ -64,12 +64,12 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
self._image_agent_host = "https://i1.wp.com/"
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
@retry(stop=stop_after_attempt(5), wait=wait_fixed(3))
|
||||
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check if proxy is expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
enable_return_response = kwargs.pop("return_response", False)
|
||||
@@ -82,7 +82,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
try:
|
||||
data: Dict = response.json()
|
||||
except json.decoder.JSONDecodeError:
|
||||
# issue: #771 搜索接口会报错432, 多次重试 + 更新 h5 cookies
|
||||
# issue: #771 Search API returns error 432, retry multiple times + update h5 cookies
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err code: {response.status_code} res:{response.text}")
|
||||
await self.playwright_page.goto(self._host)
|
||||
await asyncio.sleep(2)
|
||||
@@ -156,9 +156,9 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
search note by keyword
|
||||
:param keyword: 微博搜搜的关键词
|
||||
:param page: 分页参数 -当前页码
|
||||
:param search_type: 搜索的类型,见 weibo/filed.py 中的枚举SearchType
|
||||
:param keyword: Search keyword for Weibo
|
||||
:param page: Pagination parameter - current page number
|
||||
:param search_type: Search type, see SearchType enum in weibo/field.py
|
||||
:return:
|
||||
"""
|
||||
uri = "/api/container/getIndex"
|
||||
@@ -172,9 +172,9 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
|
||||
async def get_note_comments(self, mid_id: str, max_id: int, max_id_type: int = 0) -> Dict:
|
||||
"""get notes comments
|
||||
:param mid_id: 微博ID
|
||||
:param max_id: 分页参数ID
|
||||
:param max_id_type: 分页参数ID类型
|
||||
:param mid_id: Weibo ID
|
||||
:param max_id: Pagination parameter ID
|
||||
:param max_id_type: Pagination parameter ID type
|
||||
:return:
|
||||
"""
|
||||
uri = "/comments/hotflow"
|
||||
@@ -218,7 +218,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
is_end = max_id == 0
|
||||
if len(result) + len(comment_list) > max_count:
|
||||
comment_list = comment_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If callback function exists, execute it
|
||||
await callback(note_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(comment_list)
|
||||
@@ -233,7 +233,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取评论的所有子评论
|
||||
Get all sub-comments of comments
|
||||
Args:
|
||||
note_id:
|
||||
comment_list:
|
||||
@@ -256,7 +256,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
|
||||
async def get_note_info_by_id(self, note_id: str) -> Dict:
|
||||
"""
|
||||
根据帖子ID获取详情
|
||||
Get note details by note ID
|
||||
:param note_id:
|
||||
:return:
|
||||
"""
|
||||
@@ -273,22 +273,22 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
note_item = {"mblog": note_detail}
|
||||
return note_item
|
||||
else:
|
||||
utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
|
||||
utils.logger.info(f"[WeiboClient.get_note_info_by_id] $render_data value not found")
|
||||
return dict()
|
||||
|
||||
async def get_note_image(self, image_url: str) -> bytes:
|
||||
image_url = image_url[8:] # 去掉 https://
|
||||
image_url = image_url[8:] # Remove https://
|
||||
sub_url = image_url.split("/")
|
||||
image_url = ""
|
||||
for i in range(len(sub_url)):
|
||||
if i == 1:
|
||||
image_url += "large/" # 都获取高清大图
|
||||
image_url += "large/" # Get high-resolution images
|
||||
elif i == len(sub_url) - 1:
|
||||
image_url += sub_url[i]
|
||||
else:
|
||||
image_url += sub_url[i] + "/"
|
||||
# 微博图床对外存在防盗链,所以需要代理访问
|
||||
# 由于微博图片是通过 i1.wp.com 来访问的,所以需要拼接一下
|
||||
# Weibo image hosting has anti-hotlinking, so proxy access is needed
|
||||
# Since Weibo images are accessed through i1.wp.com, we need to concatenate the URL
|
||||
final_uri = (f"{self._image_agent_host}"
|
||||
f"{image_url}")
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
@@ -301,18 +301,18 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
else:
|
||||
return response.content
|
||||
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # Keep original exception type name for developer debugging
|
||||
return None
|
||||
|
||||
async def get_creator_container_info(self, creator_id: str) -> Dict:
|
||||
"""
|
||||
获取用户的容器ID, 容器信息代表着真实请求的API路径
|
||||
fid_container_id:用户的微博详情API的容器ID
|
||||
lfid_container_id:用户的微博列表API的容器ID
|
||||
Get user's container ID, container information represents the real API request path
|
||||
fid_container_id: Container ID for user's Weibo detail API
|
||||
lfid_container_id: Container ID for user's Weibo list API
|
||||
Args:
|
||||
creator_id:
|
||||
creator_id: User ID
|
||||
|
||||
Returns: {
|
||||
Returns: Dictionary with container IDs
|
||||
|
||||
"""
|
||||
response = await self.get(f"/u/{creator_id}", return_response=True)
|
||||
@@ -324,7 +324,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_info_by_id(self, creator_id: str) -> Dict:
|
||||
"""
|
||||
根据用户ID获取用户详情
|
||||
Get user details by user ID
|
||||
Args:
|
||||
creator_id:
|
||||
|
||||
@@ -349,11 +349,11 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
since_id: str = "0",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取博主的笔记
|
||||
Get creator's notes
|
||||
Args:
|
||||
creator: 博主ID
|
||||
container_id: 容器ID
|
||||
since_id: 上一页最后一条笔记的ID
|
||||
creator: Creator ID
|
||||
container_id: Container ID
|
||||
since_id: ID of the last note from previous page
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -376,14 +376,14 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
||||
Get all posts published by a specified user, this method will continuously fetch all posts from a user
|
||||
Args:
|
||||
creator_id:
|
||||
container_id:
|
||||
crawl_interval:
|
||||
callback:
|
||||
creator_id: Creator user ID
|
||||
container_id: Container ID for the user
|
||||
crawl_interval: Interval between requests in seconds
|
||||
callback: Optional callback function to process notes
|
||||
|
||||
Returns:
|
||||
Returns: List of all notes
|
||||
|
||||
"""
|
||||
result = []
|
||||
@@ -393,7 +393,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
while notes_has_more:
|
||||
notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
|
||||
if not notes_res:
|
||||
utils.logger.error(f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
|
||||
utils.logger.error(f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by Weibo, so they cannot access the data.")
|
||||
break
|
||||
since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
|
||||
if "cards" not in notes_res:
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:41
|
||||
# @Desc : 微博爬虫主流程代码
|
||||
# @Desc : Weibo crawler main workflow code
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
@@ -63,7 +63,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.mobile_user_agent = utils.get_mobile_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
@@ -73,9 +73,9 @@ class WeiboCrawler(AbstractCrawler):
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Select launch mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[WeiboCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[WeiboCrawler] Launching browser with CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -83,7 +83,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[WeiboCrawler] Launching browser with standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
|
||||
@@ -109,11 +109,11 @@ class WeiboCrawler(AbstractCrawler):
|
||||
)
|
||||
await login_obj.begin()
|
||||
|
||||
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
|
||||
# After successful login, redirect to mobile website and update mobile cookies
|
||||
utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
|
||||
await self.context_page.goto(self.mobile_index_url)
|
||||
await asyncio.sleep(3)
|
||||
# 只获取移动端的 cookies,避免 PC 端和移动端 cookies 混淆
|
||||
# Only get mobile cookies to avoid confusion between PC and mobile cookies
|
||||
await self.wb_client.update_cookies(
|
||||
browser_context=self.browser_context,
|
||||
urls=[self.mobile_index_url]
|
||||
@@ -170,7 +170,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
|
||||
note_id_list: List[str] = []
|
||||
note_list = filter_search_result_card(search_res.get("cards"))
|
||||
# 如果开启了全文获取功能,则批量获取帖子全文
|
||||
# If full text fetching is enabled, batch get full text of posts
|
||||
note_list = await self.batch_get_notes_full_text(note_list)
|
||||
for note_item in note_list:
|
||||
if note_item:
|
||||
@@ -315,9 +315,9 @@ class WeiboCrawler(AbstractCrawler):
|
||||
raise DataFetchError("Get creator info error")
|
||||
await weibo_store.save_creator(user_id, user_info=createor_info)
|
||||
|
||||
# 创建一个包装 callback,在保存数据前获取全文
|
||||
# Create a wrapper callback to get full text before saving data
|
||||
async def save_notes_with_full_text(note_list: List[Dict]):
|
||||
# 如果开启了全文获取功能,先批量获取全文
|
||||
# If full text fetching is enabled, batch get full text first
|
||||
updated_note_list = await self.batch_get_notes_full_text(note_list)
|
||||
await weibo_store.batch_update_weibo_notes(updated_note_list)
|
||||
|
||||
@@ -350,7 +350,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
|
||||
proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh
|
||||
)
|
||||
return weibo_client_obj
|
||||
|
||||
@@ -375,7 +375,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
channel="chrome", # 使用系统的Chrome稳定版
|
||||
channel="chrome", # Use system's Chrome stable version
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
@@ -391,7 +391,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
Launch browser with CDP mode
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
@@ -402,24 +402,24 @@ class WeiboCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[WeiboCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[WeiboCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
utils.logger.error(f"[WeiboCrawler] CDP mode startup failed, falling back to standard mode: {e}")
|
||||
# Fallback to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def get_note_full_text(self, note_item: Dict) -> Dict:
|
||||
"""
|
||||
获取帖子全文内容
|
||||
如果帖子内容被截断(isLongText=True),则请求详情接口获取完整内容
|
||||
:param note_item: 帖子数据,包含 mblog 字段
|
||||
:return: 更新后的帖子数据
|
||||
Get full text content of a post
|
||||
If the post content is truncated (isLongText=True), request the detail API to get complete content
|
||||
:param note_item: Post data, contains mblog field
|
||||
:return: Updated post data
|
||||
"""
|
||||
if not config.ENABLE_WEIBO_FULL_TEXT:
|
||||
return note_item
|
||||
@@ -428,7 +428,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
if not mblog:
|
||||
return note_item
|
||||
|
||||
# 检查是否是长文本
|
||||
# Check if it's a long text
|
||||
is_long_text = mblog.get("isLongText", False)
|
||||
if not is_long_text:
|
||||
return note_item
|
||||
@@ -441,11 +441,11 @@ class WeiboCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[WeiboCrawler.get_note_full_text] Fetching full text for note: {note_id}")
|
||||
full_note = await self.wb_client.get_note_info_by_id(note_id)
|
||||
if full_note and full_note.get("mblog"):
|
||||
# 用完整内容替换原始内容
|
||||
# Replace original content with complete content
|
||||
note_item["mblog"] = full_note["mblog"]
|
||||
utils.logger.info(f"[WeiboCrawler.get_note_full_text] Successfully fetched full text for note: {note_id}")
|
||||
|
||||
# 请求后休眠,避免风控
|
||||
# Sleep after request to avoid rate limiting
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[WeiboCrawler.get_note_full_text] Failed to fetch full text for note {note_id}: {ex}")
|
||||
@@ -456,9 +456,9 @@ class WeiboCrawler(AbstractCrawler):
|
||||
|
||||
async def batch_get_notes_full_text(self, note_list: List[Dict]) -> List[Dict]:
|
||||
"""
|
||||
批量获取帖子全文内容
|
||||
:param note_list: 帖子列表
|
||||
:return: 更新后的帖子列表
|
||||
Batch get full text content of posts
|
||||
:param note_list: List of posts
|
||||
:return: Updated list of posts
|
||||
"""
|
||||
if not config.ENABLE_WEIBO_FULL_TEXT:
|
||||
return note_list
|
||||
@@ -471,7 +471,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# Special handling if using CDP mode
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
|
||||
@@ -26,14 +26,14 @@ from enum import Enum
|
||||
|
||||
|
||||
class SearchType(Enum):
|
||||
# 综合
|
||||
# Comprehensive
|
||||
DEFAULT = "1"
|
||||
|
||||
# 实时
|
||||
# Real-time
|
||||
REAL_TIME = "61"
|
||||
|
||||
# 热门
|
||||
# Popular
|
||||
POPULAR = "60"
|
||||
|
||||
# 视频
|
||||
# Video
|
||||
VIDEO = "64"
|
||||
|
||||
@@ -28,9 +28,9 @@ from typing import Dict, List
|
||||
|
||||
def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
|
||||
"""
|
||||
过滤微博搜索的结果,只保留card_type为9类型的数据
|
||||
:param card_list:
|
||||
:return:
|
||||
Filter Weibo search results, only keep data with card_type of 9
|
||||
:param card_list: List of card items from search results
|
||||
:return: Filtered list of note items
|
||||
"""
|
||||
note_list: List[Dict] = []
|
||||
for card_item in card_list:
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:42
|
||||
# @Desc : 微博登录实现
|
||||
# @Desc : Weibo login implementation
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
|
||||
Reference in New Issue
Block a user