mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-07 02:17:25 +08:00
docs: translate comments and metadata to English
Update Chinese comments, variable descriptions, and metadata across multiple configuration and core files to English. This improves codebase accessibility for international developers. Additionally, removed the sponsorship section from README files.
This commit is contained in:
@@ -474,7 +474,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
|
||||
proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh
|
||||
)
|
||||
return bilibili_client_obj
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间
|
||||
timeout=60, # Douyin short videos need a longer timeout when the crawl-media option is enabled
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict,
|
||||
@@ -57,7 +57,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self._host = "https://www.douyin.com"
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
async def __process_req_params(
|
||||
@@ -103,7 +103,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
params.update(common_params)
|
||||
query_string = urllib.parse.urlencode(params)
|
||||
|
||||
# 20240927 a-bogus更新(JS版本)
|
||||
# 20240927 a-bogus update (JS version)
|
||||
post_data = {}
|
||||
if request_method == "POST":
|
||||
post_data = params
|
||||
@@ -113,7 +113,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
params["a_bogus"] = a_bogus
|
||||
|
||||
async def request(self, method, url, **kwargs):
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check whether the proxy has expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
@@ -266,13 +266,13 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
if len(result) + len(comments) > max_count:
|
||||
comments = comments[:max_count - len(result)]
|
||||
result.extend(comments)
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute the callback function
|
||||
await callback(aweme_id, comments)
|
||||
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not is_fetch_sub_comments:
|
||||
continue
|
||||
# 获取二级评论
|
||||
# Get second-level comments (replies to the top-level comments)
|
||||
for comment in comments:
|
||||
reply_comment_total = comment.get("reply_comment_total")
|
||||
|
||||
@@ -290,7 +290,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
if not sub_comments:
|
||||
continue
|
||||
result.extend(sub_comments)
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute the callback function
|
||||
await callback(aweme_id, sub_comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
return result
|
||||
@@ -343,7 +343,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
else:
|
||||
return response.content
|
||||
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # Keep the original exception type name for developers to debug
|
||||
return None
|
||||
|
||||
async def resolve_short_url(self, short_url: str) -> str:
|
||||
@@ -359,7 +359,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
utils.logger.info(f"[DouYinClient.resolve_short_url] Resolving short URL: {short_url}")
|
||||
response = await client.get(short_url, timeout=10)
|
||||
|
||||
# 短链接通常返回302重定向
|
||||
# Short links usually return a 302 redirect
|
||||
if response.status_code in [301, 302, 303, 307, 308]:
|
||||
redirect_url = response.headers.get("Location", "")
|
||||
utils.logger.info(f"[DouYinClient.resolve_short_url] Resolved to: {redirect_url}")
|
||||
|
||||
@@ -55,7 +55,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://www.douyin.com"
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
async def start(self) -> None:
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
@@ -65,7 +65,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Select startup mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[DouYinCrawler] 使用CDP模式启动浏览器")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
@@ -178,12 +178,12 @@ class DouYinCrawler(AbstractCrawler):
|
||||
try:
|
||||
video_info = parse_video_info_from_url(video_url)
|
||||
|
||||
# 处理短链接
|
||||
# Handling short links
|
||||
if video_info.url_type == "short":
|
||||
utils.logger.info(f"[DouYinCrawler.get_specified_awemes] Resolving short link: {video_url}")
|
||||
resolved_url = await self.dy_client.resolve_short_url(video_url)
|
||||
if resolved_url:
|
||||
# 从解析后的URL中提取视频ID
|
||||
# Extract video ID from parsed URL
|
||||
video_info = parse_video_info_from_url(resolved_url)
|
||||
utils.logger.info(f"[DouYinCrawler.get_specified_awemes] Short link resolved to aweme ID: {video_info.aweme_id}")
|
||||
else:
|
||||
@@ -240,7 +240,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
|
||||
async with semaphore:
|
||||
try:
|
||||
# 将关键词列表传递给 get_aweme_all_comments 方法
|
||||
# Pass the list of keywords to the get_aweme_all_comments method
|
||||
# Use fixed crawling interval
|
||||
crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
|
||||
await self.dy_client.get_aweme_all_comments(
|
||||
@@ -311,7 +311,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
|
||||
proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh
|
||||
)
|
||||
return douyin_client
|
||||
|
||||
@@ -361,10 +361,10 @@ class DouYinCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 添加反检测脚本
|
||||
# Add anti-detection script
|
||||
await self.cdp_manager.add_stealth_script()
|
||||
|
||||
# 显示浏览器信息
|
||||
# Show browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[DouYinCrawler] CDP浏览器信息: {browser_info}")
|
||||
|
||||
@@ -372,13 +372,13 @@ class DouYinCrawler(AbstractCrawler):
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
# Fall back to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# If you use CDP mode, special processing is required
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
@@ -396,11 +396,11 @@ class DouYinCrawler(AbstractCrawler):
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
utils.logger.info(f"[DouYinCrawler.get_aweme_media] Crawling image mode is not enabled")
|
||||
return
|
||||
# 笔记 urls 列表,若为短视频类型则返回为空列表
|
||||
# List of note image URLs; it is an empty list when the item is a short video.
|
||||
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
|
||||
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
|
||||
# The video URL will always exist, but when it is a short video type, the file is actually an audio file.
|
||||
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
|
||||
# TODO: 抖音并没采用音视频分离的策略,故音频可从原视频中分离,暂不提取
|
||||
# TODO: Douyin does not serve audio and video as separate streams, so the audio can be split out of the original video; it is not extracted for now.
|
||||
if note_download_url:
|
||||
await self.get_aweme_images(aweme_item)
|
||||
else:
|
||||
@@ -416,7 +416,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
return
|
||||
aweme_id = aweme_item.get("aweme_id")
|
||||
# 笔记 urls 列表,若为短视频类型则返回为空列表
|
||||
# List of note image URLs; it is an empty list when the item is a short video.
|
||||
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
|
||||
|
||||
if not note_download_url:
|
||||
@@ -444,7 +444,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
return
|
||||
aweme_id = aweme_item.get("aweme_id")
|
||||
|
||||
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
|
||||
# The video URL will always exist, but when it is a short video type, the file is actually an audio file.
|
||||
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
|
||||
|
||||
if not video_download_url:
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Name : Programmer Ajiang-Relakkes
|
||||
# @Time : 2024/6/10 02:24
|
||||
# @Desc : Get a_bogus parameter, for learning and communication only, do not use for commercial purposes, contact author to delete if infringement
|
||||
|
||||
|
||||
@@ -191,7 +191,7 @@ class DouYinLogin(AbstractLogin):
|
||||
await self.move_slider(back_selector, gap_selector, move_step, slider_level)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# If the slider is too slow or verification failed, it will prompt "操作过慢", click the refresh button here
|
||||
# If the slider is too slow or verification failed, it will prompt "The operation is too slow", click the refresh button here
|
||||
page_content = await self.context_page.content()
|
||||
if "操作过慢" in page_content or "提示重新操作" in page_content:
|
||||
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
|
||||
|
||||
@@ -504,7 +504,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
utils.logger.warning(
|
||||
f"[XiaoHongShuClient.get_comments_all_sub_comments] Failed to get sub-comments for note_id: {note_id}, root_comment_id: {root_comment_id}, error: {e}. Skipping this comment's sub-comments."
|
||||
)
|
||||
break # 跳出当前评论的子评论获取循环,继续处理下一个评论
|
||||
break # Break out of the sub-comment acquisition loop of the current comment and continue processing the next comment
|
||||
except Exception as e:
|
||||
utils.logger.error(
|
||||
f"[XiaoHongShuClient.get_comments_all_sub_comments] Unexpected error when getting sub-comments for note_id: {note_id}, root_comment_id: {root_comment_id}, error: {e}"
|
||||
@@ -514,7 +514,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
utils.logger.error(
|
||||
f"[XiaoHongShuClient.get_comments_all_sub_comments] Error processing comment: {comment.get('id', 'unknown')}, error: {e}. Continuing with next comment."
|
||||
)
|
||||
continue # 继续处理下一个评论
|
||||
continue # Continue to next comment
|
||||
return result
|
||||
|
||||
async def get_creator_info(
|
||||
|
||||
Reference in New Issue
Block a user