mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-08 19:07:33 +08:00
i18n: translate all Chinese comments, docstrings, and logger messages to English
Comprehensive translation of Chinese text to English across the entire codebase:
- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: Chinese disclaimer header (lines 10-18) for legal compliance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -23,21 +23,21 @@ from enum import Enum
|
||||
|
||||
class SearchChannelType(Enum):
|
||||
"""search channel type"""
|
||||
GENERAL = "aweme_general" # 综合
|
||||
VIDEO = "aweme_video_web" # 视频
|
||||
USER = "aweme_user_web" # 用户
|
||||
LIVE = "aweme_live" # 直播
|
||||
GENERAL = "aweme_general" # General
|
||||
VIDEO = "aweme_video_web" # Video
|
||||
USER = "aweme_user_web" # User
|
||||
LIVE = "aweme_live" # Live
|
||||
|
||||
|
||||
class SearchSortType(Enum):
|
||||
"""search sort type"""
|
||||
GENERAL = 0 # 综合排序
|
||||
MOST_LIKE = 1 # 最多点赞
|
||||
LATEST = 2 # 最新发布
|
||||
GENERAL = 0 # Comprehensive sorting
|
||||
MOST_LIKE = 1 # Most likes
|
||||
LATEST = 2 # Latest published
|
||||
|
||||
class PublishTimeType(Enum):
|
||||
"""publish time type"""
|
||||
UNLIMITED = 0 # 不限
|
||||
ONE_DAY = 1 # 一天内
|
||||
ONE_WEEK = 7 # 一周内
|
||||
SIX_MONTH = 180 # 半年内
|
||||
UNLIMITED = 0 # Unlimited
|
||||
ONE_DAY = 1 # Within one day
|
||||
ONE_WEEK = 7 # Within one week
|
||||
SIX_MONTH = 180 # Within six months
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/10 02:24
|
||||
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
|
||||
# @Desc : Get the a_bogus parameter. For learning and communication only; do not use for commercial purposes. Contact the author for removal in case of infringement.
|
||||
|
||||
import random
|
||||
import re
|
||||
@@ -38,7 +38,7 @@ douyin_sign_obj = execjs.compile(open('libs/douyin.js', encoding='utf-8-sig').re
|
||||
|
||||
def get_web_id():
|
||||
"""
|
||||
生成随机的webid
|
||||
Generate random webid
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -60,13 +60,13 @@ def get_web_id():
|
||||
|
||||
async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Page = None):
|
||||
"""
|
||||
获取 a_bogus 参数, 目前不支持post请求类型的签名
|
||||
Get a_bogus parameter, currently does not support POST request type signature
|
||||
"""
|
||||
return get_a_bogus_from_js(url, params, user_agent)
|
||||
|
||||
def get_a_bogus_from_js(url: str, params: str, user_agent: str):
|
||||
"""
|
||||
通过js获取 a_bogus 参数
|
||||
Get a_bogus parameter through js
|
||||
Args:
|
||||
url:
|
||||
params:
|
||||
@@ -84,8 +84,8 @@ def get_a_bogus_from_js(url: str, params: str, user_agent: str):
|
||||
|
||||
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
|
||||
"""
|
||||
通过playright获取 a_bogus 参数
|
||||
playwright版本已失效
|
||||
Get a_bogus parameter through playwright
|
||||
The playwright-based version no longer works
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -100,73 +100,73 @@ async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: s
|
||||
|
||||
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
|
||||
"""
|
||||
从抖音视频URL中解析出视频ID
|
||||
支持以下格式:
|
||||
1. 普通视频链接: https://www.douyin.com/video/7525082444551310602
|
||||
2. 带modal_id参数的链接:
|
||||
Parse video ID from Douyin video URL
|
||||
Supports the following formats:
|
||||
1. Normal video link: https://www.douyin.com/video/7525082444551310602
|
||||
2. Link with modal_id parameter:
|
||||
- https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?modal_id=7525082444551310602
|
||||
- https://www.douyin.com/root/search/python?modal_id=7471165520058862848
|
||||
3. 短链接: https://v.douyin.com/iF12345ABC/ (需要client解析)
|
||||
4. 纯ID: 7525082444551310602
|
||||
3. Short link: https://v.douyin.com/iF12345ABC/ (requires client parsing)
|
||||
4. Pure ID: 7525082444551310602
|
||||
|
||||
Args:
|
||||
url: 抖音视频链接或ID
|
||||
url: Douyin video link or ID
|
||||
Returns:
|
||||
VideoUrlInfo: 包含视频ID的对象
|
||||
VideoUrlInfo: Object containing video ID
|
||||
"""
|
||||
# 如果是纯数字ID,直接返回
|
||||
# If it's a pure numeric ID, return directly
|
||||
if url.isdigit():
|
||||
return VideoUrlInfo(aweme_id=url, url_type="normal")
|
||||
|
||||
# 检查是否是短链接 (v.douyin.com)
|
||||
# Check if it's a short link (v.douyin.com)
|
||||
if "v.douyin.com" in url or url.startswith("http") and len(url) < 50 and "video" not in url:
|
||||
return VideoUrlInfo(aweme_id="", url_type="short") # 需要通过client解析
|
||||
return VideoUrlInfo(aweme_id="", url_type="short") # Requires client parsing
|
||||
|
||||
# 尝试从URL参数中提取modal_id
|
||||
# Try to extract modal_id from URL parameters
|
||||
params = extract_url_params_to_dict(url)
|
||||
modal_id = params.get("modal_id")
|
||||
if modal_id:
|
||||
return VideoUrlInfo(aweme_id=modal_id, url_type="modal")
|
||||
|
||||
# 从标准视频URL中提取ID: /video/数字
|
||||
# Extract ID from standard video URL: /video/number
|
||||
video_pattern = r'/video/(\d+)'
|
||||
match = re.search(video_pattern, url)
|
||||
if match:
|
||||
aweme_id = match.group(1)
|
||||
return VideoUrlInfo(aweme_id=aweme_id, url_type="normal")
|
||||
|
||||
raise ValueError(f"无法从URL中解析出视频ID: {url}")
|
||||
raise ValueError(f"Unable to parse video ID from URL: {url}")
|
||||
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
|
||||
"""
|
||||
从抖音创作者主页URL中解析出创作者ID (sec_user_id)
|
||||
支持以下格式:
|
||||
1. 创作者主页: https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main
|
||||
2. 纯ID: MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE
|
||||
Parse creator ID (sec_user_id) from Douyin creator homepage URL
|
||||
Supports the following formats:
|
||||
1. Creator homepage: https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main
|
||||
2. Pure ID: MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE
|
||||
|
||||
Args:
|
||||
url: 抖音创作者主页链接或sec_user_id
|
||||
url: Douyin creator homepage link or sec_user_id
|
||||
Returns:
|
||||
CreatorUrlInfo: 包含创作者ID的对象
|
||||
CreatorUrlInfo: Object containing creator ID
|
||||
"""
|
||||
# 如果是纯ID格式(通常以MS4wLjABAAAA开头),直接返回
|
||||
# If it's a pure ID format (usually starts with MS4wLjABAAAA), return directly
|
||||
if url.startswith("MS4wLjABAAAA") or (not url.startswith("http") and "douyin.com" not in url):
|
||||
return CreatorUrlInfo(sec_user_id=url)
|
||||
|
||||
# 从创作者主页URL中提取sec_user_id: /user/xxx
|
||||
# Extract sec_user_id from creator homepage URL: /user/xxx
|
||||
user_pattern = r'/user/([^/?]+)'
|
||||
match = re.search(user_pattern, url)
|
||||
if match:
|
||||
sec_user_id = match.group(1)
|
||||
return CreatorUrlInfo(sec_user_id=sec_user_id)
|
||||
|
||||
raise ValueError(f"无法从URL中解析出创作者ID: {url}")
|
||||
raise ValueError(f"Unable to parse creator ID from URL: {url}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 测试视频URL解析
|
||||
print("=== 视频URL解析测试 ===")
|
||||
# Test video URL parsing
|
||||
print("=== Video URL Parsing Test ===")
|
||||
test_urls = [
|
||||
"https://www.douyin.com/video/7525082444551310602",
|
||||
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525082444551310602",
|
||||
@@ -177,13 +177,13 @@ if __name__ == '__main__':
|
||||
try:
|
||||
result = parse_video_info_from_url(url)
|
||||
print(f"✓ URL: {url[:80]}...")
|
||||
print(f" 结果: {result}\n")
|
||||
print(f" Result: {result}\n")
|
||||
except Exception as e:
|
||||
print(f"✗ URL: {url}")
|
||||
print(f" 错误: {e}\n")
|
||||
print(f" Error: {e}\n")
|
||||
|
||||
# 测试创作者URL解析
|
||||
print("=== 创作者URL解析测试 ===")
|
||||
# Test creator URL parsing
|
||||
print("=== Creator URL Parsing Test ===")
|
||||
test_creator_urls = [
|
||||
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",
|
||||
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
|
||||
@@ -192,7 +192,7 @@ if __name__ == '__main__':
|
||||
try:
|
||||
result = parse_creator_info_from_url(url)
|
||||
print(f"✓ URL: {url[:80]}...")
|
||||
print(f" 结果: {result}\n")
|
||||
print(f" Result: {result}\n")
|
||||
except Exception as e:
|
||||
print(f"✗ URL: {url}")
|
||||
print(f" 错误: {e}\n")
|
||||
print(f" Error: {e}\n")
|
||||
|
||||
@@ -53,7 +53,7 @@ class DouYinLogin(AbstractLogin):
|
||||
async def begin(self):
|
||||
"""
|
||||
Start login douyin website
|
||||
滑块中间页面的验证准确率不太OK... 如果没有特俗要求,建议不开抖音登录,或者使用cookies登录
|
||||
Slider verification on the intermediate captcha page is not very accurate... Unless you have special requirements, it is recommended not to enable Douyin login, or to log in with cookies instead
|
||||
"""
|
||||
|
||||
# popup login dialog
|
||||
@@ -69,7 +69,7 @@ class DouYinLogin(AbstractLogin):
|
||||
else:
|
||||
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||
|
||||
# 如果页面重定向到滑动验证码页面,需要再次滑动滑块
|
||||
# If the page redirects to the slider verification page, need to slide again
|
||||
await asyncio.sleep(6)
|
||||
current_page_title = await self.context_page.title()
|
||||
if "验证码中间页" in current_page_title:
|
||||
@@ -147,10 +147,10 @@ class DouYinLogin(AbstractLogin):
|
||||
send_sms_code_btn = self.context_page.locator("xpath=//span[text() = '获取验证码']")
|
||||
await send_sms_code_btn.click()
|
||||
|
||||
# 检查是否有滑动验证码
|
||||
# Check if there is slider verification
|
||||
await self.check_page_display_slider(move_step=10, slider_level="easy")
|
||||
cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
max_get_sms_code_time = 60 * 2 # Maximum time to get verification code is 2 minutes
|
||||
while max_get_sms_code_time > 0:
|
||||
utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
@@ -164,20 +164,20 @@ class DouYinLogin(AbstractLogin):
|
||||
await sms_code_input_ele.fill(value=sms_code_value.decode())
|
||||
await asyncio.sleep(0.5)
|
||||
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
|
||||
await submit_btn_ele.click() # 点击登录
|
||||
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
|
||||
await submit_btn_ele.click() # Click login
|
||||
# todo ... should also check the correctness of the verification code, it may be incorrect
|
||||
break
|
||||
|
||||
async def check_page_display_slider(self, move_step: int = 10, slider_level: str = "easy"):
|
||||
"""
|
||||
检查页面是否出现滑动验证码
|
||||
Check if slider verification appears on the page
|
||||
:return:
|
||||
"""
|
||||
# 等待滑动验证码的出现
|
||||
# Wait for slider verification to appear
|
||||
back_selector = "#captcha-verify-image"
|
||||
try:
|
||||
await self.context_page.wait_for_selector(selector=back_selector, state="visible", timeout=30 * 1000)
|
||||
except PlaywrightTimeoutError: # 没有滑动验证码,直接返回
|
||||
except PlaywrightTimeoutError: # No slider verification, return directly
|
||||
return
|
||||
|
||||
gap_selector = 'xpath=//*[@id="captcha_container"]/div/div[2]/img[2]'
|
||||
@@ -191,16 +191,16 @@ class DouYinLogin(AbstractLogin):
|
||||
await self.move_slider(back_selector, gap_selector, move_step, slider_level)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮
|
||||
# If the slider is too slow or verification failed, it will prompt "操作过慢", click the refresh button here
|
||||
page_content = await self.context_page.content()
|
||||
if "操作过慢" in page_content or "提示重新操作" in page_content:
|
||||
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
|
||||
await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
|
||||
continue
|
||||
|
||||
# 滑动成功后,等待滑块消失
|
||||
# After successful sliding, wait for the slider to disappear
|
||||
await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
|
||||
# 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码
|
||||
# If the slider disappears, it means the verification is successful, break the loop. If not, it means the verification failed, the above line will throw an exception and be caught to continue the loop
|
||||
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
|
||||
slider_verify_success = True
|
||||
except Exception as e:
|
||||
@@ -213,10 +213,10 @@ class DouYinLogin(AbstractLogin):
|
||||
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
|
||||
"""
|
||||
Move the slider to the right to complete the verification
|
||||
:param back_selector: 滑动验证码背景图片的选择器
|
||||
:param gap_selector: 滑动验证码的滑块选择器
|
||||
:param move_step: 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
|
||||
:param slider_level: 滑块难度 easy hard,分别对应手机验证码的滑块和验证码中间的滑块
|
||||
:param back_selector: Selector for the slider verification background image
|
||||
:param gap_selector: Selector for the slider verification slider
|
||||
:param move_step: Controls the ratio of single movement speed, default is 1, meaning the distance moves in 0.1 seconds no matter how far, larger value means slower
|
||||
:param slider_level: Slider difficulty, "easy" or "hard", corresponding respectively to the slider shown for the mobile verification code and the slider on the intermediate captcha page
|
||||
:return:
|
||||
"""
|
||||
|
||||
@@ -234,31 +234,31 @@ class DouYinLogin(AbstractLogin):
|
||||
)
|
||||
gap_src = str(await gap_elements.get_property("src")) # type: ignore
|
||||
|
||||
# 识别滑块位置
|
||||
# Identify slider position
|
||||
slide_app = utils.Slide(gap=gap_src, bg=slide_back)
|
||||
distance = slide_app.discern()
|
||||
|
||||
# 获取移动轨迹
|
||||
# Get movement trajectory
|
||||
tracks = utils.get_tracks(distance, slider_level)
|
||||
new_1 = tracks[-1] - (sum(tracks) - distance)
|
||||
tracks.pop()
|
||||
tracks.append(new_1)
|
||||
|
||||
# 根据轨迹拖拽滑块到指定位置
|
||||
# Drag slider to specified position according to trajectory
|
||||
element = await self.context_page.query_selector(gap_selector)
|
||||
bounding_box = await element.bounding_box() # type: ignore
|
||||
|
||||
await self.context_page.mouse.move(bounding_box["x"] + bounding_box["width"] / 2, # type: ignore
|
||||
bounding_box["y"] + bounding_box["height"] / 2) # type: ignore
|
||||
# 这里获取到x坐标中心点位置
|
||||
# Get x coordinate center position
|
||||
x = bounding_box["x"] + bounding_box["width"] / 2 # type: ignore
|
||||
# 模拟滑动操作
|
||||
# Simulate sliding operation
|
||||
await element.hover() # type: ignore
|
||||
await self.context_page.mouse.down()
|
||||
|
||||
for track in tracks:
|
||||
# 循环鼠标按照轨迹移动
|
||||
# steps 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
|
||||
# Loop mouse movement according to trajectory
|
||||
# steps controls the ratio of single movement speed, default is 1, meaning the distance moves in 0.1 seconds no matter how far, larger value means slower
|
||||
await self.context_page.mouse.move(x + track, 0, steps=move_step)
|
||||
x += track
|
||||
await self.context_page.mouse.up()
|
||||
|
||||
Reference in New Issue
Block a user