docs: translate comments and metadata to English

Update Chinese comments, variable descriptions, and metadata across
multiple configuration and core files to English. This improves
codebase accessibility for international developers. Additionally,
remove the sponsorship section from the README files.
This commit is contained in:
程序员阿江(Relakkes)
2026-02-12 05:30:11 +08:00
parent 257743b016
commit d614ccf247
20 changed files with 140 additions and 181 deletions

View File

@@ -17,107 +17,107 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 基础配置
PLATFORM = "xhs" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔
# Basic configuration
PLATFORM = "xhs" # Platform, xhs | dy | ks | bili | wb | tieba | zhihu
KEYWORDS = "编程副业,编程兼职" # Keyword search configuration, separated by English commas
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
CRAWLER_TYPE = (
"search" # 爬取类型search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
"search" # Crawling type, search (keyword search) | detail (post details) | creator (creator homepage data)
)
# 是否开启 IP 代理
# Whether to enable IP proxy
ENABLE_IP_PROXY = False
# 代理IP池数量
# Number of proxy IP pools
IP_PROXY_POOL_COUNT = 2
# 代理IP提供商名称
# Proxy IP provider name
IP_PROXY_PROVIDER_NAME = "kuaidaili" # kuaidaili | wandouhttp
# 设置为True不会打开浏览器无头浏览器
# 设置False会打开一个浏览器
# 小红书如果一直扫码登录不通过,打开浏览器手动过一下滑动验证码
# 抖音如果一直提示失败,打开浏览器看下是否扫码登录之后出现了手机号验证,如果出现了手动过一下再试。
# Setting to True will not open the browser (headless browser)
# Setting False will open a browser
# If Xiaohongshu QR-code login keeps failing, open the browser and manually complete the slider CAPTCHA.
# If Douyin keeps prompting failure, open the browser and see if mobile phone number verification appears after scanning the QR code to log in. If it does, manually go through it and try again.
HEADLESS = False
# 是否保存登录状态
# Whether to save login status
SAVE_LOGIN_STATE = True
# ==================== CDP (Chrome DevTools Protocol) 配置 ====================
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取提供更好的反检测能力
# 启用后将自动检测并启动用户的Chrome/Edge浏览器通过CDP协议进行控制
# 这种方式使用真实的浏览器环境包括用户的扩展、Cookie和设置大大降低被检测的风险
# ==================== CDP (Chrome DevTools Protocol) Configuration ====================
# Whether to enable CDP mode - use the user's existing Chrome/Edge browser to crawl, providing better anti-detection capabilities
# Once enabled, the user's Chrome/Edge browser will be automatically detected and started, and controlled through the CDP protocol.
# This method uses the real browser environment, including the user's extensions, cookies and settings, greatly reducing the risk of detection.
ENABLE_CDP_MODE = True
# CDP调试端口,用于与浏览器通信
# 如果端口被占用,系统会自动尝试下一个可用端口
# CDP debug port, used to communicate with the browser
# If the port is occupied, the system will automatically try the next available port
CDP_DEBUG_PORT = 9222
# 自定义浏览器路径(可选)
# 如果为空,系统会自动检测Chrome/Edge的安装路径
# Windows示例: "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
# macOS示例: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
# Custom browser path (optional)
# If it is empty, the system will automatically detect the installation path of Chrome/Edge
# Windows example: "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
# macOS example: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
CUSTOM_BROWSER_PATH = ""
# CDP模式下是否启用无头模式
# 注意即使设置为True某些反检测功能在无头模式下可能效果不佳
# Whether to enable headless mode in CDP mode
# NOTE: Even if set to True, some anti-detection features may not work well in headless mode
CDP_HEADLESS = False
# 浏览器启动超时时间(秒)
# Browser startup timeout (seconds)
BROWSER_LAUNCH_TIMEOUT = 60
# 是否在程序结束时自动关闭浏览器
# 设置为False可以保持浏览器运行便于调试
# Whether to automatically close the browser when the program ends
# Set to False to keep the browser running for easy debugging
AUTO_CLOSE_BROWSER = True
# 数据保存类型选项配置,支持六种类型csv、db、jsonsqliteexcelpostgres, 最好保存到DB有排重的功能。
# Data saving type options; six types are supported: csv, db, json, sqlite, excel, postgres. Saving to a DB is recommended because it deduplicates records.
SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres
# 数据保存路径,默认不指定,则保存到data文件夹下
# Data saving path, if not specified by default, it will be saved to the data folder.
SAVE_DATA_PATH = ""
# 用户浏览器缓存的浏览器文件配置
# Browser file configuration cached by the user's browser
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
# 爬取开始页数 默认从第一页开始
# Start page for crawling; defaults to the first page.
START_PAGE = 1
# 爬取视频/帖子的数量控制
# Control the number of crawled videos/posts
CRAWLER_MAX_NOTES_COUNT = 15
# 并发爬虫数量控制
# Controlling the number of concurrent crawlers
MAX_CONCURRENCY_NUM = 1
# 是否开启爬媒体模式(包含图片或视频资源),默认不开启爬媒体
# Whether to enable media crawling mode (downloading image or video resources); disabled by default.
ENABLE_GET_MEIDAS = False
# 是否开启爬评论模式, 默认开启爬评论
# Whether to enable comment crawling mode. Comment crawling is enabled by default.
ENABLE_GET_COMMENTS = True
# 爬取一级评论的数量控制(单视频/帖子)
# Control the number of crawled first-level comments (single video/post)
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10
# 是否开启爬二级评论模式, 默认不开启爬二级评论
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
# Whether to enable the mode of crawling second-level comments. By default, crawling of second-level comments is not enabled.
# If the old version of the project uses db, you need to refer to schema/tables.sql line 287 to add table fields.
ENABLE_GET_SUB_COMMENTS = False
# 词云相关
# 是否开启生成评论词云图
# word cloud related
# Whether to enable generating comment word clouds
ENABLE_GET_WORDCLOUD = False
# 自定义词语及其分组
# 添加规则xx:yy 其中xx为自定义添加的词组yy为将xx该词组分到的组名。
# Custom words and their groups
# Add rule: xx:yy where xx is a custom-added phrase, and yy is the group name to which the phrase xx is assigned.
CUSTOM_WORDS = {
"零几": "年份", # 将“零几”识别为一个整体
"高频词": "专业术语", # 示例自定义词
"零几": "年份", # Treat "零几" (colloquial for the years 2000–2009) as a single word
"高频词": "专业术语", # Example custom words
}
# 停用(禁用)词文件路径
# Stopword (excluded words) file path
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
# 中文字体文件路径
# Chinese font file path
FONT_PATH = "./docs/STZHONGS.TTF"
# 爬取间隔时间
# Crawl interval
CRAWLER_MAX_SLEEP_SEC = 2
from .bilibili_config import *

View File

@@ -16,15 +16,15 @@
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# bilili 平台配置
# bilibili platform configuration
# 每天爬取视频/帖子的数量控制
# Control the number of videos/posts crawled per day
MAX_NOTES_PER_DAY = 1
# 指定B站视频URL列表 (支持完整URL或BV号)
# 示例:
# - 完整URL: "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
# - BV: "BV1d54y1g7db"
# Specify Bilibili video URL list (supports complete URL or BV number)
# Example:
# - Full URL: "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
# - BV number: "BV1d54y1g7db"
BILI_SPECIFIED_ID_LIST = [
"https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click",
"BV1Sz4y1U77N",
@@ -32,9 +32,9 @@ BILI_SPECIFIED_ID_LIST = [
# ........................
]
# 指定B站创作者URL列表 (支持完整URL或UID)
# 示例:
# - 完整URL: "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
# Specify the URL list of Bilibili creators (supports full URL or UID)
# Example:
# - Full URL: "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
# - UID: "20813884"
BILI_CREATOR_ID_LIST = [
"https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0",
@@ -42,26 +42,26 @@ BILI_CREATOR_ID_LIST = [
# ........................
]
# 指定时间范围
# Specify time range
START_DAY = "2024-01-01"
END_DAY = "2024-01-01"
# 搜索模式
# Search mode
BILI_SEARCH_MODE = "normal"
# 视频清晰度qn配置常见取值
# 16=360p, 32=480p, 64=720p, 80=1080p, 112=1080p高码率, 116=1080p60, 120=4K
# 注意:更高清晰度需要账号/视频本身支持
# Video definition (qn) configuration, common values:
# 16=360p, 32=480p, 64=720p, 80=1080p, 112=1080p high bit rate, 116=1080p60, 120=4K
# Note: Higher definition requires account/video support
BILI_QN = 80
# 是否爬取用户信息
# Whether to crawl user information
CREATOR_MODE = True
# 开始爬取用户信息页码
# Starting page number for crawling user information
START_CONTACTS_PAGE = 1
# 单个视频/帖子最大爬取评论数
# Maximum number of crawled comments for a single video/post
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
# 单个视频/帖子最大爬取动态数
# Maximum number of crawled dynamics for a single video/post
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50

View File

@@ -17,16 +17,16 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 抖音平台配置
# Douyin platform configuration
PUBLISH_TIME_TYPE = 0
# 指定DY视频URL列表 (支持多种格式)
# 支持格式:
# 1. 完整视频URL: "https://www.douyin.com/video/7525538910311632128"
# 2. modal_id的URL: "https://www.douyin.com/user/xxx?modal_id=7525538910311632128"
# 3. 搜索页带modal_id: "https://www.douyin.com/root/search/python?modal_id=7525538910311632128"
# 4. 短链接: "https://v.douyin.com/drIPtQ_WPWY/"
# 5. 纯视频ID: "7280854932641664319"
# Specify DY video URL list (supports multiple formats)
# Supported formats:
# 1. Full video URL: "https://www.douyin.com/video/7525538910311632128"
# 2. URL with modal_id: "https://www.douyin.com/user/xxx?modal_id=7525538910311632128"
# 3. Search page URL with modal_id: "https://www.douyin.com/root/search/python?modal_id=7525538910311632128"
# 4. Short link: "https://v.douyin.com/drIPtQ_WPWY/"
# 5. Pure video ID: "7280854932641664319"
DY_SPECIFIED_ID_LIST = [
"https://www.douyin.com/video/7525538910311632128",
"https://v.douyin.com/drIPtQ_WPWY/",
@@ -35,9 +35,9 @@ DY_SPECIFIED_ID_LIST = [
# ........................
]
# 指定DY创作者URL列表 (支持完整URL或sec_user_id)
# 支持格式:
# 1. 完整创作者主页URL: "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main"
# Specify DY creator URL list (supports full URL or sec_user_id)
# Supported formats:
# 1. Complete creator homepage URL: "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main"
# 2. sec_user_id: "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
DY_CREATOR_ID_LIST = [
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",

View File

@@ -17,22 +17,22 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 快手平台配置
# Kuaishou platform configuration
# 指定快手视频URL列表 (支持完整URL或纯ID)
# 支持格式:
# 1. 完整视频URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
# 2. 纯视频ID: "3xf8enb8dbj6uig"
# Specify Kuaishou video URL list (supports complete URL or pure ID)
# Supported formats:
# 1. Full video URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
# 2. Pure video ID: "3xf8enb8dbj6uig"
KS_SPECIFIED_ID_LIST = [
"https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
"3xf8enb8dbj6uig",
# ........................
]
# 指定快手创作者URL列表 (支持完整URL或纯ID)
# 支持格式:
# 1. 创作者主页URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
# 2. user_id: "3x4sm73aye7jq7i"
# Specify Kuaishou creator URL list (supports full URL or pure ID)
# Supported formats:
# 1. Creator homepage URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
# 2. Pure user_id: "3x4sm73aye7jq7i"
KS_CREATOR_ID_LIST = [
"https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
"3x4sm73aye7jq7i",

View File

@@ -17,17 +17,17 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 贴吧平台配置
# Tieba platform configuration
# 指定贴吧ID列表
# Specify Tieba ID list
TIEBA_SPECIFIED_ID_LIST = []
# 指定贴吧名称列表
# Specify a list of Tieba names
TIEBA_NAME_LIST = [
# "盗墓笔记"
# "盗墓笔记" # example; keep the Chinese name — it must match the forum's actual name on Tieba
]
# 指定贴吧用户URL列表
# Specify Tieba user URL list
TIEBA_CREATOR_URL_LIST = [
"https://tieba.baidu.com/home/main/?id=tb.1.7f139e2e.6CyEwxu3VJruH_-QqpCi6g&fr=frs",
# ........................

View File

@@ -18,23 +18,23 @@
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 微博平台配置
# Weibo platform configuration
# 搜索类型,具体的枚举值在media_platform/weibo/field.py
# Search type, the specific enumeration value is in media_platform/weibo/field.py
WEIBO_SEARCH_TYPE = "default"
# 指定微博ID列表
# Specify Weibo ID list
WEIBO_SPECIFIED_ID_LIST = [
"4982041758140155",
# ........................
]
# 指定微博用户ID列表
# Specify Weibo user ID list
WEIBO_CREATOR_ID_LIST = [
"5756404150",
# ........................
]
# 是否开启微博爬取全文的功能,默认开启
# 如果开启的话会增加被风控的概率,相当于一个关键词搜索请求会再遍历所有帖子的时候,再请求一次帖子详情
# Whether to enable the function of crawling the full text of Weibo. It is enabled by default.
# If enabled, it increases the probability of triggering risk control: while a keyword search iterates over all matched posts, an extra request is made for each post's details.
ENABLE_WEIBO_FULL_TEXT = True

View File

@@ -18,18 +18,18 @@
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 小红书平台配置
# Xiaohongshu platform configuration
# 排序方式,具体的枚举值在media_platform/xhs/field.py
# Sorting method, the specific enumeration value is in media_platform/xhs/field.py
SORT_TYPE = "popularity_descending"
# 指定笔记URL列表, 必须要携带xsec_token参数
# Specify the note URL list, which must carry the xsec_token parameter
XHS_SPECIFIED_NOTE_URL_LIST = [
"https://www.xiaohongshu.com/explore/64b95d01000000000c034587?xsec_token=AB0EFqJvINCkj6xOCKCQgfNNh8GdnBC_6XecG4QOddo3Q=&xsec_source=pc_cfeed"
# ........................
]
# 指定创作者URL列表需要携带xsec_tokenxsec_source参数
# Specify the creator URL list, which needs to carry xsec_token and xsec_source parameters.
XHS_CREATOR_ID_LIST = [
"https://www.xiaohongshu.com/user/profile/5f58bd990000000001003753?xsec_token=ABYVg1evluJZZzpMX-VWzchxQ1qSNVW3r-jOEnKqMcgZw=&xsec_source=pc_search"

View File

@@ -18,17 +18,17 @@
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 知乎平台配置
# Zhihu platform configuration
# 指定知乎用户URL列表
# Specify Zhihu user URL list
ZHIHU_CREATOR_URL_LIST = [
"https://www.zhihu.com/people/yd1234567",
# ........................
]
# 指定知乎ID列表
# Specify Zhihu ID list
ZHIHU_SPECIFIED_ID_LIST = [
"https://www.zhihu.com/question/826896610/answer/4885821440", # 回答
"https://zhuanlan.zhihu.com/p/673461588", # 文章
"https://www.zhihu.com/zvideo/1539542068422144000", # 视频
"https://www.zhihu.com/question/826896610/answer/4885821440", # answer
"https://zhuanlan.zhihu.com/p/673461588", # article
"https://www.zhihu.com/zvideo/1539542068422144000", # video
]