3 Commits

Author SHA1 Message Date
程序员阿江-Relakkes
94553fd818 Merge pull request #817 from wanzirong/dev
feat: 添加命令行参数控制评论爬取数量
2026-01-21 16:49:13 +08:00
wanzirong
90f72536ba refactor: 简化命令行参数命名
- 将 --max_comments_per_post 重命名为 --max_comments_count_singlenotes,与配置项名称保持一致
- 移除 --xhs_sort_type 参数(暂不需要)
- 保持代码简洁,减少不必要的功能
2026-01-21 16:30:07 +08:00
wanzirong
f7d27ab43a feat: 添加命令行参数支持
- 添加 --max_comments_per_post 参数用于控制每个帖子爬取的评论数量
- 添加 --xhs_sort_type 参数用于控制小红书排序方式
- 修复小红书 core.py 中 CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES 的导入方式
  从直接导入改为通过 config 模块访问,使命令行参数能正确生效
2026-01-21 16:23:47 +08:00
2 changed files with 10 additions and 2 deletions

View File

@@ -250,6 +250,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
rich_help_panel="Basic Configuration",
),
] = "",
max_comments_count_singlenotes: Annotated[
int,
typer.Option(
"--max_comments_count_singlenotes",
help="Maximum number of first-level comments to crawl per post/video",
rich_help_panel="Comment Configuration",
),
] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
) -> SimpleNamespace:
"""MediaCrawler 命令行入口"""
@@ -274,6 +282,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
config.CDP_HEADLESS = enable_headless
config.SAVE_DATA_OPTION = save_data_option.value
config.COOKIES = cookies
config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
# Set platform-specific ID lists for detail/creator mode
if specified_id_list:

View File

@@ -34,7 +34,6 @@ from tenacity import RetryError
import config
from base.base_crawler import AbstractCrawler
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
@@ -344,7 +343,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
xsec_token=xsec_token,
crawl_interval=crawl_interval,
callback=xhs_store.batch_update_xhs_note_comments,
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
# Sleep after fetching comments