Merge pull request #817 from wanzirong/dev

feat: 添加命令行参数控制评论爬取数量
This commit is contained in:
程序员阿江-Relakkes
2026-01-21 16:49:13 +08:00
committed by GitHub
2 changed files with 10 additions and 2 deletions

View File

@@ -250,6 +250,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
rich_help_panel="Basic Configuration",
),
] = "",
+max_comments_count_singlenotes: Annotated[
+int,
+typer.Option(
+"--max_comments_count_singlenotes",
+help="Maximum number of first-level comments to crawl per post/video",
+rich_help_panel="Comment Configuration",
+),
+] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
) -> SimpleNamespace:
"""MediaCrawler 命令行入口"""
@@ -274,6 +282,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
config.CDP_HEADLESS = enable_headless
config.SAVE_DATA_OPTION = save_data_option.value
config.COOKIES = cookies
+config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
# Set platform-specific ID lists for detail/creator mode
if specified_id_list:

View File

@@ -34,7 +34,6 @@ from tenacity import RetryError
import config
from base.base_crawler import AbstractCrawler
-from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
@@ -344,7 +343,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
xsec_token=xsec_token,
crawl_interval=crawl_interval,
callback=xhs_store.batch_update_xhs_note_comments,
-max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
+max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
# Sleep after fetching comments