mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-02-06 23:21:33 +08:00
feat: 添加命令行参数支持
- 添加 --max_comments_per_post 参数,用于控制每个帖子爬取的评论数量
- 添加 --xhs_sort_type 参数,用于控制小红书排序方式
- 修复小红书 core.py 中 CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES 的导入方式:从直接导入改为通过 config 模块访问,使命令行参数能正确生效
This commit is contained in:
@@ -250,6 +250,22 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
rich_help_panel="Basic Configuration",
|
||||
),
|
||||
] = "",
|
||||
max_comments_per_post: Annotated[
|
||||
int,
|
||||
typer.Option(
|
||||
"--max_comments_per_post",
|
||||
help="Maximum number of first-level comments to crawl per post/video",
|
||||
rich_help_panel="Comment Configuration",
|
||||
),
|
||||
] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
xhs_sort_type: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
"--xhs_sort_type",
|
||||
help="XiaoHongShu sort type (e.g., popularity_descending, time_descending)",
|
||||
rich_help_panel="Platform Specific Configuration",
|
||||
),
|
||||
] = "",
|
||||
) -> SimpleNamespace:
|
||||
"""MediaCrawler 命令行入口"""
|
||||
|
||||
@@ -274,6 +290,11 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
config.CDP_HEADLESS = enable_headless
|
||||
config.SAVE_DATA_OPTION = save_data_option.value
|
||||
config.COOKIES = cookies
|
||||
config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_per_post
|
||||
|
||||
# Set XiaoHongShu sort type if specified
|
||||
if xhs_sort_type and platform == PlatformEnum.XHS:
|
||||
config.SORT_TYPE = xhs_sort_type
|
||||
|
||||
# Set platform-specific ID lists for detail/creator mode
|
||||
if specified_id_list:
|
||||
|
||||
@@ -34,7 +34,6 @@ from tenacity import RetryError
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
-from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||
from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import xhs as xhs_store
|
||||
@@ -344,7 +343,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
xsec_token=xsec_token,
|
||||
crawl_interval=crawl_interval,
|
||||
callback=xhs_store.batch_update_xhs_note_comments,
|
||||
-max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
+max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
# Sleep after fetching comments
|
||||
|
||||
Reference in New Issue
Block a user