feat: 启动任务接口添加帖子/视频数量与评论数量覆盖支持

This commit is contained in:
钟保罗
2026-05-19 20:57:07 +08:00
parent f328ee35b5
commit ec432eb63e
4 changed files with 126 additions and 0 deletions

View File

@@ -71,6 +71,8 @@ class CrawlerStartRequest(BaseModel):
save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSONL
cookies: str = ""
headless: bool = False
max_notes_count: Optional[int] = None
max_comments_count: Optional[int] = None
class CrawlerStatusResponse(BaseModel):

View File

@@ -225,6 +225,12 @@ class CrawlerManager:
cmd.extend(["--get_comment", "true" if config.enable_comments else "false"])
cmd.extend(["--get_sub_comment", "true" if config.enable_sub_comments else "false"])
if config.max_notes_count is not None:
cmd.extend(["--crawler_max_notes_count", str(config.max_notes_count)])
if config.max_comments_count is not None:
cmd.extend(["--max_comments_count_singlenotes", str(config.max_comments_count)])
if config.cookies:
cmd.extend(["--cookies", config.cookies])

View File

@@ -275,6 +275,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
rich_help_panel="Comment Configuration",
),
] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
crawler_max_notes_count: Annotated[
int,
typer.Option(
"--crawler_max_notes_count",
help="Maximum number of videos/posts to crawl",
rich_help_panel="Basic Configuration",
),
] = config.CRAWLER_MAX_NOTES_COUNT,
max_concurrency_num: Annotated[
int,
typer.Option(
@@ -342,6 +350,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
config.SAVE_DATA_OPTION = save_data_option.value
config.COOKIES = cookies
config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
config.CRAWLER_MAX_NOTES_COUNT = crawler_max_notes_count
config.MAX_CONCURRENCY_NUM = max_concurrency_num
config.SAVE_DATA_PATH = save_data_path
config.ENABLE_IP_PROXY = enable_ip_proxy_value

109
tests/test_api_limits.py Normal file
View File

@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
import pytest
import config
from unittest.mock import AsyncMock, patch
from fastapi.testclient import TestClient
from cmd_arg import parse_cmd
from api.schemas import CrawlerStartRequest, PlatformEnum, LoginTypeEnum, CrawlerTypeEnum
from api.services.crawler_manager import CrawlerManager
from api.main import app
@pytest.mark.asyncio
async def test_cmd_arg_crawler_max_notes_count():
    """Check that the CLI limit flags given to parse_cmd end up in config.

    Runs parse_cmd with explicit values for the notes limit and the
    per-note comments limit, then asserts both config globals hold the
    parsed integers.
    """
    # Snapshot the two globals asserted on below so they can be put back
    # no matter how the test ends.
    saved_notes_limit = config.CRAWLER_MAX_NOTES_COUNT
    saved_comments_limit = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
    argv = [
        "--platform", "xhs",
        "--crawler_max_notes_count", "42",
        "--max_comments_count_singlenotes", "24",
    ]
    try:
        await parse_cmd(argv)
        assert config.CRAWLER_MAX_NOTES_COUNT == 42
        assert config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES == 24
    finally:
        # NOTE(review): only these two globals are restored; any other
        # config values parse_cmd assigns keep what this call set.
        config.CRAWLER_MAX_NOTES_COUNT = saved_notes_limit
        config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = saved_comments_limit
def test_crawler_manager_build_command():
    """Check that _build_command adds the limit flags only when they are set.

    Builds one request with both limits left as None and one with both
    limits set, and inspects the command list produced for each.
    """
    manager = CrawlerManager()
    notes_flag = "--crawler_max_notes_count"
    comments_flag = "--max_comments_count_singlenotes"
    # Fields shared by both requests; only the two limits differ.
    common_fields = dict(
        platform=PlatformEnum.XHS,
        login_type=LoginTypeEnum.QRCODE,
        crawler_type=CrawlerTypeEnum.SEARCH,
        keywords="test",
    )

    # Case 1: no limits in the request -> neither flag may appear.
    request_without_limits = CrawlerStartRequest(
        **common_fields,
        max_notes_count=None,
        max_comments_count=None,
    )
    command = manager._build_command(request_without_limits)
    assert notes_flag not in command
    assert comments_flag not in command

    # Case 2: both limits in the request -> each flag is followed by its
    # value rendered as a string.
    request_with_limits = CrawlerStartRequest(
        **common_fields,
        max_notes_count=50,
        max_comments_count=5,
    )
    command = manager._build_command(request_with_limits)
    assert notes_flag in command
    assert command[command.index(notes_flag) + 1] == "50"
    assert comments_flag in command
    assert command[command.index(comments_flag) + 1] == "5"
def test_api_start_crawler_with_limits():
    """Check that the start endpoint hands both limits to crawler_manager.start.

    The manager's start coroutine is replaced by an AsyncMock, so no
    crawler is launched; the test inspects the request object the
    endpoint passed to it.
    """
    api_client = TestClient(app)
    payload = {
        "platform": "xhs",
        "login_type": "qrcode",
        "crawler_type": "search",
        "keywords": "test",
        "max_notes_count": 50,
        "max_comments_count": 5,
    }
    with patch(
        "api.routers.crawler.crawler_manager.start",
        new_callable=AsyncMock,
    ) as start_mock:
        start_mock.return_value = True

        reply = api_client.post("/api/crawler/start", json=payload)

        assert reply.status_code == 200
        assert reply.json() == {"status": "ok", "message": "Crawler started successfully"}

        # The first positional argument of the single call is the parsed
        # request model.
        start_mock.assert_called_once()
        forwarded_request = start_mock.call_args.args[0]
        assert forwarded_request.platform == PlatformEnum.XHS
        assert forwarded_request.max_notes_count == 50
        assert forwarded_request.max_comments_count == 5
def test_api_start_crawler_without_limits():
    """Check that omitting both limits leaves them as None on the request.

    The manager's start coroutine is replaced by an AsyncMock, so no
    crawler is launched; the test inspects the request object the
    endpoint passed to it.
    """
    api_client = TestClient(app)
    # Payload deliberately leaves out max_notes_count / max_comments_count.
    payload = {
        "platform": "xhs",
        "login_type": "qrcode",
        "crawler_type": "search",
        "keywords": "test",
    }
    with patch(
        "api.routers.crawler.crawler_manager.start",
        new_callable=AsyncMock,
    ) as start_mock:
        start_mock.return_value = True

        reply = api_client.post("/api/crawler/start", json=payload)

        assert reply.status_code == 200

        start_mock.assert_called_once()
        forwarded_request = start_mock.call_args.args[0]
        assert forwarded_request.platform == PlatformEnum.XHS
        assert forwarded_request.max_notes_count is None
        assert forwarded_request.max_comments_count is None