mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-31 15:07:28 +08:00
feat: 启动任务接口添加帖子/视频数量与评论数量覆盖支持
This commit is contained in:
@@ -71,6 +71,8 @@ class CrawlerStartRequest(BaseModel):
|
||||
save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSONL
|
||||
cookies: str = ""
|
||||
headless: bool = False
|
||||
max_notes_count: Optional[int] = None
|
||||
max_comments_count: Optional[int] = None
|
||||
|
||||
|
||||
class CrawlerStatusResponse(BaseModel):
|
||||
|
||||
@@ -225,6 +225,12 @@ class CrawlerManager:
|
||||
cmd.extend(["--get_comment", "true" if config.enable_comments else "false"])
|
||||
cmd.extend(["--get_sub_comment", "true" if config.enable_sub_comments else "false"])
|
||||
|
||||
if config.max_notes_count is not None:
|
||||
cmd.extend(["--crawler_max_notes_count", str(config.max_notes_count)])
|
||||
|
||||
if config.max_comments_count is not None:
|
||||
cmd.extend(["--max_comments_count_singlenotes", str(config.max_comments_count)])
|
||||
|
||||
if config.cookies:
|
||||
cmd.extend(["--cookies", config.cookies])
|
||||
|
||||
|
||||
@@ -275,6 +275,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
rich_help_panel="Comment Configuration",
|
||||
),
|
||||
] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
crawler_max_notes_count: Annotated[
|
||||
int,
|
||||
typer.Option(
|
||||
"--crawler_max_notes_count",
|
||||
help="Maximum number of videos/posts to crawl",
|
||||
rich_help_panel="Basic Configuration",
|
||||
),
|
||||
] = config.CRAWLER_MAX_NOTES_COUNT,
|
||||
max_concurrency_num: Annotated[
|
||||
int,
|
||||
typer.Option(
|
||||
@@ -342,6 +350,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
config.SAVE_DATA_OPTION = save_data_option.value
|
||||
config.COOKIES = cookies
|
||||
config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
|
||||
config.CRAWLER_MAX_NOTES_COUNT = crawler_max_notes_count
|
||||
config.MAX_CONCURRENCY_NUM = max_concurrency_num
|
||||
config.SAVE_DATA_PATH = save_data_path
|
||||
config.ENABLE_IP_PROXY = enable_ip_proxy_value
|
||||
|
||||
109
tests/test_api_limits.py
Normal file
109
tests/test_api_limits.py
Normal file
@@ -0,0 +1,109 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import pytest
|
||||
import config
|
||||
from unittest.mock import AsyncMock, patch
|
||||
from fastapi.testclient import TestClient
|
||||
from cmd_arg import parse_cmd
|
||||
from api.schemas import CrawlerStartRequest, PlatformEnum, LoginTypeEnum, CrawlerTypeEnum
|
||||
from api.services.crawler_manager import CrawlerManager
|
||||
from api.main import app
|
||||
|
||||
@pytest.mark.asyncio
async def test_cmd_arg_crawler_max_notes_count():
    """Check that both count-limit CLI flags are written into ``config``.

    ``parse_cmd`` assigns a whole series of module-level settings on
    ``config`` (not only the two limits asserted here), so every upper-case
    setting is snapshotted up front and put back afterwards. Restoring only
    the two asserted values would leak the other overwritten settings into
    later tests.
    """
    # Shallow snapshot of every upper-case setting currently on ``config``.
    saved_settings = {
        name: getattr(config, name)
        for name in dir(config)
        if name.isupper()
    }

    try:
        await parse_cmd([
            "--platform", "xhs",
            "--crawler_max_notes_count", "42",
            "--max_comments_count_singlenotes", "24",
        ])
        assert config.CRAWLER_MAX_NOTES_COUNT == 42
        assert config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES == 24
    finally:
        # Put back everything parse_cmd may have touched, pass or fail.
        for name, value in saved_settings.items():
            setattr(config, name, value)
|
||||
|
||||
def test_crawler_manager_build_command():
    """Verify ``_build_command`` emits the limit flags only when limits are set."""
    manager = CrawlerManager()
    notes_flag = "--crawler_max_notes_count"
    comments_flag = "--max_comments_count_singlenotes"

    # Fields shared by both requests; only the two limits differ.
    shared_fields = dict(
        platform=PlatformEnum.XHS,
        login_type=LoginTypeEnum.QRCODE,
        crawler_type=CrawlerTypeEnum.SEARCH,
        keywords="test",
    )

    # Case 1: limits explicitly left unset -> neither flag may appear.
    unlimited_request = CrawlerStartRequest(
        **shared_fields,
        max_notes_count=None,
        max_comments_count=None,
    )
    unlimited_cmd = manager._build_command(unlimited_request)
    for flag in (notes_flag, comments_flag):
        assert flag not in unlimited_cmd

    # Case 2: both limits given -> each flag is followed by its string value.
    limited_request = CrawlerStartRequest(
        **shared_fields,
        max_notes_count=50,
        max_comments_count=5,
    )
    limited_cmd = manager._build_command(limited_request)
    for flag, expected_value in ((notes_flag, "50"), (comments_flag, "5")):
        assert flag in limited_cmd
        flag_position = limited_cmd.index(flag)
        assert limited_cmd[flag_position + 1] == expected_value
|
||||
|
||||
def test_api_start_crawler_with_limits():
    """POST /api/crawler/start with both limits and check they reach ``start``."""
    payload = {
        "platform": "xhs",
        "login_type": "qrcode",
        "crawler_type": "search",
        "keywords": "test",
        "max_notes_count": 50,
        "max_comments_count": 5,
    }
    expected_body = {"status": "ok", "message": "Crawler started successfully"}
    api_client = TestClient(app)

    # Replace the manager's start coroutine so no crawler process is launched.
    start_patch = patch(
        "api.routers.crawler.crawler_manager.start",
        new_callable=AsyncMock,
    )
    with start_patch as mock_start:
        mock_start.return_value = True

        reply = api_client.post("/api/crawler/start", json=payload)

        assert reply.status_code == 200
        assert reply.json() == expected_body

        mock_start.assert_called_once()
        # First positional argument is the parsed request model.
        forwarded = mock_start.call_args.args[0]
        assert forwarded.platform == PlatformEnum.XHS
        assert forwarded.max_notes_count == 50
        assert forwarded.max_comments_count == 5
|
||||
|
||||
def test_api_start_crawler_without_limits():
    """POST /api/crawler/start without limits and check both arrive as ``None``."""
    payload = {
        "platform": "xhs",
        "login_type": "qrcode",
        "crawler_type": "search",
        "keywords": "test",
    }
    api_client = TestClient(app)

    # Replace the manager's start coroutine so no crawler process is launched.
    start_patch = patch(
        "api.routers.crawler.crawler_manager.start",
        new_callable=AsyncMock,
    )
    with start_patch as mock_start:
        mock_start.return_value = True

        reply = api_client.post("/api/crawler/start", json=payload)

        assert reply.status_code == 200
        mock_start.assert_called_once()
        # First positional argument is the parsed request model.
        forwarded = mock_start.call_args.args[0]
        assert forwarded.platform == PlatformEnum.XHS
        assert forwarded.max_notes_count is None
        assert forwarded.max_comments_count is None
|
||||
Reference in New Issue
Block a user