mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-01 15:37:26 +08:00
feat: 启动任务接口添加帖子/视频数量与评论数量覆盖支持
This commit is contained in:
@@ -71,6 +71,8 @@ class CrawlerStartRequest(BaseModel):
|
|||||||
save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSONL
|
save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSONL
|
||||||
cookies: str = ""
|
cookies: str = ""
|
||||||
headless: bool = False
|
headless: bool = False
|
||||||
|
max_notes_count: Optional[int] = None
|
||||||
|
max_comments_count: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
class CrawlerStatusResponse(BaseModel):
|
class CrawlerStatusResponse(BaseModel):
|
||||||
|
|||||||
@@ -225,6 +225,12 @@ class CrawlerManager:
|
|||||||
cmd.extend(["--get_comment", "true" if config.enable_comments else "false"])
|
cmd.extend(["--get_comment", "true" if config.enable_comments else "false"])
|
||||||
cmd.extend(["--get_sub_comment", "true" if config.enable_sub_comments else "false"])
|
cmd.extend(["--get_sub_comment", "true" if config.enable_sub_comments else "false"])
|
||||||
|
|
||||||
|
if config.max_notes_count is not None:
|
||||||
|
cmd.extend(["--crawler_max_notes_count", str(config.max_notes_count)])
|
||||||
|
|
||||||
|
if config.max_comments_count is not None:
|
||||||
|
cmd.extend(["--max_comments_count_singlenotes", str(config.max_comments_count)])
|
||||||
|
|
||||||
if config.cookies:
|
if config.cookies:
|
||||||
cmd.extend(["--cookies", config.cookies])
|
cmd.extend(["--cookies", config.cookies])
|
||||||
|
|
||||||
|
|||||||
@@ -275,6 +275,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
|||||||
rich_help_panel="Comment Configuration",
|
rich_help_panel="Comment Configuration",
|
||||||
),
|
),
|
||||||
] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||||
|
crawler_max_notes_count: Annotated[
|
||||||
|
int,
|
||||||
|
typer.Option(
|
||||||
|
"--crawler_max_notes_count",
|
||||||
|
help="Maximum number of videos/posts to crawl",
|
||||||
|
rich_help_panel="Basic Configuration",
|
||||||
|
),
|
||||||
|
] = config.CRAWLER_MAX_NOTES_COUNT,
|
||||||
max_concurrency_num: Annotated[
|
max_concurrency_num: Annotated[
|
||||||
int,
|
int,
|
||||||
typer.Option(
|
typer.Option(
|
||||||
@@ -342,6 +350,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
|||||||
config.SAVE_DATA_OPTION = save_data_option.value
|
config.SAVE_DATA_OPTION = save_data_option.value
|
||||||
config.COOKIES = cookies
|
config.COOKIES = cookies
|
||||||
config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
|
config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
|
||||||
|
config.CRAWLER_MAX_NOTES_COUNT = crawler_max_notes_count
|
||||||
config.MAX_CONCURRENCY_NUM = max_concurrency_num
|
config.MAX_CONCURRENCY_NUM = max_concurrency_num
|
||||||
config.SAVE_DATA_PATH = save_data_path
|
config.SAVE_DATA_PATH = save_data_path
|
||||||
config.ENABLE_IP_PROXY = enable_ip_proxy_value
|
config.ENABLE_IP_PROXY = enable_ip_proxy_value
|
||||||
|
|||||||
109
tests/test_api_limits.py
Normal file
109
tests/test_api_limits.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import pytest
|
||||||
|
import config
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from cmd_arg import parse_cmd
|
||||||
|
from api.schemas import CrawlerStartRequest, PlatformEnum, LoginTypeEnum, CrawlerTypeEnum
|
||||||
|
from api.services.crawler_manager import CrawlerManager
|
||||||
|
from api.main import app
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_cmd_arg_crawler_max_notes_count():
    """Check that ``parse_cmd`` copies both crawl limits from argv onto ``config``.

    ``--crawler_max_notes_count`` must end up in
    ``config.CRAWLER_MAX_NOTES_COUNT`` and ``--max_comments_count_singlenotes``
    in ``config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES``, both as integers.
    """
    # parse_cmd assigns many module-level settings on ``config`` (cookies,
    # save options, concurrency, proxy flag, ...), not only the two limits
    # asserted below. Snapshot every upper-case setting so that nothing this
    # test changes can leak into tests that run afterwards.
    saved_settings = {
        name: getattr(config, name) for name in dir(config) if name.isupper()
    }

    try:
        await parse_cmd([
            "--platform", "xhs",
            "--crawler_max_notes_count", "42",
            "--max_comments_count_singlenotes", "24",
        ])
        assert config.CRAWLER_MAX_NOTES_COUNT == 42
        assert config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES == 24
    finally:
        # Restore unconditionally, even when an assertion above failed.
        for name, value in saved_settings.items():
            setattr(config, name, value)
|
||||||
|
|
||||||
|
def test_crawler_manager_build_command():
    """Verify how ``CrawlerManager._build_command`` handles the optional limits.

    When both limits are ``None`` neither CLI flag may appear in the command;
    when both are set, each flag must be present and immediately followed by
    the limit rendered as a string.
    """
    manager = CrawlerManager()
    notes_flag = "--crawler_max_notes_count"
    comments_flag = "--max_comments_count_singlenotes"

    # Scenario 1: the API request carries no limits.
    unlimited_request = CrawlerStartRequest(
        platform=PlatformEnum.XHS,
        login_type=LoginTypeEnum.QRCODE,
        crawler_type=CrawlerTypeEnum.SEARCH,
        keywords="test",
        max_notes_count=None,
        max_comments_count=None,
    )
    unlimited_cmd = manager._build_command(unlimited_request)
    assert notes_flag not in unlimited_cmd
    assert comments_flag not in unlimited_cmd

    # Scenario 2: the API request carries both limits.
    limited_request = CrawlerStartRequest(
        platform=PlatformEnum.XHS,
        login_type=LoginTypeEnum.QRCODE,
        crawler_type=CrawlerTypeEnum.SEARCH,
        keywords="test",
        max_notes_count=50,
        max_comments_count=5,
    )
    limited_cmd = manager._build_command(limited_request)
    for flag, expected_value in ((notes_flag, "50"), (comments_flag, "5")):
        assert flag in limited_cmd
        # The value must be the argument directly after its flag.
        assert limited_cmd[limited_cmd.index(flag) + 1] == expected_value
|
||||||
|
|
||||||
|
def test_api_start_crawler_with_limits():
    """POST ``/api/crawler/start`` with both limits and check they are forwarded.

    The manager's ``start`` coroutine is replaced by an ``AsyncMock`` so no
    crawler is launched; the test inspects the request object it was given.
    """
    http = TestClient(app)
    payload = {
        "platform": "xhs",
        "login_type": "qrcode",
        "crawler_type": "search",
        "keywords": "test",
        "max_notes_count": 50,
        "max_comments_count": 5,
    }

    with patch(
        "api.routers.crawler.crawler_manager.start",
        new_callable=AsyncMock,
        return_value=True,
    ) as mock_start:
        response = http.post("/api/crawler/start", json=payload)

    assert response.status_code == 200
    assert response.json() == {"status": "ok", "message": "Crawler started successfully"}

    # The endpoint must have delegated exactly once, passing the parsed request.
    mock_start.assert_called_once()
    forwarded = mock_start.call_args.args[0]
    assert forwarded.platform == PlatformEnum.XHS
    assert forwarded.max_notes_count == 50
    assert forwarded.max_comments_count == 5
|
||||||
|
|
||||||
|
def test_api_start_crawler_without_limits():
    """POST ``/api/crawler/start`` without limits and check both stay ``None``.

    The manager's ``start`` coroutine is replaced by an ``AsyncMock`` so no
    crawler is launched; the test inspects the request object it was given.
    """
    http = TestClient(app)
    payload = {
        "platform": "xhs",
        "login_type": "qrcode",
        "crawler_type": "search",
        "keywords": "test",
    }

    with patch(
        "api.routers.crawler.crawler_manager.start",
        new_callable=AsyncMock,
        return_value=True,
    ) as mock_start:
        response = http.post("/api/crawler/start", json=payload)

    assert response.status_code == 200

    # Omitted limit fields must reach the manager as None.
    mock_start.assert_called_once()
    forwarded = mock_start.call_args.args[0]
    assert forwarded.platform == PlatformEnum.XHS
    assert forwarded.max_notes_count is None
    assert forwarded.max_comments_count is None
|
||||||
Reference in New Issue
Block a user