mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-30 22:47:28 +08:00
The PR adds API limit overrides and static proxy support, but the review found that the default proxy provider changed to an invalid static placeholder and the new API fields accepted unbounded values. This keeps the existing proxy default intact, makes static proxy explicit via config or CLI, validates API limit ranges, and adds focused regression coverage for both paths. Constraint: PR branch must remain contributor-branch compatible and avoid adding dependencies Rejected: Keep static as the default provider | breaks existing --enable_ip_proxy defaults with an invalid placeholder URL Rejected: Accept arbitrary integer limits | lets API callers request negative or excessive crawl sizes Confidence: high Scope-risk: narrow Directive: Do not change proxy provider defaults when adding new providers; new providers should be opt-in and covered by provider-specific tests Tested: uv run pytest tests/test_api_limits.py tests/test_static_proxy_provider.py Tested: uv run pytest tests Tested: uv run pytest test/test_utils.py Tested: uv run python -m compileall api cmd_arg config proxy tests Tested: git diff --cached --check Not-tested: Live crawler run against external platforms or real proxy vendor endpoints
138 lines
4.6 KiB
Python
138 lines
4.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
import pytest
|
|
import config
|
|
from unittest.mock import AsyncMock, patch
|
|
from fastapi.testclient import TestClient
|
|
from cmd_arg import parse_cmd
|
|
from api.schemas import CrawlerStartRequest, PlatformEnum, LoginTypeEnum, CrawlerTypeEnum
|
|
from api.services.crawler_manager import CrawlerManager
|
|
from api.main import app
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_cmd_arg_crawler_max_notes_count():
|
|
# Store original values
|
|
orig_notes = config.CRAWLER_MAX_NOTES_COUNT
|
|
orig_comments = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
|
|
|
try:
|
|
await parse_cmd([
|
|
"--platform", "xhs",
|
|
"--crawler_max_notes_count", "42",
|
|
"--max_comments_count_singlenotes", "24"
|
|
])
|
|
assert config.CRAWLER_MAX_NOTES_COUNT == 42
|
|
assert config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES == 24
|
|
finally:
|
|
config.CRAWLER_MAX_NOTES_COUNT = orig_notes
|
|
config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = orig_comments
|
|
|
|
def test_crawler_manager_build_command():
|
|
cm = CrawlerManager()
|
|
|
|
# 1. No max limits passed in API request
|
|
req1 = CrawlerStartRequest(
|
|
platform=PlatformEnum.XHS,
|
|
login_type=LoginTypeEnum.QRCODE,
|
|
crawler_type=CrawlerTypeEnum.SEARCH,
|
|
keywords="test",
|
|
max_notes_count=None,
|
|
max_comments_count=None
|
|
)
|
|
cmd1 = cm._build_command(req1)
|
|
# Check that the custom arguments are NOT present
|
|
assert "--crawler_max_notes_count" not in cmd1
|
|
assert "--max_comments_count_singlenotes" not in cmd1
|
|
|
|
# 2. Both limits passed in API request
|
|
req2 = CrawlerStartRequest(
|
|
platform=PlatformEnum.XHS,
|
|
login_type=LoginTypeEnum.QRCODE,
|
|
crawler_type=CrawlerTypeEnum.SEARCH,
|
|
keywords="test",
|
|
max_notes_count=50,
|
|
max_comments_count=5
|
|
)
|
|
cmd2 = cm._build_command(req2)
|
|
# Check that they are correctly added
|
|
assert "--crawler_max_notes_count" in cmd2
|
|
idx_notes = cmd2.index("--crawler_max_notes_count")
|
|
assert cmd2[idx_notes + 1] == "50"
|
|
|
|
assert "--max_comments_count_singlenotes" in cmd2
|
|
idx_comments = cmd2.index("--max_comments_count_singlenotes")
|
|
assert cmd2[idx_comments + 1] == "5"
|
|
|
|
def test_api_start_crawler_with_limits():
|
|
client = TestClient(app)
|
|
|
|
with patch("api.routers.crawler.crawler_manager.start", new_callable=AsyncMock) as mock_start:
|
|
mock_start.return_value = True
|
|
|
|
# Test case 1: with limits
|
|
response = client.post("/api/crawler/start", json={
|
|
"platform": "xhs",
|
|
"login_type": "qrcode",
|
|
"crawler_type": "search",
|
|
"keywords": "test",
|
|
"max_notes_count": 50,
|
|
"max_comments_count": 5
|
|
})
|
|
|
|
assert response.status_code == 200
|
|
assert response.json() == {"status": "ok", "message": "Crawler started successfully"}
|
|
|
|
mock_start.assert_called_once()
|
|
called_request = mock_start.call_args[0][0]
|
|
assert called_request.platform == PlatformEnum.XHS
|
|
assert called_request.max_notes_count == 50
|
|
assert called_request.max_comments_count == 5
|
|
|
|
def test_api_start_crawler_without_limits():
|
|
client = TestClient(app)
|
|
|
|
with patch("api.routers.crawler.crawler_manager.start", new_callable=AsyncMock) as mock_start:
|
|
mock_start.return_value = True
|
|
|
|
# Test case 2: without limits
|
|
response = client.post("/api/crawler/start", json={
|
|
"platform": "xhs",
|
|
"login_type": "qrcode",
|
|
"crawler_type": "search",
|
|
"keywords": "test"
|
|
})
|
|
|
|
assert response.status_code == 200
|
|
mock_start.assert_called_once()
|
|
called_request = mock_start.call_args[0][0]
|
|
assert called_request.platform == PlatformEnum.XHS
|
|
assert called_request.max_notes_count is None
|
|
assert called_request.max_comments_count is None
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
("field_name", "value"),
|
|
[
|
|
("max_notes_count", 0),
|
|
("max_notes_count", -1),
|
|
("max_notes_count", 10001),
|
|
("max_comments_count", 0),
|
|
("max_comments_count", -1),
|
|
("max_comments_count", 10001),
|
|
],
|
|
)
|
|
def test_api_rejects_invalid_limits(field_name, value):
|
|
client = TestClient(app)
|
|
payload = {
|
|
"platform": "xhs",
|
|
"login_type": "qrcode",
|
|
"crawler_type": "search",
|
|
"keywords": "test",
|
|
field_name: value,
|
|
}
|
|
|
|
with patch("api.routers.crawler.crawler_manager.start", new_callable=AsyncMock) as mock_start:
|
|
response = client.post("/api/crawler/start", json=payload)
|
|
|
|
assert response.status_code == 422
|
|
mock_start.assert_not_called()
|