Keep PR 900 overrides bounded and opt-in

The PR adds API limit overrides and static proxy support, but the review found two problems: the default proxy provider had been changed to `static` with an invalid placeholder URL, and the new API limit fields accepted unbounded values. This commit keeps the existing proxy default intact, makes the static proxy explicit (opt-in via config or CLI), validates the API limit ranges (1 to 10000), and adds focused regression coverage for both paths.

Constraint: PR branch must remain contributor-branch compatible and avoid adding dependencies

Rejected: Keep static as the default provider | breaks existing --enable_ip_proxy defaults with an invalid placeholder URL

Rejected: Accept arbitrary integer limits | lets API callers request negative or excessive crawl sizes

Confidence: high

Scope-risk: narrow

Directive: Do not change proxy provider defaults when adding new providers; new providers should be opt-in and covered by provider-specific tests

Tested: uv run pytest tests/test_api_limits.py tests/test_static_proxy_provider.py

Tested: uv run pytest tests

Tested: uv run pytest test/test_utils.py

Tested: uv run python -m compileall api cmd_arg config proxy tests

Tested: git diff --cached --check

Not-tested: Live crawler run against external platforms or real proxy vendor endpoints
This commit is contained in:
程序员阿江(Relakkes)
2026-05-29 21:27:52 +08:00
parent f997befce9
commit 8e93438fe5
7 changed files with 126 additions and 34 deletions

View File

@@ -18,7 +18,10 @@
from enum import Enum from enum import Enum
from typing import Optional, Literal from typing import Optional, Literal
from pydantic import BaseModel from pydantic import BaseModel, Field
MAX_API_LIMIT_COUNT = 10000
class PlatformEnum(str, Enum): class PlatformEnum(str, Enum):
@@ -71,8 +74,8 @@ class CrawlerStartRequest(BaseModel):
save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSONL save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSONL
cookies: str = "" cookies: str = ""
headless: bool = False headless: bool = False
max_notes_count: Optional[int] = None max_notes_count: Optional[int] = Field(default=None, ge=1, le=MAX_API_LIMIT_COUNT)
max_comments_count: Optional[int] = None max_comments_count: Optional[int] = Field(default=None, ge=1, le=MAX_API_LIMIT_COUNT)
class CrawlerStatusResponse(BaseModel): class CrawlerStatusResponse(BaseModel):

View File

@@ -320,10 +320,18 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
str, str,
typer.Option( typer.Option(
"--ip_proxy_provider_name", "--ip_proxy_provider_name",
help="IP proxy provider name (kuaidaili | wandouhttp)", help="IP proxy provider name (kuaidaili | wandouhttp | static)",
rich_help_panel="Proxy Configuration", rich_help_panel="Proxy Configuration",
), ),
] = config.IP_PROXY_PROVIDER_NAME, ] = config.IP_PROXY_PROVIDER_NAME,
static_proxy_url: Annotated[
str,
typer.Option(
"--static_proxy_url",
help="Static proxy URL, for example http://user:password@host:port",
rich_help_panel="Proxy Configuration",
),
] = config.STATIC_PROXY_URL,
) -> SimpleNamespace: ) -> SimpleNamespace:
"""MediaCrawler 命令行入口""" """MediaCrawler 命令行入口"""
@@ -356,6 +364,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
config.ENABLE_IP_PROXY = enable_ip_proxy_value config.ENABLE_IP_PROXY = enable_ip_proxy_value
config.IP_PROXY_POOL_COUNT = ip_proxy_pool_count config.IP_PROXY_POOL_COUNT = ip_proxy_pool_count
config.IP_PROXY_PROVIDER_NAME = ip_proxy_provider_name config.IP_PROXY_PROVIDER_NAME = ip_proxy_provider_name
config.STATIC_PROXY_URL = static_proxy_url
# Set platform-specific ID lists for detail/creator mode # Set platform-specific ID lists for detail/creator mode
if specified_id_list: if specified_id_list:

View File

@@ -34,14 +34,14 @@ CRAWLER_TYPE = (
ENABLE_IP_PROXY = False ENABLE_IP_PROXY = False
# Number of proxy IP pools # Number of proxy IP pools
IP_PROXY_POOL_COUNT = 1 IP_PROXY_POOL_COUNT = 2
# Proxy IP provider name # Proxy IP provider name
IP_PROXY_PROVIDER_NAME = "static" # kuaidaili | wandouhttp | static IP_PROXY_PROVIDER_NAME = "kuaidaili" # kuaidaili | wandouhttp | static
# Static proxy configuration (used when IP_PROXY_PROVIDER_NAME is set to "static") # Static proxy configuration (used when IP_PROXY_PROVIDER_NAME is set to "static")
# Format: "http://your_home_domain:port" or "http://user:password@your_home_domain:port" # Format: "http://your_home_domain:port" or "http://user:password@your_home_domain:port"
STATIC_PROXY_URL = "http://your_home_domain:port" STATIC_PROXY_URL = ""
# Setting to True will not open the browser (headless browser) # Setting to True will not open the browser (headless browser)
# Setting False will open a browser # Setting False will open a browser

View File

@@ -22,7 +22,9 @@
# @Time : 2023/12/2 13:45 # @Time : 2023/12/2 13:45
# @Desc : IP proxy pool implementation # @Desc : IP proxy pool implementation
import random import random
import time
from typing import Dict, List from typing import Dict, List
from urllib.parse import unquote, urlparse
import httpx import httpx
from tenacity import retry, stop_after_attempt, wait_fixed from tenacity import retry, stop_after_attempt, wait_fixed
@@ -152,9 +154,6 @@ class ProxyIpPool:
class StaticProxyProvider(ProxyProvider): class StaticProxyProvider(ProxyProvider):
async def get_proxy(self, num: int) -> List[IpInfoModel]: async def get_proxy(self, num: int) -> List[IpInfoModel]:
from urllib.parse import urlparse
import time
proxy_url = getattr(config, "STATIC_PROXY_URL", "") proxy_url = getattr(config, "STATIC_PROXY_URL", "")
if not proxy_url: if not proxy_url:
utils.logger.warning("[StaticProxyProvider] STATIC_PROXY_URL is not configured!") utils.logger.warning("[StaticProxyProvider] STATIC_PROXY_URL is not configured!")
@@ -162,23 +161,26 @@ class StaticProxyProvider(ProxyProvider):
try: try:
parsed = urlparse(proxy_url) parsed = urlparse(proxy_url)
ip = parsed.hostname or "" scheme = parsed.scheme or "http"
port = parsed.port or 80 if scheme not in {"http", "https"}:
user = parsed.username or "" utils.logger.error(f"[StaticProxyProvider] Unsupported proxy scheme: {scheme}")
password = parsed.password or "" return []
protocol = parsed.scheme + "://" if parsed.scheme else "http://"
# Static proxy doesn't expire ip = parsed.hostname or ""
expired_time_ts = int(time.time()) + 99999999 port = parsed.port or (443 if scheme == "https" else 80)
if not ip:
utils.logger.error("[StaticProxyProvider] STATIC_PROXY_URL host is empty!")
return []
return [ return [
IpInfoModel( IpInfoModel(
ip=ip, ip=ip,
port=port, port=port,
user=user, user=unquote(parsed.username or ""),
password=password, password=unquote(parsed.password or ""),
protocol=protocol, protocol=f"{scheme}://",
expired_time_ts=expired_time_ts # Static proxy doesn't expire.
expired_time_ts=int(time.time()) + 99999999,
) )
] ]
except Exception as e: except Exception as e:
@@ -189,7 +191,7 @@ class StaticProxyProvider(ProxyProvider):
IpProxyProvider: Dict[str, ProxyProvider] = { IpProxyProvider: Dict[str, ProxyProvider] = {
ProviderNameEnum.KUAI_DAILI_PROVIDER.value: new_kuai_daili_proxy(), ProviderNameEnum.KUAI_DAILI_PROVIDER.value: new_kuai_daili_proxy(),
ProviderNameEnum.WANDOU_HTTP_PROVIDER.value: new_wandou_http_proxy(), ProviderNameEnum.WANDOU_HTTP_PROVIDER.value: new_wandou_http_proxy(),
"static": StaticProxyProvider(), ProviderNameEnum.STATIC_PROVIDER.value: StaticProxyProvider(),
} }
@@ -200,7 +202,7 @@ async def create_ip_pool(ip_pool_count: int, enable_validate_ip: bool) -> ProxyI
:param enable_validate_ip: Whether to enable IP proxy validation :param enable_validate_ip: Whether to enable IP proxy validation
:return: :return:
""" """
is_static = config.IP_PROXY_PROVIDER_NAME == "static" is_static = config.IP_PROXY_PROVIDER_NAME == ProviderNameEnum.STATIC_PROVIDER.value
pool = ProxyIpPool( pool = ProxyIpPool(
ip_pool_count=ip_pool_count, ip_pool_count=ip_pool_count,
enable_validate_ip=False if is_static else enable_validate_ip, enable_validate_ip=False if is_static else enable_validate_ip,

View File

@@ -32,6 +32,7 @@ from pydantic import BaseModel, Field
class ProviderNameEnum(Enum): class ProviderNameEnum(Enum):
KUAI_DAILI_PROVIDER: str = "kuaidaili" KUAI_DAILI_PROVIDER: str = "kuaidaili"
WANDOU_HTTP_PROVIDER: str = "wandouhttp" WANDOU_HTTP_PROVIDER: str = "wandouhttp"
STATIC_PROVIDER: str = "static"
class IpInfoModel(BaseModel): class IpInfoModel(BaseModel):

View File

@@ -107,3 +107,31 @@ def test_api_start_crawler_without_limits():
assert called_request.platform == PlatformEnum.XHS assert called_request.platform == PlatformEnum.XHS
assert called_request.max_notes_count is None assert called_request.max_notes_count is None
assert called_request.max_comments_count is None assert called_request.max_comments_count is None
# Regression test: out-of-range limit overrides must be rejected by request
# validation before the crawler is started.
# Each case sets one limit field to a value outside the accepted range:
# zero, a negative number, or 10001 (one above the 10000 cap).
@pytest.mark.parametrize(
("field_name", "value"),
[
("max_notes_count", 0),
("max_notes_count", -1),
("max_notes_count", 10001),
("max_comments_count", 0),
("max_comments_count", -1),
("max_comments_count", 10001),
],
)
def test_api_rejects_invalid_limits(field_name, value):
client = TestClient(app)
# Otherwise-valid start request; only the parametrized field is invalid.
payload = {
"platform": "xhs",
"login_type": "qrcode",
"crawler_type": "search",
"keywords": "test",
field_name: value,
}
# Replace the crawler manager's start coroutine with a mock so no crawl is launched.
with patch("api.routers.crawler.crawler_manager.start", new_callable=AsyncMock) as mock_start:
response = client.post("/api/crawler/start", json=payload)
# The request must fail with HTTP 422 ...
assert response.status_code == 422
# ... and the crawler start must never have been invoked.
mock_start.assert_not_called()

View File

@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
import pytest
import config
from proxy.proxy_ip_pool import StaticProxyProvider, create_ip_pool
from proxy.types import ProviderNameEnum
def test_default_proxy_provider_remains_existing_provider():
    """Check the proxy defaults: kuaidaili provider, pool count 2, empty static URL."""
    # Checked in the same order as the config declares them.
    expected_defaults = {
        "IP_PROXY_PROVIDER_NAME": ProviderNameEnum.KUAI_DAILI_PROVIDER.value,
        "IP_PROXY_POOL_COUNT": 2,
        "STATIC_PROXY_URL": "",
    }
    for setting_name, expected_value in expected_defaults.items():
        assert getattr(config, setting_name) == expected_value
@pytest.mark.asyncio
async def test_static_proxy_provider_parses_proxy_url(monkeypatch):
    """A static proxy URL with credentials is split into host, port, user, password and protocol."""
    # The password contains "%40", the percent-encoded form of "@".
    static_url = "http://user:p%40ss@example.com:8080"
    monkeypatch.setattr(config, "STATIC_PROXY_URL", static_url)

    provider = StaticProxyProvider()
    returned = await provider.get_proxy(1)

    assert len(returned) == 1
    entry = returned[0]
    assert entry.ip == "example.com"
    assert entry.port == 8080
    assert entry.user == "user"
    # The decoded password is expected, not the raw percent-encoded text.
    assert entry.password == "p@ss"
    assert entry.protocol == "http://"
    assert entry.expired_time_ts is not None
@pytest.mark.asyncio
async def test_static_proxy_provider_rejects_invalid_url(monkeypatch):
    """The placeholder URL, whose port is the literal text "port", yields no proxies."""
    placeholder_url = "http://your_home_domain:port"
    monkeypatch.setattr(config, "STATIC_PROXY_URL", placeholder_url)

    provider = StaticProxyProvider()
    returned = await provider.get_proxy(1)

    assert returned == []
@pytest.mark.asyncio
async def test_static_proxy_pool_disables_validation(monkeypatch):
    """With the static provider selected, the pool skips IP validation and holds one https proxy."""
    overrides = (
        ("IP_PROXY_PROVIDER_NAME", ProviderNameEnum.STATIC_PROVIDER.value),
        ("STATIC_PROXY_URL", "https://example.com:8443"),
    )
    for setting_name, setting_value in overrides:
        monkeypatch.setattr(config, setting_name, setting_value)

    # Validation is requested on purpose; the static provider is expected to override it.
    ip_pool = await create_ip_pool(ip_pool_count=2, enable_validate_ip=True)

    assert ip_pool.enable_validate_ip is False
    assert len(ip_pool.proxy_list) == 1
    first_proxy = ip_pool.proxy_list[0]
    assert first_proxy.protocol == "https://"