diff --git a/api/schemas/crawler.py b/api/schemas/crawler.py index f31ef82..283cf2d 100644 --- a/api/schemas/crawler.py +++ b/api/schemas/crawler.py @@ -71,6 +71,8 @@ class CrawlerStartRequest(BaseModel): save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSONL cookies: str = "" headless: bool = False + max_notes_count: Optional[int] = None + max_comments_count: Optional[int] = None class CrawlerStatusResponse(BaseModel): diff --git a/api/services/crawler_manager.py b/api/services/crawler_manager.py index f0fb228..9af954b 100644 --- a/api/services/crawler_manager.py +++ b/api/services/crawler_manager.py @@ -225,6 +225,12 @@ class CrawlerManager: cmd.extend(["--get_comment", "true" if config.enable_comments else "false"]) cmd.extend(["--get_sub_comment", "true" if config.enable_sub_comments else "false"]) + if config.max_notes_count is not None: + cmd.extend(["--crawler_max_notes_count", str(config.max_notes_count)]) + + if config.max_comments_count is not None: + cmd.extend(["--max_comments_count_singlenotes", str(config.max_comments_count)]) + if config.cookies: cmd.extend(["--cookies", config.cookies]) diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 86199db..b18b09e 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -275,6 +275,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): rich_help_panel="Comment Configuration", ), ] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES, + crawler_max_notes_count: Annotated[ + int, + typer.Option( + "--crawler_max_notes_count", + help="Maximum number of videos/posts to crawl", + rich_help_panel="Basic Configuration", + ), + ] = config.CRAWLER_MAX_NOTES_COUNT, max_concurrency_num: Annotated[ int, typer.Option( @@ -342,6 +350,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): config.SAVE_DATA_OPTION = save_data_option.value config.COOKIES = cookies config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes + config.CRAWLER_MAX_NOTES_COUNT = crawler_max_notes_count config.MAX_CONCURRENCY_NUM = max_concurrency_num config.SAVE_DATA_PATH = save_data_path config.ENABLE_IP_PROXY = enable_ip_proxy_value diff --git a/tests/test_api_limits.py b/tests/test_api_limits.py new file mode 100644 index 0000000..3154684 --- /dev/null +++ b/tests/test_api_limits.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +import pytest +import config +from unittest.mock import AsyncMock, patch +from fastapi.testclient import TestClient +from cmd_arg import parse_cmd +from api.schemas import CrawlerStartRequest, PlatformEnum, LoginTypeEnum, CrawlerTypeEnum +from api.services.crawler_manager import CrawlerManager +from api.main import app + +@pytest.mark.asyncio +async def test_cmd_arg_crawler_max_notes_count(): + # Store original values + orig_notes = config.CRAWLER_MAX_NOTES_COUNT + orig_comments = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES + + try: + await parse_cmd([ + "--platform", "xhs", + "--crawler_max_notes_count", "42", + "--max_comments_count_singlenotes", "24" + ]) + assert config.CRAWLER_MAX_NOTES_COUNT == 42 + assert config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES == 24 + finally: + config.CRAWLER_MAX_NOTES_COUNT = orig_notes + config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = orig_comments + +def test_crawler_manager_build_command(): + cm = CrawlerManager() + + # 1. No max limits passed in API request + req1 = CrawlerStartRequest( + platform=PlatformEnum.XHS, + login_type=LoginTypeEnum.QRCODE, + crawler_type=CrawlerTypeEnum.SEARCH, + keywords="test", + max_notes_count=None, + max_comments_count=None + ) + cmd1 = cm._build_command(req1) + # Check that the custom arguments are NOT present + assert "--crawler_max_notes_count" not in cmd1 + assert "--max_comments_count_singlenotes" not in cmd1 + + # 2. Both limits passed in API request + req2 = CrawlerStartRequest( + platform=PlatformEnum.XHS, + login_type=LoginTypeEnum.QRCODE, + crawler_type=CrawlerTypeEnum.SEARCH, + keywords="test", + max_notes_count=50, + max_comments_count=5 + ) + cmd2 = cm._build_command(req2) + # Check that they are correctly added + assert "--crawler_max_notes_count" in cmd2 + idx_notes = cmd2.index("--crawler_max_notes_count") + assert cmd2[idx_notes + 1] == "50" + + assert "--max_comments_count_singlenotes" in cmd2 + idx_comments = cmd2.index("--max_comments_count_singlenotes") + assert cmd2[idx_comments + 1] == "5" + +def test_api_start_crawler_with_limits(): + client = TestClient(app) + + with patch("api.routers.crawler.crawler_manager.start", new_callable=AsyncMock) as mock_start: + mock_start.return_value = True + + # Test case 1: with limits + response = client.post("/api/crawler/start", json={ + "platform": "xhs", + "login_type": "qrcode", + "crawler_type": "search", + "keywords": "test", + "max_notes_count": 50, + "max_comments_count": 5 + }) + + assert response.status_code == 200 + assert response.json() == {"status": "ok", "message": "Crawler started successfully"} + + mock_start.assert_called_once() + called_request = mock_start.call_args[0][0] + assert called_request.platform == PlatformEnum.XHS + assert called_request.max_notes_count == 50 + assert called_request.max_comments_count == 5 + +def test_api_start_crawler_without_limits(): + client = TestClient(app) + + with patch("api.routers.crawler.crawler_manager.start", new_callable=AsyncMock) as mock_start: + mock_start.return_value = True + + # Test case 2: without limits + response = client.post("/api/crawler/start", json={ + "platform": "xhs", + "login_type": "qrcode", + "crawler_type": "search", + "keywords": "test" + }) + + assert response.status_code == 200 + mock_start.assert_called_once() + called_request = mock_start.call_args[0][0] + assert called_request.platform == PlatformEnum.XHS + assert called_request.max_notes_count is None + assert called_request.max_comments_count is None