feat(api): add WebUI API server with built frontend

- Add FastAPI server with WebSocket support for real-time logs - Add crawler management API endpoints (start/stop/status) - Add data browsing API endpoints (list files, preview, download) - Include pre-built WebUI assets for serving frontend API endpoints: - POST /api/crawler/start - Start crawler task - POST /api/crawler/stop - Stop crawler task - GET /api/crawler/status - Get crawler status - WS /api/ws/logs - Real-time log streaming - GET /api/data/files - List data files - GET /api/data/stats - Get data statistics 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-06-09 19:37:25 +08:00 · 2025-12-19 00:02:08 +08:00
parent eb66e57f60
commit 508675a251
20 changed files with 1467 additions and 1 deletions
--- a/api/init.py
+++ b/api/init.py
@@ -0,0 +1,19 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/__init__.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 # WebUI API Module for MediaCrawler
--- a/api/main.py
+++ b/api/main.py
@@ -0,0 +1,186 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/main.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 """
 MediaCrawler WebUI API Server
 启动命令: uvicorn api.main:app --port 8080 --reload
 或者: python -m api.main
 """
 import asyncio
 import os
 import subprocess
 import uvicorn
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse
 from .routers import crawler_router, data_router, websocket_router
 # FastAPI application object; routers and static mounts are attached to it below
 app = FastAPI(
    title="MediaCrawler WebUI API",
    description="API for controlling MediaCrawler from WebUI",
    version="1.0.0"
 )
 # Directory of the WebUI static files: the "webui" folder next to this module
 WEBUI_DIR = os.path.join(os.path.dirname(__file__), "webui")
 # CORS configuration - allow the frontend development servers to call the API
 app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://localhost:5173",  # Vite dev server
        "http://localhost:3000",  # Fallback port
        "http://127.0.0.1:5173",
        "http://127.0.0.1:3000",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 # Register the routers; each one is served under the "/api" prefix
 app.include_router(crawler_router, prefix="/api")
 app.include_router(data_router, prefix="/api")
 app.include_router(websocket_router, prefix="/api")
@app.get("/")
async def serve_frontend():
    """Serve the WebUI entry page.

    Returns ``index.html`` from ``WEBUI_DIR`` when that file exists; otherwise
    returns a small JSON payload describing the API and how to build the WebUI.
    """
    index_html = os.path.join(WEBUI_DIR, "index.html")
    if not os.path.exists(index_html):
        # No built frontend available: answer with a hint instead of a page
        return {
            "message": "MediaCrawler WebUI API",
            "version": "1.0.0",
            "docs": "/docs",
            "note": "WebUI not found, please build it first: cd webui && npm run build"
        }
    return FileResponse(index_html)
@app.get("/api/health")
async def health_check():
    """Health probe: always answers with a fixed "ok" status."""
    return dict(status="ok")
@app.get("/api/env/check")
async def check_environment():
    """Check whether the MediaCrawler environment is configured correctly.

    Runs ``uv run main.py --help`` in the project root (the parent directory of
    this package) and reports the outcome.

    Returns:
        A dict with ``success`` and ``message``. On success ``output`` holds the
        first 500 characters of stdout; on failure ``error`` holds a short
        description of what went wrong.
    """
    # Resolve the project root from this file so the check does not depend on
    # the directory the server happened to be started from.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    process = None
    try:
        # Run "uv run main.py --help" to probe the environment
        process = await asyncio.create_subprocess_exec(
            "uv", "run", "main.py", "--help",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=project_root,
        )
        stdout, stderr = await asyncio.wait_for(
            process.communicate(),
            timeout=30.0  # 30 second timeout
        )
        if process.returncode == 0:
            return {
                "success": True,
                "message": "MediaCrawler 环境配置正确",
                "output": stdout.decode("utf-8", errors="ignore")[:500]  # first 500 characters
            }
        error_msg = stderr.decode("utf-8", errors="ignore") or stdout.decode("utf-8", errors="ignore")
        return {
            "success": False,
            "message": "环境检测失败",
            "error": error_msg[:500]
        }
    except asyncio.TimeoutError:
        # wait_for() only cancels communicate(); the child keeps running unless
        # it is killed and reaped explicitly.
        if process is not None and process.returncode is None:
            try:
                process.kill()
                await process.wait()
            except ProcessLookupError:
                # The process exited on its own in the meantime
                pass
        return {
            "success": False,
            "message": "环境检测超时",
            "error": "命令执行超过30秒"
        }
    except FileNotFoundError:
        return {
            "success": False,
            "message": "未找到 uv 命令",
            "error": "请确保已安装 uv 并配置到系统 PATH"
        }
    except Exception as e:
        return {
            "success": False,
            "message": "环境检测出错",
            "error": str(e)
        }
@app.get("/api/config/platforms")
async def get_platforms():
    """Return the supported platforms as value / label / icon entries."""
    # (value, label, icon) for each platform, in display order
    platform_rows = (
        ("xhs", "小红书", "book-open"),
        ("dy", "抖音", "music"),
        ("ks", "快手", "video"),
        ("bili", "哔哩哔哩", "tv"),
        ("wb", "微博", "message-circle"),
        ("tieba", "百度贴吧", "messages-square"),
        ("zhihu", "知乎", "help-circle"),
    )
    return {
        "platforms": [
            {"value": value, "label": label, "icon": icon}
            for value, label, icon in platform_rows
        ]
    }
@app.get("/api/config/options")
async def get_config_options():
    """Return every selectable configuration option (login, crawler and save modes)."""

    def as_options(pairs):
        # Expand (value, label) pairs into the dict shape the response uses
        return [{"value": value, "label": label} for value, label in pairs]

    login_types = as_options([
        ("qrcode", "二维码登录"),
        ("cookie", "Cookie登录"),
    ])
    crawler_types = as_options([
        ("search", "搜索模式"),
        ("detail", "详情模式"),
        ("creator", "创作者模式"),
    ])
    save_options = as_options([
        ("json", "JSON 文件"),
        ("csv", "CSV 文件"),
        ("excel", "Excel 文件"),
        ("sqlite", "SQLite 数据库"),
        ("db", "MySQL 数据库"),
        ("mongodb", "MongoDB 数据库"),
    ])
    return {
        "login_types": login_types,
        "crawler_types": crawler_types,
        "save_options": save_options,
    }
 # Mount static resources - must come after all routes have been registered
 if os.path.exists(WEBUI_DIR):
    assets_dir = os.path.join(WEBUI_DIR, "assets")
    if os.path.exists(assets_dir):
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")
    # Mount the logos directory
    logos_dir = os.path.join(WEBUI_DIR, "logos")
    if os.path.exists(logos_dir):
        app.mount("/logos", StaticFiles(directory=logos_dir), name="logos")
    # Mount the remaining static files (e.g. vite.svg)
    app.mount("/static", StaticFiles(directory=WEBUI_DIR), name="webui-static")
 # Start the server on port 8080 when this module is run as a script
 if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)
--- a/api/routers/init.py
+++ b/api/routers/init.py
@@ -0,0 +1,23 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/__init__.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 from .crawler import router as crawler_router
 from .data import router as data_router
 from .websocket import router as websocket_router
 __all__ = ["crawler_router", "data_router", "websocket_router"]
--- a/api/routers/crawler.py
+++ b/api/routers/crawler.py
@@ -0,0 +1,63 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/crawler.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 from fastapi import APIRouter, HTTPException
 from ..schemas import CrawlerStartRequest, CrawlerStatusResponse
 from ..services import crawler_manager
 # Router for the crawler control endpoints; route paths below are relative to "/crawler"
 router = APIRouter(prefix="/crawler", tags=["crawler"])
@router.post("/start")
async def start_crawler(request: CrawlerStartRequest):
    """Start a crawler task.

    Raises:
        HTTPException: 400 when a crawler process is still running, 500 when the
            manager reported a failure for any other reason.
    """
    started = await crawler_manager.start(request)
    if started:
        return {"status": "ok", "message": "Crawler started successfully"}
    # Concurrent/duplicate request: a live process means 400 rather than 500
    proc = crawler_manager.process
    if proc and proc.poll() is None:
        raise HTTPException(status_code=400, detail="Crawler is already running")
    raise HTTPException(status_code=500, detail="Failed to start crawler")
@router.post("/stop")
async def stop_crawler():
    """Stop the running crawler task.

    Raises:
        HTTPException: 400 when no crawler process is running, 500 when the
            manager reported a failure although a process is still alive.
    """
    stopped = await crawler_manager.stop()
    if stopped:
        return {"status": "ok", "message": "Crawler stopped successfully"}
    # Concurrent/duplicate request: a missing or exited process means 400 rather than 500
    proc = crawler_manager.process
    if not (proc and proc.poll() is None):
        raise HTTPException(status_code=400, detail="No crawler is running")
    raise HTTPException(status_code=500, detail="Failed to stop crawler")
@router.get("/status", response_model=CrawlerStatusResponse)
async def get_crawler_status():
    """Return the status dict reported by the crawler manager."""
    current_status = crawler_manager.get_status()
    return current_status
@router.get("/logs")
async def get_logs(limit: int = 100):
    """Return the most recent log entries.

    Args:
        limit: Number of trailing entries to return; zero or a negative value
            returns every retained entry.
    """
    entries = crawler_manager.logs
    if limit > 0:
        entries = entries[-limit:]
    return {"logs": [entry.model_dump() for entry in entries]}
--- a/api/routers/data.py
+++ b/api/routers/data.py
@@ -0,0 +1,230 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/data.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 import os
 import json
 from pathlib import Path
 from typing import Optional
 from fastapi import APIRouter, HTTPException
 from fastapi.responses import FileResponse
 # Router for the data browsing endpoints; route paths below are relative to "/data"
 router = APIRouter(prefix="/data", tags=["data"])
 # Data directory: the "data" folder under the directory three parents up from this file
 DATA_DIR = Path(__file__).parent.parent.parent / "data"
 def get_file_info(file_path: Path) -> dict:
    """Collect metadata for one data file.

    Args:
        file_path: Path of a file located under ``DATA_DIR``.

    Returns:
        A dict with ``name``, ``path`` (relative to ``DATA_DIR``), ``size``,
        ``modified_at`` (the file's mtime), ``record_count`` (``None`` when it
        cannot be determined) and ``type`` (the extension without its dot, or
        ``"unknown"`` when there is none).

    Raises:
        OSError: If the file cannot be stat'ed.
        ValueError: If ``file_path`` is not located under ``DATA_DIR``.
    """
    stat = file_path.stat()
    record_count = None
    # Compare the extension case-insensitively, as the directory listing does
    suffix = file_path.suffix.lower()
    # Counting records is best-effort: unreadable or malformed files keep None
    try:
        if suffix == ".json":
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, list):
                record_count = len(data)
        elif suffix == ".csv":
            with open(file_path, "r", encoding="utf-8") as f:
                # Subtract the header row, but never report a negative count
                # for a completely empty file
                record_count = max(sum(1 for _ in f) - 1, 0)
    except (OSError, ValueError, RecursionError):
        # I/O failures, decoding errors and invalid or overly nested JSON
        pass
    return {
        "name": file_path.name,
        "path": str(file_path.relative_to(DATA_DIR)),
        "size": stat.st_size,
        "modified_at": stat.st_mtime,
        "record_count": record_count,
        "type": file_path.suffix[1:] if file_path.suffix else "unknown"
    }
@router.get("/files")
async def list_data_files(platform: Optional[str] = None, file_type: Optional[str] = None):
    """List the data files found under ``DATA_DIR``.

    Args:
        platform: When given, keep only files whose relative path contains this
            text (case-insensitive).
        file_type: When given, keep only files with this extension
            (case-insensitive, without the dot).

    Returns:
        ``{"files": [...]}`` with one info dict per file, newest first.
    """
    if not DATA_DIR.exists():
        return {"files": []}
    allowed_suffixes = {".json", ".csv", ".xlsx", ".xls"}
    collected = []
    for current_dir, _subdirs, names in os.walk(DATA_DIR):
        base = Path(current_dir)
        for name in names:
            candidate = base / name
            if candidate.suffix.lower() not in allowed_suffixes:
                continue
            # Platform filter: substring match against the relative path
            if platform and platform.lower() not in str(candidate.relative_to(DATA_DIR)).lower():
                continue
            # Type filter: compare against the extension without its dot
            if file_type and candidate.suffix[1:].lower() != file_type.lower():
                continue
            try:
                collected.append(get_file_info(candidate))
            except Exception:
                # Skip files whose metadata cannot be read
                continue
    # Sort by modification time, newest first
    newest_first = sorted(collected, key=lambda info: info["modified_at"], reverse=True)
    return {"files": newest_first}
@router.get("/files/{file_path:path}")
async def get_file_content(file_path: str, preview: bool = True, limit: int = 100):
    """Return a preview of a data file, or the file itself as a download.

    Args:
        file_path: Path of the file, relative to ``DATA_DIR``.
        preview: When true, parse the file and return up to ``limit`` records;
            when false, return the raw file as a download.
        limit: Maximum number of records in a preview; negative values are
            treated as 0.

    Returns:
        For previews, a dict with ``data`` and ``total`` (plus ``columns`` for
        Excel files); otherwise a ``FileResponse``.

    Raises:
        HTTPException: 403 if the path resolves outside ``DATA_DIR``, 404 if it
            does not exist, 400 if it is not a file, has an unsupported type or
            holds invalid JSON, and 500 for any other failure while reading it.
    """
    full_path = DATA_DIR / file_path
    # Security check first, so the 404/400 answers below cannot be used to
    # probe for files outside DATA_DIR
    try:
        full_path.resolve().relative_to(DATA_DIR.resolve())
    except ValueError:
        raise HTTPException(status_code=403, detail="Access denied")
    if not full_path.exists():
        raise HTTPException(status_code=404, detail="File not found")
    if not full_path.is_file():
        raise HTTPException(status_code=400, detail="Not a file")
    if not preview:
        # Return the file as a download
        return FileResponse(
            path=full_path,
            filename=full_path.name,
            media_type="application/octet-stream"
        )
    # A negative limit would slice from the end; treat it as "no rows"
    limit = max(limit, 0)
    # Match extensions case-insensitively, as the file listing does
    suffix = full_path.suffix.lower()
    try:
        if suffix == ".json":
            with open(full_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, list):
                return {"data": data[:limit], "total": len(data)}
            return {"data": data, "total": 1}
        elif suffix == ".csv":
            import csv
            with open(full_path, "r", encoding="utf-8") as f:
                reader = csv.DictReader(f)
                rows = []
                for i, row in enumerate(reader):
                    if i >= limit:
                        break
                    rows.append(row)
                # Re-read from the start to count lines; subtract the header
                # row without going below zero for an empty file
                f.seek(0)
                total = max(sum(1 for _ in f) - 1, 0)
            return {"data": rows, "total": total}
        elif suffix in (".xlsx", ".xls"):
            import pandas as pd
            # Read the first `limit` rows
            df = pd.read_excel(full_path, nrows=limit)
            # Total row count (read only the first column to save memory)
            df_count = pd.read_excel(full_path, usecols=[0])
            total = len(df_count)
            # Convert to a list of dicts, replacing NaN values with None
            rows = df.where(pd.notnull(df), None).to_dict(orient='records')
            return {
                "data": rows,
                "total": total,
                "columns": list(df.columns)
            }
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type for preview")
    except HTTPException:
        # Keep deliberate HTTP errors (the 400 above) instead of masking them
        # as a 500 in the generic handler below
        raise
    except json.JSONDecodeError:
        raise HTTPException(status_code=400, detail="Invalid JSON file")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/download/{file_path:path}")
async def download_file(file_path: str):
    """Download a data file.

    Args:
        file_path: Path of the file, relative to ``DATA_DIR``.

    Returns:
        A ``FileResponse`` that sends the file as ``application/octet-stream``.

    Raises:
        HTTPException: 403 if the path resolves outside ``DATA_DIR``, 404 if it
            does not exist, 400 if it is not a regular file.
    """
    full_path = DATA_DIR / file_path
    # Security check first, so the 404/400 answers below cannot be used to
    # probe for files outside DATA_DIR
    try:
        full_path.resolve().relative_to(DATA_DIR.resolve())
    except ValueError:
        raise HTTPException(status_code=403, detail="Access denied")
    if not full_path.exists():
        raise HTTPException(status_code=404, detail="File not found")
    if not full_path.is_file():
        raise HTTPException(status_code=400, detail="Not a file")
    return FileResponse(
        path=full_path,
        filename=full_path.name,
        media_type="application/octet-stream"
    )
@router.get("/stats")
async def get_data_stats():
    """Summarise the data files under ``DATA_DIR``.

    Returns:
        A dict with ``total_files``, ``total_size`` (bytes), ``by_platform``
        (file count per platform code found in the relative path) and
        ``by_type`` (file count per lower-cased extension).
    """
    if not DATA_DIR.exists():
        return {"total_files": 0, "total_size": 0, "by_platform": {}, "by_type": {}}
    known_platforms = ("xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu")
    allowed_suffixes = {".json", ".csv", ".xlsx", ".xls"}
    by_platform = {}
    by_type = {}
    stats = {
        "total_files": 0,
        "total_size": 0,
        "by_platform": by_platform,
        "by_type": by_type
    }
    for current_dir, _subdirs, names in os.walk(DATA_DIR):
        base = Path(current_dir)
        for name in names:
            candidate = base / name
            if candidate.suffix.lower() not in allowed_suffixes:
                continue
            try:
                size = candidate.stat().st_size
                stats["total_files"] += 1
                stats["total_size"] += size
                # Count per file type
                kind = candidate.suffix[1:].lower()
                by_type[kind] = by_type.get(kind, 0) + 1
                # Count per platform, inferred from the relative path; only
                # the first matching platform code is counted
                rel_lower = str(candidate.relative_to(DATA_DIR)).lower()
                matched = next((code for code in known_platforms if code in rel_lower), None)
                if matched is not None:
                    by_platform[matched] = by_platform.get(matched, 0) + 1
            except Exception:
                continue
    return stats
--- a/api/routers/websocket.py
+++ b/api/routers/websocket.py
@@ -0,0 +1,151 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/websocket.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 import asyncio
 from typing import Set, Optional
 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 from ..services import crawler_manager
 # Router for the WebSocket endpoints defined in this module (no path prefix of its own)
 router = APIRouter(tags=["websocket"])
 class ConnectionManager:
    """Keeps track of accepted WebSocket connections and broadcasts to them."""

    def __init__(self):
        # Connections that were accepted and have not been removed yet
        self.active_connections: Set[WebSocket] = set()

    async def connect(self, websocket: WebSocket):
        """Accept the connection and start tracking it."""
        await websocket.accept()
        self.active_connections.add(websocket)

    def disconnect(self, websocket: WebSocket):
        """Stop tracking a connection; untracked connections are ignored."""
        self.active_connections.discard(websocket)

    async def broadcast(self, message: dict):
        """Send ``message`` as JSON to every tracked connection.

        Connections whose send raises are removed afterwards.
        """
        failed = []
        # Iterate over a snapshot, since sends are awaited one by one
        for peer in tuple(self.active_connections):
            try:
                await peer.send_json(message)
            except Exception:
                failed.append(peer)
        # Drop the connections that could not be written to
        for peer in failed:
            self.disconnect(peer)
 # Module-level connection manager instance used by the log broadcaster and the /ws/logs endpoint
 manager = ConnectionManager()
 async def log_broadcaster():
    """Background task: take log entries off the queue and broadcast them.

    Runs until it is cancelled. Any other error is printed and followed by a
    short pause before the loop continues.
    """
    log_queue = crawler_manager.get_log_queue()
    while True:
        try:
            # Wait for the next entry, then fan it out to every WebSocket connection
            item = await log_queue.get()
            await manager.broadcast(item.model_dump())
        except asyncio.CancelledError:
            return
        except Exception as e:
            print(f"Log broadcaster error: {e}")
            await asyncio.sleep(0.1)
 # Global broadcaster task handle; stays None until start_broadcaster() creates the task
 _broadcaster_task: Optional[asyncio.Task] = None
 def start_broadcaster():
    """Start the broadcaster task unless one exists that has not finished."""
    global _broadcaster_task
    current = _broadcaster_task
    needs_start = current is None or current.done()
    if needs_start:
        _broadcaster_task = asyncio.create_task(log_broadcaster())
@router.websocket("/ws/logs")
async def websocket_logs(websocket: WebSocket):
    """WebSocket log stream.

    Registers the connection with the connection manager, replays the logs
    retained so far, then keeps the connection alive: a client "ping" is
    answered with "pong", and after 30 seconds without a message a "ping" is
    sent to the client. The connection is unregistered when the handler ends.
    """
    print("[WS] New connection attempt")
    try:
        # Make sure the broadcaster task is running
        start_broadcaster()
        await manager.connect(websocket)
        print(f"[WS] Connected, active connections: {len(manager.active_connections)}")
        # Replay the existing logs
        for existing in crawler_manager.logs:
            try:
                await websocket.send_json(existing.model_dump())
            except Exception as e:
                print(f"[WS] Error sending existing log: {e}")
                break
        print(f"[WS] Sent {len(crawler_manager.logs)} existing logs, entering main loop")
        while True:
            # Keep the connection alive: wait for a heartbeat or any message
            try:
                incoming = await asyncio.wait_for(websocket.receive_text(), timeout=30.0)
                if incoming == "ping":
                    await websocket.send_text("pong")
            except asyncio.TimeoutError:
                # Nothing received in time: send a ping to keep the connection open
                try:
                    await websocket.send_text("ping")
                except Exception as e:
                    print(f"[WS] Error sending ping: {e}")
                    break
    except WebSocketDisconnect:
        print("[WS] Client disconnected")
    except Exception as e:
        print(f"[WS] Error: {type(e).__name__}: {e}")
    finally:
        manager.disconnect(websocket)
        print(f"[WS] Cleanup done, active connections: {len(manager.active_connections)}")
@router.websocket("/ws/status")
async def websocket_status(websocket: WebSocket):
    """WebSocket status stream: sends the crawler status once per second.

    The stream ends silently when the client disconnects or any other error occurs.
    """
    await websocket.accept()
    try:
        while True:
            # Push the current status, then wait one second
            await websocket.send_json(crawler_manager.get_status())
            await asyncio.sleep(1)
    except (WebSocketDisconnect, Exception):
        pass
--- a/api/schemas/init.py
+++ b/api/schemas/init.py
@@ -0,0 +1,37 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/schemas/__init__.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 from .crawler import (
    PlatformEnum,
    LoginTypeEnum,
    CrawlerTypeEnum,
    SaveDataOptionEnum,
    CrawlerStartRequest,
    CrawlerStatusResponse,
    LogEntry,
 )
 __all__ = [
    "PlatformEnum",
    "LoginTypeEnum",
    "CrawlerTypeEnum",
    "SaveDataOptionEnum",
    "CrawlerStartRequest",
    "CrawlerStatusResponse",
    "LogEntry",
 ]
--- a/api/schemas/crawler.py
+++ b/api/schemas/crawler.py
@@ -0,0 +1,98 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/schemas/crawler.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 from enum import Enum
 from typing import Optional, Literal
 from pydantic import BaseModel
 class PlatformEnum(str, Enum):
    """Supported media platforms."""
    XHS = "xhs"  # Xiaohongshu
    DOUYIN = "dy"  # Douyin
    KUAISHOU = "ks"  # Kuaishou
    BILIBILI = "bili"  # Bilibili
    WEIBO = "wb"  # Weibo
    TIEBA = "tieba"  # Baidu Tieba
    ZHIHU = "zhihu"  # Zhihu
 class LoginTypeEnum(str, Enum):
    """Login methods."""
    QRCODE = "qrcode"  # QR code login
    PHONE = "phone"  # NOTE(review): presumably phone-number login - confirm
    COOKIE = "cookie"  # Cookie login
 class CrawlerTypeEnum(str, Enum):
    """Crawler types."""
    SEARCH = "search"  # Search mode
    DETAIL = "detail"  # Detail mode
    CREATOR = "creator"  # Creator mode
 class SaveDataOptionEnum(str, Enum):
    """Ways of saving the crawled data."""
    CSV = "csv"  # CSV file
    DB = "db"  # MySQL database
    JSON = "json"  # JSON file
    SQLITE = "sqlite"  # SQLite database
    MONGODB = "mongodb"  # MongoDB database
    EXCEL = "excel"  # Excel file
 class CrawlerStartRequest(BaseModel):
    """Request body for starting a crawler task."""
    platform: PlatformEnum
    login_type: LoginTypeEnum = LoginTypeEnum.QRCODE
    crawler_type: CrawlerTypeEnum = CrawlerTypeEnum.SEARCH
    keywords: str = ""  # Keywords used in search mode
    specified_ids: str = ""  # Comma-separated post/video IDs used in detail mode
    creator_ids: str = ""  # Comma-separated creator IDs used in creator mode
    start_page: int = 1
    enable_comments: bool = True
    enable_sub_comments: bool = False
    save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSON
    cookies: str = ""
    headless: bool = False
 class CrawlerStatusResponse(BaseModel):
    """Response describing the crawler's status."""
    # One of the four lifecycle states
    status: Literal["idle", "running", "stopping", "error"]
    platform: Optional[str] = None
    crawler_type: Optional[str] = None
    started_at: Optional[str] = None
    error_message: Optional[str] = None
 class LogEntry(BaseModel):
    """A single log entry."""
    id: int
    timestamp: str
    # Severity; restricted to these five values
    level: Literal["info", "warning", "error", "success", "debug"]
    message: str
 class DataFileInfo(BaseModel):
    """Information about a data file."""
    name: str
    path: str
    size: int
    # NOTE(review): declared as str; confirm what format producers of this model use
    modified_at: str
    record_count: Optional[int] = None
--- a/api/services/init.py
+++ b/api/services/init.py
@@ -0,0 +1,21 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/services/__init__.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 from .crawler_manager import CrawlerManager, crawler_manager
 __all__ = ["CrawlerManager", "crawler_manager"]
--- a/api/services/crawler_manager.py
+++ b/api/services/crawler_manager.py
@@ -0,0 +1,281 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
 # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/services/crawler_manager.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 import asyncio
 import subprocess
 import signal
 import os
 from typing import Optional, List
 from datetime import datetime
 from pathlib import Path
 from ..schemas import CrawlerStartRequest, LogEntry
 class CrawlerManager:
    """爬虫进程管理器"""
    def __init__(self):
        self._lock = asyncio.Lock()
        self.process: Optional[subprocess.Popen] = None
        self.status = "idle"
        self.started_at: Optional[datetime] = None
        self.current_config: Optional[CrawlerStartRequest] = None
        self._log_id = 0
        self._logs: List[LogEntry] = []
        self._read_task: Optional[asyncio.Task] = None
        # 项目根目录
        self._project_root = Path(__file__).parent.parent.parent
        # 日志队列 - 用于向 WebSocket 推送
        self._log_queue: Optional[asyncio.Queue] = None
    @property
    def logs(self) -> List[LogEntry]:
        """Retained log entries, oldest first (the internal list itself, not a copy)."""
        return self._logs
    def get_log_queue(self) -> asyncio.Queue:
        """获取或创建日志队列"""
        if self._log_queue is None:
            self._log_queue = asyncio.Queue()
        return self._log_queue
    def _create_log_entry(self, message: str, level: str = "info") -> LogEntry:
        """Create a log entry, store it and return it.

        The entry gets the next sequential id and the current time formatted
        as HH:MM:SS. Only the 500 most recent entries are retained.
        """
        self._log_id += 1
        stamp = datetime.now().strftime("%H:%M:%S")
        entry = LogEntry(id=self._log_id, timestamp=stamp, level=level, message=message)
        self._logs.append(entry)
        # Keep the most recent 500 entries
        overflow = len(self._logs) > 500
        if overflow:
            self._logs = self._logs[-500:]
        return entry
    async def _push_log(self, entry: LogEntry):
        """推送日志到队列"""
        if self._log_queue is not None:
            try:
                self._log_queue.put_nowait(entry)
            except asyncio.QueueFull:
                pass
    def _parse_log_level(self, line: str) -> str:
        """解析日志级别"""
        line_upper = line.upper()
        if "ERROR" in line_upper or "FAILED" in line_upper:
            return "error"
        elif "WARNING" in line_upper or "WARN" in line_upper:
            return "warning"
        elif "SUCCESS" in line_upper or "完成" in line or "成功" in line:
            return "success"
        elif "DEBUG" in line_upper:
            return "debug"
        return "info"
    async def start(self, config: CrawlerStartRequest) -> bool:
        """Start the crawler subprocess.

        Args:
            config: Options that are turned into the command line.

        Returns:
            True when the process was started; False when a process is still
            running or when starting it raised an exception.
        """
        async with self._lock:
            still_running = self.process and self.process.poll() is None
            if still_running:
                return False
            # Drop the logs of the previous run
            self._logs = []
            self._log_id = 0
            # Empty the pending queue in place (do not replace the object, so the
            # WebSocket broadcast coroutine does not keep a stale queue reference)
            if self._log_queue is None:
                self._log_queue = asyncio.Queue()
            else:
                while not self._log_queue.empty():
                    self._log_queue.get_nowait()
            # Build the command line
            cmd = self._build_command(config)
            # Record the start in the log
            await self._push_log(
                self._create_log_entry(f"Starting crawler: {' '.join(cmd)}", "info")
            )
            try:
                # Launch the subprocess with unbuffered Python output
                child_env = dict(os.environ)
                child_env["PYTHONUNBUFFERED"] = "1"
                self.process = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True,
                    bufsize=1,
                    cwd=str(self._project_root),
                    env=child_env
                )
                self.status = "running"
                self.started_at = datetime.now()
                self.current_config = config
                started_entry = self._create_log_entry(
                    f"Crawler started on platform: {config.platform.value}, type: {config.crawler_type.value}",
                    "success"
                )
                await self._push_log(started_entry)
                # Start the task that reads the process output
                self._read_task = asyncio.create_task(self._read_output())
                return True
            except Exception as e:
                self.status = "error"
                failure_entry = self._create_log_entry(f"Failed to start crawler: {str(e)}", "error")
                await self._push_log(failure_entry)
                return False
    async def stop(self) -> bool:
        """Stop the crawler subprocess.

        Sends SIGTERM, waits up to 15 seconds for the process to exit and kills
        it if it is still alive afterwards.

        Returns:
            False when no process is running; True otherwise, including when
            an error occurred while stopping (the error is logged).
        """
        async with self._lock:
            if not self.process or self.process.poll() is not None:
                return False
            self.status = "stopping"
            await self._push_log(
                self._create_log_entry("Sending SIGTERM to crawler process...", "warning")
            )
            try:
                self.process.send_signal(signal.SIGTERM)
                # Wait for a graceful exit: at most 30 checks, 0.5 seconds apart
                checks = 0
                while checks < 30 and self.process.poll() is None:
                    await asyncio.sleep(0.5)
                    checks += 1
                # Still alive: force kill
                if self.process.poll() is None:
                    await self._push_log(
                        self._create_log_entry("Process not responding, sending SIGKILL...", "warning")
                    )
                    self.process.kill()
                await self._push_log(self._create_log_entry("Crawler process terminated", "info"))
            except Exception as e:
                await self._push_log(
                    self._create_log_entry(f"Error stopping crawler: {str(e)}", "error")
                )
            self.status = "idle"
            self.current_config = None
            # Cancel the task that reads the process output
            reader = self._read_task
            if reader:
                reader.cancel()
                self._read_task = None
            return True
    def get_status(self) -> dict:
        """Return a snapshot of the manager's current state.

        ``platform`` and ``crawler_type`` come from the active config and are
        None when no config is set. ``started_at`` is the ISO-formatted start
        time, or None when no start time is recorded. ``error_message`` is
        always None.
        """
        active = self.current_config
        began = self.started_at

        platform = crawler_type = None
        if active:
            platform = active.platform.value
            crawler_type = active.crawler_type.value

        snapshot = {"status": self.status}
        snapshot["platform"] = platform
        snapshot["crawler_type"] = crawler_type
        snapshot["started_at"] = began.isoformat() if began else None
        snapshot["error_message"] = None
        return snapshot
    def _build_command(self, config: CrawlerStartRequest) -> list:
        """构建 main.py 命令行参数"""
        cmd = ["python", "main.py"]
        cmd.extend(["--platform", config.platform.value])
        cmd.extend(["--lt", config.login_type.value])
        cmd.extend(["--type", config.crawler_type.value])
        cmd.extend(["--save_data_option", config.save_option.value])
        # 根据爬虫类型传递不同的参数
        if config.crawler_type.value == "search" and config.keywords:
            cmd.extend(["--keywords", config.keywords])
        elif config.crawler_type.value == "detail" and config.specified_ids:
            cmd.extend(["--specified_id", config.specified_ids])
        elif config.crawler_type.value == "creator" and config.creator_ids:
            cmd.extend(["--creator_id", config.creator_ids])
        if config.start_page != 1:
            cmd.extend(["--start", str(config.start_page)])
        cmd.extend(["--get_comment", "true" if config.enable_comments else "false"])
        cmd.extend(["--get_sub_comment", "true" if config.enable_sub_comments else "false"])
        if config.cookies:
            cmd.extend(["--cookies", config.cookies])
        cmd.extend(["--headless", "true" if config.headless else "false"])
        return cmd
    async def _read_output(self):
        """异步读取进程输出"""
        loop = asyncio.get_event_loop()
        try:
            while self.process and self.process.poll() is None:
                # 在线程池中读取一行
                line = await loop.run_in_executor(
                    None, self.process.stdout.readline
                )
                if line:
                    line = line.strip()
                    if line:
                        level = self._parse_log_level(line)
                        entry = self._create_log_entry(line, level)
                        await self._push_log(entry)
            # 读取剩余输出
            if self.process and self.process.stdout:
                remaining = await loop.run_in_executor(
                    None, self.process.stdout.read
                )
                if remaining:
                    for line in remaining.strip().split('\n'):
                        if line.strip():
                            level = self._parse_log_level(line)
                            entry = self._create_log_entry(line.strip(), level)
                            await self._push_log(entry)
            # 进程结束
            if self.status == "running":
                exit_code = self.process.returncode if self.process else -1
                if exit_code == 0:
                    entry = self._create_log_entry("Crawler completed successfully", "success")
                else:
                    entry = self._create_log_entry(f"Crawler exited with code: {exit_code}", "warning")
                await self._push_log(entry)
                self.status = "idle"
        except asyncio.CancelledError:
            pass
        except Exception as e:
            entry = self._create_log_entry(f"Error reading output: {str(e)}", "error")
            await self._push_log(entry)
 # Global singleton instance
 crawler_manager = CrawlerManager()
--- a/api/webui/assets/index-BKWwy9pb.css
+++ b/api/webui/assets/index-BKWwy9pb.css
--- a/api/webui/assets/index-DQPd_23u.js
+++ b/api/webui/assets/index-DQPd_23u.js
--- a/api/webui/index.html
+++ b/api/webui/index.html
@@ -0,0 +1,17 @@
 <!doctype html>
 <html lang="zh-CN">
  <head>
    <!-- Entry page of the pre-built WebUI. The hashed script/stylesheet names below must match the files shipped under /assets. -->
    <meta charset="UTF-8" />
    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>MediaCrawler - Command Center</title>
    <!-- Web fonts (Inter, JetBrains Mono) are loaded from Google Fonts, so they require network access to fonts.googleapis.com / fonts.gstatic.com. -->
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
    <script type="module" crossorigin src="/assets/index-DQPd_23u.js"></script>
    <link rel="stylesheet" crossorigin href="/assets/index-BKWwy9pb.css">
  </head>
  <body>
    <!-- NOTE(review): presumably the mount point used by the bundled front-end script; confirm against the bundle. -->
    <div id="root"></div>
  </body>
 </html>
--- a/api/webui/logos/bilibili_logo.png
+++ b/api/webui/logos/bilibili_logo.png
--- a/api/webui/logos/douyin.png
+++ b/api/webui/logos/douyin.png
--- a/api/webui/logos/github.png
+++ b/api/webui/logos/github.png
--- a/api/webui/logos/my_logo.png
+++ b/api/webui/logos/my_logo.png
--- a/api/webui/logos/xiaohongshu_logo.png
+++ b/api/webui/logos/xiaohongshu_logo.png
--- a/api/webui/vite.svg
+++ b/api/webui/vite.svg
@@ -0,0 +1 @@
 <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><circle cx="50" cy="50" r="40" fill="#de283b"/></svg>
		`@@ -0,0 +1 @@`
							`<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><circle cx="50" cy="50" r="40" fill="#de283b"/></svg>`