feat(api): add WebUI API server with built frontend

- Add FastAPI server with WebSocket support for real-time logs
- Add crawler management API endpoints (start/stop/status)
- Add data browsing API endpoints (list files, preview, download)
- Include pre-built WebUI assets for serving frontend

API endpoints:
- POST /api/crawler/start - Start crawler task
- POST /api/crawler/stop - Stop crawler task
- GET /api/crawler/status - Get crawler status
- WS /api/ws/logs - Real-time log streaming
- GET /api/data/files - List data files
- GET /api/data/stats - Get data statistics

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
程序员阿江(Relakkes)
2025-12-19 00:02:08 +08:00
parent eb66e57f60
commit 508675a251
20 changed files with 1467 additions and 1 deletions

2
.gitignore vendored
View File

@@ -178,4 +178,4 @@ docs/.vitepress/cache
agent_zone
debug_tools
database/*.db
database/*.db

19
api/__init__.py Normal file
View File

@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/__init__.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# WebUI API Module for MediaCrawler

186
api/main.py Normal file
View File

@@ -0,0 +1,186 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/main.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
"""
MediaCrawler WebUI API Server
启动命令: uvicorn api.main:app --port 8080 --reload
或者: python -m api.main
"""
import asyncio
import os
import subprocess
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from .routers import crawler_router, data_router, websocket_router
# Application object; interactive OpenAPI docs are served at /docs.
app = FastAPI(
    title="MediaCrawler WebUI API",
    description="API for controlling MediaCrawler from WebUI",
    version="1.0.0"
)

# Directory holding the pre-built WebUI static assets (api/webui).
WEBUI_DIR = os.path.join(os.path.dirname(__file__), "webui")

# CORS configuration - allow the frontend dev servers to call this API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://localhost:5173",  # Vite dev server
        "http://localhost:3000",  # fallback port
        "http://127.0.0.1:5173",
        "http://127.0.0.1:3000",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register the API routers under the /api prefix.
app.include_router(crawler_router, prefix="/api")
app.include_router(data_router, prefix="/api")
app.include_router(websocket_router, prefix="/api")
@app.get("/")
async def serve_frontend():
    """Serve the built WebUI entry page, or a JSON hint if it is missing."""
    index_path = os.path.join(WEBUI_DIR, "index.html")
    if not os.path.exists(index_path):
        # Frontend has not been built yet - point the caller at the docs.
        return {
            "message": "MediaCrawler WebUI API",
            "version": "1.0.0",
            "docs": "/docs",
            "note": "WebUI not found, please build it first: cd webui && npm run build"
        }
    return FileResponse(index_path)
@app.get("/api/health")
async def health_check():
    """Liveness probe: reports that the API process is up."""
    payload = {"status": "ok"}
    return payload
@app.get("/api/env/check")
async def check_environment():
    """Check that the MediaCrawler environment is configured correctly.

    Runs ``uv run main.py --help`` in the project root; a zero exit code
    means the toolchain and dependencies are usable.

    Returns a dict with ``success`` plus either truncated ``output`` or an
    ``error`` description (never raises to the client).
    """
    process = None
    try:
        process = await asyncio.create_subprocess_exec(
            "uv", "run", "main.py", "--help",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd="."  # project root
        )
        stdout, stderr = await asyncio.wait_for(
            process.communicate(),
            timeout=30.0  # 30-second timeout
        )
        if process.returncode == 0:
            return {
                "success": True,
                "message": "MediaCrawler 环境配置正确",
                "output": stdout.decode("utf-8", errors="ignore")[:500]  # first 500 chars only
            }
        else:
            error_msg = stderr.decode("utf-8", errors="ignore") or stdout.decode("utf-8", errors="ignore")
            return {
                "success": False,
                "message": "环境检测失败",
                "error": error_msg[:500]
            }
    except asyncio.TimeoutError:
        # BUGFIX: wait_for only cancels communicate(); the child process
        # keeps running. Kill it so we do not leak a stray `uv` process.
        if process is not None and process.returncode is None:
            process.kill()
        return {
            "success": False,
            "message": "环境检测超时",
            "error": "命令执行超过30秒"
        }
    except FileNotFoundError:
        return {
            "success": False,
            "message": "未找到 uv 命令",
            "error": "请确保已安装 uv 并配置到系统 PATH"
        }
    except Exception as e:
        return {
            "success": False,
            "message": "环境检测出错",
            "error": str(e)
        }
@app.get("/api/config/platforms")
async def get_platforms():
    """Return the list of supported platforms for the WebUI picker."""
    entries = [
        ("xhs", "小红书", "book-open"),
        ("dy", "抖音", "music"),
        ("ks", "快手", "video"),
        ("bili", "哔哩哔哩", "tv"),
        ("wb", "微博", "message-circle"),
        ("tieba", "百度贴吧", "messages-square"),
        ("zhihu", "知乎", "help-circle"),
    ]
    return {
        "platforms": [
            {"value": value, "label": label, "icon": icon}
            for value, label, icon in entries
        ]
    }
@app.get("/api/config/options")
async def get_config_options():
    """Return every selectable configuration option for the WebUI forms."""
    def as_options(pairs):
        # (value, label) tuples -> option dicts in declaration order.
        return [{"value": value, "label": label} for value, label in pairs]

    return {
        "login_types": as_options([
            ("qrcode", "二维码登录"),
            ("cookie", "Cookie登录"),
        ]),
        "crawler_types": as_options([
            ("search", "搜索模式"),
            ("detail", "详情模式"),
            ("creator", "创作者模式"),
        ]),
        "save_options": as_options([
            ("json", "JSON 文件"),
            ("csv", "CSV 文件"),
            ("excel", "Excel 文件"),
            ("sqlite", "SQLite 数据库"),
            ("db", "MySQL 数据库"),
            ("mongodb", "MongoDB 数据库"),
        ]),
    }
# Mount static assets - must come after all API routes so the catch-all
# static handlers do not shadow the /api endpoints.
if os.path.exists(WEBUI_DIR):
    assets_dir = os.path.join(WEBUI_DIR, "assets")
    if os.path.exists(assets_dir):
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")
    # Mount the logos directory
    logos_dir = os.path.join(WEBUI_DIR, "logos")
    if os.path.exists(logos_dir):
        app.mount("/logos", StaticFiles(directory=logos_dir), name="logos")
    # Mount remaining static files (e.g. vite.svg)
    app.mount("/static", StaticFiles(directory=WEBUI_DIR), name="webui-static")

if __name__ == "__main__":
    # Allow `python -m api.main` as an alternative to the uvicorn CLI.
    uvicorn.run(app, host="0.0.0.0", port=8080)

23
api/routers/__init__.py Normal file
View File

@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/__init__.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from .crawler import router as crawler_router
from .data import router as data_router
from .websocket import router as websocket_router
__all__ = ["crawler_router", "data_router", "websocket_router"]

63
api/routers/crawler.py Normal file
View File

@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/crawler.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from fastapi import APIRouter, HTTPException
from ..schemas import CrawlerStartRequest, CrawlerStatusResponse
from ..services import crawler_manager
# All crawler-control endpoints live under /crawler.
router = APIRouter(prefix="/crawler", tags=["crawler"])


@router.post("/start")
async def start_crawler(request: CrawlerStartRequest):
    """Launch a crawler task; reject the request if one is already active."""
    started = await crawler_manager.start(request)
    if started:
        return {"status": "ok", "message": "Crawler started successfully"}
    # Distinguish duplicate/concurrent start requests from genuine failures:
    # a live process means "already running" (400), anything else is a 500.
    proc = crawler_manager.process
    if proc is not None and proc.poll() is None:
        raise HTTPException(status_code=400, detail="Crawler is already running")
    raise HTTPException(status_code=500, detail="Failed to start crawler")
@router.post("/stop")
async def stop_crawler():
    """Stop the running crawler task; 400 if nothing is running."""
    stopped = await crawler_manager.stop()
    if stopped:
        return {"status": "ok", "message": "Crawler stopped successfully"}
    # Concurrent/duplicate stop requests: if the process is gone or has
    # already exited, report 400 instead of a generic 500.
    proc = crawler_manager.process
    if proc is None or proc.poll() is not None:
        raise HTTPException(status_code=400, detail="No crawler is running")
    raise HTTPException(status_code=500, detail="Failed to stop crawler")
@router.get("/status", response_model=CrawlerStatusResponse)
async def get_crawler_status():
    """Report the crawler manager's current status snapshot."""
    snapshot = crawler_manager.get_status()
    return snapshot
@router.get("/logs")
async def get_logs(limit: int = 100):
    """Return the most recent log entries (all of them when limit <= 0)."""
    if limit > 0:
        selected = crawler_manager.logs[-limit:]
    else:
        selected = crawler_manager.logs
    return {"logs": [entry.model_dump() for entry in selected]}

230
api/routers/data.py Normal file
View File

@@ -0,0 +1,230 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/data.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import os
import json
from pathlib import Path
from typing import Optional
from fastapi import APIRouter, HTTPException
from fastapi.responses import FileResponse
router = APIRouter(prefix="/data", tags=["data"])

# Data directory (<project root>/data) that the crawler writes into.
DATA_DIR = Path(__file__).parent.parent.parent / "data"
def get_file_info(file_path: Path) -> dict:
    """Build a metadata dict (name, path, size, mtime, record count, type).

    ``record_count`` is best-effort: counted for JSON arrays and CSV files,
    left as None for other formats or on any read/parse error.
    """
    stat = file_path.stat()
    record_count = None
    # Compare extensions case-insensitively, consistent with the
    # case-insensitive filtering done by the listing endpoints.
    suffix = file_path.suffix.lower()
    try:
        if suffix == ".json":
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, list):
                record_count = len(data)
        elif suffix == ".csv":
            with open(file_path, "r", encoding="utf-8") as f:
                # Subtract the header row; clamp so an empty file is 0, not -1.
                record_count = max(0, sum(1 for _ in f) - 1)
    except Exception:
        # Counting is informational only - never fail the listing over it.
        pass
    return {
        "name": file_path.name,
        "path": str(file_path.relative_to(DATA_DIR)),
        "size": stat.st_size,
        "modified_at": stat.st_mtime,
        "record_count": record_count,
        "type": file_path.suffix[1:] if file_path.suffix else "unknown"
    }
@router.get("/files")
async def list_data_files(platform: Optional[str] = None, file_type: Optional[str] = None):
    """List crawled data files, optionally filtered by platform and file type."""
    if not DATA_DIR.exists():
        return {"files": []}

    supported_extensions = {".json", ".csv", ".xlsx", ".xls"}
    wanted_platform = platform.lower() if platform else None
    wanted_type = file_type.lower() if file_type else None

    collected = []
    for root, dirs, filenames in os.walk(DATA_DIR):
        base = Path(root)
        for entry in filenames:
            candidate = base / entry
            if candidate.suffix.lower() not in supported_extensions:
                continue
            # Platform filter: substring match on the path below DATA_DIR.
            if wanted_platform is not None:
                relative = str(candidate.relative_to(DATA_DIR))
                if wanted_platform not in relative.lower():
                    continue
            # Type filter: compare extensions without the leading dot.
            if wanted_type is not None and candidate.suffix[1:].lower() != wanted_type:
                continue
            try:
                collected.append(get_file_info(candidate))
            except Exception:
                continue

    # Newest first.
    collected.sort(key=lambda info: info["modified_at"], reverse=True)
    return {"files": collected}
@router.get("/files/{file_path:path}")
async def get_file_content(file_path: str, preview: bool = True, limit: int = 100):
    """Preview (default) or download a data file.

    Preview returns up to ``limit`` records plus the total count for
    JSON / CSV / Excel files; with ``preview=false`` the raw file is served.
    """
    full_path = DATA_DIR / file_path
    # Security check FIRST: reject path traversal (..) before revealing
    # whether anything exists at the resolved location.
    try:
        full_path.resolve().relative_to(DATA_DIR.resolve())
    except ValueError:
        raise HTTPException(status_code=403, detail="Access denied")
    if not full_path.exists():
        raise HTTPException(status_code=404, detail="File not found")
    if not full_path.is_file():
        raise HTTPException(status_code=400, detail="Not a file")
    if not preview:
        # Raw file download.
        return FileResponse(
            path=full_path,
            filename=full_path.name,
            media_type="application/octet-stream"
        )
    # Case-insensitive suffix, consistent with the listing endpoints.
    suffix = full_path.suffix.lower()
    try:
        if suffix == ".json":
            with open(full_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, list):
                return {"data": data[:limit], "total": len(data)}
            return {"data": data, "total": 1}
        elif suffix == ".csv":
            import csv
            with open(full_path, "r", encoding="utf-8") as f:
                reader = csv.DictReader(f)
                rows = []
                for i, row in enumerate(reader):
                    if i >= limit:
                        break
                    rows.append(row)
                # Re-read from the top to count all rows (minus the header).
                f.seek(0)
                total = sum(1 for _ in f) - 1
            return {"data": rows, "total": total}
        elif suffix in (".xlsx", ".xls"):
            import pandas as pd
            # Read only the first `limit` rows for the preview.
            df = pd.read_excel(full_path, nrows=limit)
            # Count total rows cheaply by loading a single column.
            df_count = pd.read_excel(full_path, usecols=[0])
            total = len(df_count)
            # Convert to plain dicts, mapping NaN to None for JSON output.
            rows = df.where(pd.notnull(df), None).to_dict(orient='records')
            return {
                "data": rows,
                "total": total,
                "columns": list(df.columns)
            }
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type for preview")
    except HTTPException:
        # BUGFIX: without this, the 400 "Unsupported file type" raised above
        # fell into `except Exception` and came back as a 500.
        raise
    except json.JSONDecodeError:
        raise HTTPException(status_code=400, detail="Invalid JSON file")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/download/{file_path:path}")
async def download_file(file_path: str):
    """Download a data file as an attachment."""
    full_path = DATA_DIR / file_path
    # Security check FIRST: refuse paths escaping DATA_DIR before leaking
    # whether the target exists.
    try:
        full_path.resolve().relative_to(DATA_DIR.resolve())
    except ValueError:
        raise HTTPException(status_code=403, detail="Access denied")
    if not full_path.exists():
        raise HTTPException(status_code=404, detail="File not found")
    if not full_path.is_file():
        raise HTTPException(status_code=400, detail="Not a file")
    return FileResponse(
        path=full_path,
        filename=full_path.name,
        media_type="application/octet-stream"
    )
@router.get("/stats")
async def get_data_stats():
    """Aggregate file count, total size, and per-platform / per-type counts."""
    if not DATA_DIR.exists():
        return {"total_files": 0, "total_size": 0, "by_platform": {}, "by_type": {}}

    known_platforms = ("xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu")
    supported_extensions = {".json", ".csv", ".xlsx", ".xls"}
    total_files = 0
    total_size = 0
    by_platform = {}
    by_type = {}

    for root, dirs, filenames in os.walk(DATA_DIR):
        base = Path(root)
        for entry in filenames:
            candidate = base / entry
            if candidate.suffix.lower() not in supported_extensions:
                continue
            try:
                info = candidate.stat()
            except Exception:
                continue
            total_files += 1
            total_size += info.st_size
            # Tally by extension (without the dot).
            ext = candidate.suffix[1:].lower()
            by_type[ext] = by_type.get(ext, 0) + 1
            # Tally by platform, inferred from the path below DATA_DIR.
            relative = str(candidate.relative_to(DATA_DIR)).lower()
            for name in known_platforms:
                if name in relative:
                    by_platform[name] = by_platform.get(name, 0) + 1
                    break

    return {
        "total_files": total_files,
        "total_size": total_size,
        "by_platform": by_platform,
        "by_type": by_type,
    }

151
api/routers/websocket.py Normal file
View File

@@ -0,0 +1,151 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/websocket.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio
from typing import Set, Optional
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
from ..services import crawler_manager
router = APIRouter(tags=["websocket"])


class ConnectionManager:
    """Tracks live WebSocket connections and fans messages out to them."""

    def __init__(self):
        self.active_connections: Set[WebSocket] = set()

    async def connect(self, websocket: WebSocket):
        """Accept the handshake and start tracking the connection."""
        await websocket.accept()
        self.active_connections.add(websocket)

    def disconnect(self, websocket: WebSocket):
        """Stop tracking a connection (no-op if it was never tracked)."""
        self.active_connections.discard(websocket)

    async def broadcast(self, message: dict):
        """Send a JSON message to every connection, dropping dead ones."""
        if not self.active_connections:
            return
        dead = []
        for ws in list(self.active_connections):
            try:
                await ws.send_json(message)
            except Exception:
                dead.append(ws)
        # Prune connections whose send failed.
        for ws in dead:
            self.disconnect(ws)


# Module-level singleton shared by the endpoints below.
manager = ConnectionManager()
async def log_broadcaster():
    """Background task: drain the crawler log queue and broadcast entries."""
    queue = crawler_manager.get_log_queue()
    while True:
        try:
            entry = await queue.get()  # blocks until a log entry arrives
            await manager.broadcast(entry.model_dump())
        except asyncio.CancelledError:
            break
        except Exception as e:
            print(f"Log broadcaster error: {e}")
            await asyncio.sleep(0.1)
# Handle to the single global broadcaster task.
_broadcaster_task: Optional[asyncio.Task] = None


def start_broadcaster():
    """Spawn the broadcaster task unless one is already alive."""
    global _broadcaster_task
    task = _broadcaster_task
    if task is not None and not task.done():
        return
    _broadcaster_task = asyncio.create_task(log_broadcaster())
@router.websocket("/ws/logs")
async def websocket_logs(websocket: WebSocket):
    """WebSocket log stream: replay buffered logs, then push new ones live.

    The connection is kept alive with a 30s heartbeat; live entries arrive
    via the broadcaster task feeding the shared ConnectionManager.
    """
    print("[WS] New connection attempt")
    try:
        # Make sure the queue-draining broadcaster task is running.
        start_broadcaster()
        await manager.connect(websocket)
        print(f"[WS] Connected, active connections: {len(manager.active_connections)}")
        # Replay the logs buffered so far to the new client.
        for log in crawler_manager.logs:
            try:
                await websocket.send_json(log.model_dump())
            except Exception as e:
                print(f"[WS] Error sending existing log: {e}")
                break
        print(f"[WS] Sent {len(crawler_manager.logs)} existing logs, entering main loop")
        while True:
            # Keep the connection alive: wait for a heartbeat or any message.
            try:
                data = await asyncio.wait_for(
                    websocket.receive_text(),
                    timeout=30.0
                )
                if data == "ping":
                    await websocket.send_text("pong")
            except asyncio.TimeoutError:
                # No client traffic for 30s - send a ping to keep it open.
                try:
                    await websocket.send_text("ping")
                except Exception as e:
                    print(f"[WS] Error sending ping: {e}")
                    break
    except WebSocketDisconnect:
        print("[WS] Client disconnected")
    except Exception as e:
        print(f"[WS] Error: {type(e).__name__}: {e}")
    finally:
        manager.disconnect(websocket)
        print(f"[WS] Cleanup done, active connections: {len(manager.active_connections)}")
@router.websocket("/ws/status")
async def websocket_status(websocket: WebSocket):
    """WebSocket status stream: push the crawler status once per second."""
    await websocket.accept()
    try:
        while True:
            status = crawler_manager.get_status()
            await websocket.send_json(status)
            await asyncio.sleep(1)
    except WebSocketDisconnect:
        # Normal client hang-up - nothing to do.
        pass
    except Exception as e:
        # Log instead of silently swallowing, matching /ws/logs diagnostics.
        print(f"[WS] Status stream error: {type(e).__name__}: {e}")

37
api/schemas/__init__.py Normal file
View File

@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/schemas/__init__.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from .crawler import (
PlatformEnum,
LoginTypeEnum,
CrawlerTypeEnum,
SaveDataOptionEnum,
CrawlerStartRequest,
CrawlerStatusResponse,
LogEntry,
)
__all__ = [
"PlatformEnum",
"LoginTypeEnum",
"CrawlerTypeEnum",
"SaveDataOptionEnum",
"CrawlerStartRequest",
"CrawlerStatusResponse",
"LogEntry",
]

98
api/schemas/crawler.py Normal file
View File

@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/schemas/crawler.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from enum import Enum
from typing import Optional, Literal
from pydantic import BaseModel
class PlatformEnum(str, Enum):
    """Supported media platforms (values are passed to main.py --platform)."""
    XHS = "xhs"        # Xiaohongshu
    DOUYIN = "dy"      # Douyin
    KUAISHOU = "ks"    # Kuaishou
    BILIBILI = "bili"  # Bilibili
    WEIBO = "wb"       # Weibo
    TIEBA = "tieba"    # Baidu Tieba
    ZHIHU = "zhihu"    # Zhihu
class LoginTypeEnum(str, Enum):
    """Login methods (values are passed to main.py --lt)."""
    QRCODE = "qrcode"  # scan a QR code
    PHONE = "phone"    # phone-number login
    COOKIE = "cookie"  # reuse an existing cookie string
class CrawlerTypeEnum(str, Enum):
    """Crawl modes (values are passed to main.py --type)."""
    SEARCH = "search"    # keyword search
    DETAIL = "detail"    # fetch specific post/video IDs
    CREATOR = "creator"  # fetch a creator's content
class SaveDataOptionEnum(str, Enum):
    """Storage backends (values are passed to main.py --save_data_option)."""
    CSV = "csv"
    DB = "db"  # MySQL (labelled "MySQL 数据库" in the WebUI options)
    JSON = "json"
    SQLITE = "sqlite"
    MONGODB = "mongodb"
    EXCEL = "excel"
class CrawlerStartRequest(BaseModel):
    """Request body for POST /api/crawler/start."""
    platform: PlatformEnum
    login_type: LoginTypeEnum = LoginTypeEnum.QRCODE
    crawler_type: CrawlerTypeEnum = CrawlerTypeEnum.SEARCH
    keywords: str = ""  # search mode: query keywords
    specified_ids: str = ""  # detail mode: post/video IDs, comma-separated
    creator_ids: str = ""  # creator mode: creator IDs, comma-separated
    start_page: int = 1
    enable_comments: bool = True
    enable_sub_comments: bool = False
    save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSON
    cookies: str = ""  # raw cookie string for cookie login
    headless: bool = False  # run the crawler browser headless
class CrawlerStatusResponse(BaseModel):
    """Response body for GET /api/crawler/status."""
    status: Literal["idle", "running", "stopping", "error"]
    platform: Optional[str] = None  # set only while a task is configured
    crawler_type: Optional[str] = None
    started_at: Optional[str] = None  # ISO-8601 (datetime.isoformat())
    error_message: Optional[str] = None
class LogEntry(BaseModel):
    """A single crawler log line pushed over the WebSocket log stream."""
    id: int  # monotonically increasing per crawler run
    timestamp: str  # formatted as HH:MM:SS
    level: Literal["info", "warning", "error", "success", "debug"]
    message: str
class DataFileInfo(BaseModel):
    """Metadata describing one crawled data file."""
    name: str
    path: str  # relative to the data directory
    size: int  # bytes
    # NOTE(review): routers/data.get_file_info fills this field with
    # st_mtime (a float), not a string - confirm which type is intended.
    modified_at: str
    record_count: Optional[int] = None  # None when the count could not be read

21
api/services/__init__.py Normal file
View File

@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/services/__init__.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from .crawler_manager import CrawlerManager, crawler_manager
__all__ = ["CrawlerManager", "crawler_manager"]

View File

@@ -0,0 +1,281 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/services/crawler_manager.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio
import subprocess
import signal
import os
from typing import Optional, List
from datetime import datetime
from pathlib import Path
from ..schemas import CrawlerStartRequest, LogEntry
class CrawlerManager:
    """Manages the crawler subprocess: lifecycle, log capture, and status.

    A single instance (``crawler_manager`` below) is shared by the API
    routers; ``_lock`` serialises concurrent start/stop requests.
    """

    def __init__(self):
        self._lock = asyncio.Lock()
        self.process: Optional[subprocess.Popen] = None
        self.status = "idle"  # idle | running | stopping | error
        self.started_at: Optional[datetime] = None
        self.current_config: Optional[CrawlerStartRequest] = None
        self._log_id = 0
        self._logs: List[LogEntry] = []
        self._read_task: Optional[asyncio.Task] = None
        # Project root directory (two levels above api/services/).
        self._project_root = Path(__file__).parent.parent.parent
        # Log queue - feeds entries to the WebSocket broadcaster.
        self._log_queue: Optional[asyncio.Queue] = None

    @property
    def logs(self) -> List[LogEntry]:
        """In-memory buffer of the most recent log entries (max 500)."""
        return self._logs

    def get_log_queue(self) -> asyncio.Queue:
        """Return the log queue, creating it lazily on first use."""
        if self._log_queue is None:
            self._log_queue = asyncio.Queue()
        return self._log_queue

    def _create_log_entry(self, message: str, level: str = "info") -> LogEntry:
        """Append a new log entry to the buffer and return it."""
        self._log_id += 1
        entry = LogEntry(
            id=self._log_id,
            timestamp=datetime.now().strftime("%H:%M:%S"),
            level=level,
            message=message
        )
        self._logs.append(entry)
        # Keep only the most recent 500 entries.
        if len(self._logs) > 500:
            self._logs = self._logs[-500:]
        return entry

    async def _push_log(self, entry: LogEntry):
        """Push an entry onto the broadcast queue (drop it if the queue is full)."""
        if self._log_queue is not None:
            try:
                self._log_queue.put_nowait(entry)
            except asyncio.QueueFull:
                pass

    def _parse_log_level(self, line: str) -> str:
        """Heuristically classify an output line into a log level."""
        line_upper = line.upper()
        if "ERROR" in line_upper or "FAILED" in line_upper:
            return "error"
        elif "WARNING" in line_upper or "WARN" in line_upper:
            return "warning"
        elif "SUCCESS" in line_upper or "完成" in line or "成功" in line:
            return "success"
        elif "DEBUG" in line_upper:
            return "debug"
        return "info"

    async def start(self, config: CrawlerStartRequest) -> bool:
        """Start the crawler subprocess; returns False if one is already running."""
        async with self._lock:
            if self.process and self.process.poll() is None:
                return False
            # Reset the log buffer for the new run.
            self._logs = []
            self._log_id = 0
            # Drain the pending queue in place (do NOT replace the object:
            # the WebSocket broadcaster coroutine holds a reference to it).
            if self._log_queue is None:
                self._log_queue = asyncio.Queue()
            else:
                try:
                    while True:
                        self._log_queue.get_nowait()
                except asyncio.QueueEmpty:
                    pass
            # Build the command line for main.py.
            cmd = self._build_command(config)
            entry = self._create_log_entry(f"Starting crawler: {' '.join(cmd)}", "info")
            await self._push_log(entry)
            try:
                # Launch the subprocess with merged, line-buffered, unbuffered-
                # Python output so logs stream promptly.
                self.process = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True,
                    bufsize=1,
                    cwd=str(self._project_root),
                    env={**os.environ, "PYTHONUNBUFFERED": "1"}
                )
                self.status = "running"
                self.started_at = datetime.now()
                self.current_config = config
                entry = self._create_log_entry(
                    f"Crawler started on platform: {config.platform.value}, type: {config.crawler_type.value}",
                    "success"
                )
                await self._push_log(entry)
                # Begin streaming the subprocess output into the log buffer.
                self._read_task = asyncio.create_task(self._read_output())
                return True
            except Exception as e:
                self.status = "error"
                entry = self._create_log_entry(f"Failed to start crawler: {str(e)}", "error")
                await self._push_log(entry)
                return False

    async def stop(self) -> bool:
        """Stop the crawler subprocess; returns False if none is running."""
        async with self._lock:
            if not self.process or self.process.poll() is not None:
                return False
            self.status = "stopping"
            entry = self._create_log_entry("Sending SIGTERM to crawler process...", "warning")
            await self._push_log(entry)
            try:
                self.process.send_signal(signal.SIGTERM)
                # Allow up to 15 seconds for a graceful shutdown.
                for _ in range(30):
                    if self.process.poll() is not None:
                        break
                    await asyncio.sleep(0.5)
                # Still alive - force-kill it.
                if self.process.poll() is None:
                    entry = self._create_log_entry("Process not responding, sending SIGKILL...", "warning")
                    await self._push_log(entry)
                    self.process.kill()
                    # BUGFIX: reap the killed child so it does not linger as
                    # a zombie process.
                    self.process.wait()
                entry = self._create_log_entry("Crawler process terminated", "info")
                await self._push_log(entry)
            except Exception as e:
                entry = self._create_log_entry(f"Error stopping crawler: {str(e)}", "error")
                await self._push_log(entry)
            self.status = "idle"
            self.current_config = None
            # Cancel the output-reading task.
            if self._read_task:
                self._read_task.cancel()
                self._read_task = None
            return True

    def get_status(self) -> dict:
        """Return the current status snapshot for the API/WebSocket."""
        return {
            "status": self.status,
            "platform": self.current_config.platform.value if self.current_config else None,
            "crawler_type": self.current_config.crawler_type.value if self.current_config else None,
            "started_at": self.started_at.isoformat() if self.started_at else None,
            "error_message": None
        }

    def _build_command(self, config: CrawlerStartRequest) -> list:
        """Translate a CrawlerStartRequest into main.py CLI arguments."""
        cmd = ["python", "main.py"]
        cmd.extend(["--platform", config.platform.value])
        cmd.extend(["--lt", config.login_type.value])
        cmd.extend(["--type", config.crawler_type.value])
        cmd.extend(["--save_data_option", config.save_option.value])
        # Mode-specific arguments.
        if config.crawler_type.value == "search" and config.keywords:
            cmd.extend(["--keywords", config.keywords])
        elif config.crawler_type.value == "detail" and config.specified_ids:
            cmd.extend(["--specified_id", config.specified_ids])
        elif config.crawler_type.value == "creator" and config.creator_ids:
            cmd.extend(["--creator_id", config.creator_ids])
        if config.start_page != 1:
            cmd.extend(["--start", str(config.start_page)])
        cmd.extend(["--get_comment", "true" if config.enable_comments else "false"])
        cmd.extend(["--get_sub_comment", "true" if config.enable_sub_comments else "false"])
        if config.cookies:
            cmd.extend(["--cookies", config.cookies])
        cmd.extend(["--headless", "true" if config.headless else "false"])
        return cmd

    async def _read_output(self):
        """Stream subprocess stdout into the log buffer without blocking the loop."""
        # BUGFIX: get_running_loop() instead of the deprecated
        # get_event_loop() inside a coroutine.
        loop = asyncio.get_running_loop()
        try:
            while self.process and self.process.poll() is None:
                # readline blocks, so run it in the default thread pool.
                line = await loop.run_in_executor(
                    None, self.process.stdout.readline
                )
                if line:
                    line = line.strip()
                    if line:
                        level = self._parse_log_level(line)
                        entry = self._create_log_entry(line, level)
                        await self._push_log(entry)
            # Drain whatever output is left after the process exits.
            if self.process and self.process.stdout:
                remaining = await loop.run_in_executor(
                    None, self.process.stdout.read
                )
                if remaining:
                    for line in remaining.strip().split('\n'):
                        if line.strip():
                            level = self._parse_log_level(line)
                            entry = self._create_log_entry(line.strip(), level)
                            await self._push_log(entry)
            # Process finished on its own (status was not changed by stop()).
            if self.status == "running":
                exit_code = self.process.returncode if self.process else -1
                if exit_code == 0:
                    entry = self._create_log_entry("Crawler completed successfully", "success")
                else:
                    entry = self._create_log_entry(f"Crawler exited with code: {exit_code}", "warning")
                await self._push_log(entry)
                self.status = "idle"
        except asyncio.CancelledError:
            pass
        except Exception as e:
            entry = self._create_log_entry(f"Error reading output: {str(e)}", "error")
            await self._push_log(entry)


# Global singleton shared across the API.
crawler_manager = CrawlerManager()

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

17
api/webui/index.html Normal file
View File

@@ -0,0 +1,17 @@
<!doctype html>
<!-- Built WebUI entry page (Vite output) - presumably regenerated via
     `cd webui && npm run build`; avoid editing by hand. TODO confirm. -->
<html lang="zh-CN">
<head>
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>MediaCrawler - Command Center</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
<script type="module" crossorigin src="/assets/index-DQPd_23u.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-BKWwy9pb.css">
</head>
<body>
<div id="root"></div>
</body>
</html>

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

BIN
api/webui/logos/douyin.png Normal file
View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

BIN
api/webui/logos/github.png Normal file
View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.8 KiB

BIN
api/webui/logos/my_logo.png Normal file
View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 312 KiB

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.2 KiB

1
api/webui/vite.svg Normal file
View File

@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><circle cx="50" cy="50" r="40" fill="#de283b"/></svg>

After

Width:  |  Height:  |  Size: 116 B