mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-09 19:37:25 +08:00
feat(api): add WebUI API server with built frontend
- Add FastAPI server with WebSocket support for real-time logs
- Add crawler management API endpoints (start/stop/status)
- Add data browsing API endpoints (list files, preview, download)
- Include pre-built WebUI assets for serving frontend

API endpoints:
- POST /api/crawler/start - Start crawler task
- POST /api/crawler/stop - Stop crawler task
- GET /api/crawler/status - Get crawler status
- WS /api/ws/logs - Real-time log streaming
- GET /api/data/files - List data files
- GET /api/data/stats - Get data statistics

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
19
api/__init__.py
Normal file
19
api/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/__init__.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
# WebUI API Module for MediaCrawler
|
||||||
186
api/main.py
Normal file
186
api/main.py
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/main.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
"""
|
||||||
|
MediaCrawler WebUI API Server
|
||||||
|
启动命令: uvicorn api.main:app --port 8080 --reload
|
||||||
|
或者: python -m api.main
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import uvicorn
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
from fastapi.responses import FileResponse
|
||||||
|
|
||||||
|
from .routers import crawler_router, data_router, websocket_router
|
||||||
|
|
||||||
|
# Application object; the metadata is passed straight to FastAPI.
app = FastAPI(
    title="MediaCrawler WebUI API",
    description="API for controlling MediaCrawler from WebUI",
    version="1.0.0",
)

# Directory that holds the built WebUI files, located next to this module.
WEBUI_DIR = os.path.join(os.path.dirname(__file__), "webui")

# CORS: allow the local front-end dev servers to call this API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://localhost:5173",  # Vite dev server
        "http://localhost:3000",  # fallback port
        "http://127.0.0.1:5173",
        "http://127.0.0.1:3000",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register every router under the shared "/api" prefix, in the original order.
for _api_router in (crawler_router, data_router, websocket_router):
    app.include_router(_api_router, prefix="/api")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
async def serve_frontend():
    """Serve the WebUI entry page, or an API banner when it has not been built.

    Returns:
        A ``FileResponse`` for ``index.html`` inside ``WEBUI_DIR`` when that
        path exists; otherwise a dict with the API name, version, docs path
        and a hint on how to build the WebUI.
    """
    index_path = os.path.join(WEBUI_DIR, "index.html")
    if not os.path.exists(index_path):
        # No built front end: answer with a small description of the API.
        return {
            "message": "MediaCrawler WebUI API",
            "version": "1.0.0",
            "docs": "/docs",
            "note": "WebUI not found, please build it first: cd webui && npm run build",
        }
    return FileResponse(index_path)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/health")
async def health_check():
    """Liveness probe; always answers ``{"status": "ok"}``."""
    return dict(status="ok")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/env/check")
async def check_environment():
    """Check whether the MediaCrawler environment can be launched.

    Runs ``uv run main.py --help`` in the project root (the directory that
    contains the ``api`` package) with a 30 second limit.

    Returns:
        dict: always contains ``success`` (bool) and ``message`` (str).
        On success ``output`` holds the first 500 characters of stdout;
        on any failure ``error`` holds a short description. Failures are
        reported in the payload, never raised.
    """
    # Derive the project root from this file so the check does not depend on
    # the working directory the server happened to be started from.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    process = None
    try:
        process = await asyncio.create_subprocess_exec(
            "uv", "run", "main.py", "--help",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=project_root,
        )
        stdout, stderr = await asyncio.wait_for(
            process.communicate(),
            timeout=30.0,  # 30 second limit
        )

        if process.returncode == 0:
            return {
                "success": True,
                "message": "MediaCrawler 环境配置正确",
                "output": stdout.decode("utf-8", errors="ignore")[:500],  # first 500 characters
            }

        error_msg = stderr.decode("utf-8", errors="ignore") or stdout.decode("utf-8", errors="ignore")
        return {
            "success": False,
            "message": "环境检测失败",
            "error": error_msg[:500],
        }
    except asyncio.TimeoutError:
        # wait_for() only cancels communicate(); the child keeps running
        # unless it is killed and reaped here.
        if process is not None and process.returncode is None:
            try:
                process.kill()
            except ProcessLookupError:
                pass  # the child exited between the check and the kill
            await process.wait()
        return {
            "success": False,
            "message": "环境检测超时",
            "error": "命令执行超过30秒",
        }
    except FileNotFoundError:
        return {
            "success": False,
            "message": "未找到 uv 命令",
            "error": "请确保已安装 uv 并配置到系统 PATH",
        }
    except Exception as e:
        return {
            "success": False,
            "message": "环境检测出错",
            "error": str(e),
        }
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/config/platforms")
async def get_platforms():
    """Return the list of supported platforms.

    Returns:
        dict: ``{"platforms": [...]}`` where each item has ``value``,
        ``label`` and ``icon`` keys.
    """
    # (value, label, icon) triples, in display order.
    platform_rows = (
        ("xhs", "小红书", "book-open"),
        ("dy", "抖音", "music"),
        ("ks", "快手", "video"),
        ("bili", "哔哩哔哩", "tv"),
        ("wb", "微博", "message-circle"),
        ("tieba", "百度贴吧", "messages-square"),
        ("zhihu", "知乎", "help-circle"),
    )
    return {
        "platforms": [
            {"value": value, "label": label, "icon": icon}
            for value, label, icon in platform_rows
        ]
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/config/options")
async def get_config_options():
    """Return every selectable configuration option.

    Returns:
        dict: ``login_types``, ``crawler_types`` and ``save_options``, each a
        list of ``{"value": ..., "label": ...}`` items.
    """

    def as_options(pairs):
        # Expand (value, label) pairs into the dict shape the response uses.
        return [{"value": value, "label": label} for value, label in pairs]

    return {
        "login_types": as_options((
            ("qrcode", "二维码登录"),
            ("cookie", "Cookie登录"),
        )),
        "crawler_types": as_options((
            ("search", "搜索模式"),
            ("detail", "详情模式"),
            ("creator", "创作者模式"),
        )),
        "save_options": as_options((
            ("json", "JSON 文件"),
            ("csv", "CSV 文件"),
            ("excel", "Excel 文件"),
            ("sqlite", "SQLite 数据库"),
            ("db", "MySQL 数据库"),
            ("mongodb", "MongoDB 数据库"),
        )),
    }
|
||||||
|
|
||||||
|
|
||||||
|
# 挂载静态资源 - 必须放在所有路由之后
|
||||||
|
# Static mounts are registered after all routes (see the note in the
# original source: they must come last).
if os.path.exists(WEBUI_DIR):
    assets_dir = os.path.join(WEBUI_DIR, "assets")
    logos_dir = os.path.join(WEBUI_DIR, "logos")
    # Optional sub-directories are mounted only when they exist.
    for _mount_path, _directory, _mount_name in (
        ("/assets", assets_dir, "assets"),
        ("/logos", logos_dir, "logos"),
    ):
        if os.path.exists(_directory):
            app.mount(_mount_path, StaticFiles(directory=_directory), name=_mount_name)
    # Remaining static files (for example vite.svg) are served from the WebUI root.
    app.mount("/static", StaticFiles(directory=WEBUI_DIR), name="webui-static")


if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 8080.
    uvicorn.run(app, host="0.0.0.0", port=8080)
|
||||||
23
api/routers/__init__.py
Normal file
23
api/routers/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/__init__.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
from .crawler import router as crawler_router
|
||||||
|
from .data import router as data_router
|
||||||
|
from .websocket import router as websocket_router
|
||||||
|
|
||||||
|
__all__ = ["crawler_router", "data_router", "websocket_router"]
|
||||||
63
api/routers/crawler.py
Normal file
63
api/routers/crawler.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/crawler.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from ..schemas import CrawlerStartRequest, CrawlerStatusResponse
|
||||||
|
from ..services import crawler_manager
|
||||||
|
|
||||||
|
# Router for the crawler endpoints; its routes are declared under the "/crawler" prefix.
router = APIRouter(prefix="/crawler", tags=["crawler"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/start")
async def start_crawler(request: CrawlerStartRequest):
    """Start a crawler task.

    Args:
        request: Validated start parameters, handed to ``crawler_manager.start``.

    Returns:
        dict: ``{"status": "ok", "message": ...}`` when the manager reports success.

    Raises:
        HTTPException: 400 when the start failed while a crawler process is
            still running (concurrent or repeated request), 500 for any
            other failed start.
    """
    if await crawler_manager.start(request):
        return {"status": "ok", "message": "Crawler started successfully"}

    # A failed start with a live process means a duplicate request, not a server fault.
    proc = crawler_manager.process
    if proc and proc.poll() is None:
        raise HTTPException(status_code=400, detail="Crawler is already running")
    raise HTTPException(status_code=500, detail="Failed to start crawler")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/stop")
async def stop_crawler():
    """Stop the running crawler task.

    Returns:
        dict: ``{"status": "ok", "message": ...}`` when the manager reports success.

    Raises:
        HTTPException: 400 when the stop failed and there is no process or it
            has already exited (concurrent or repeated request), 500 for any
            other failed stop.
    """
    if await crawler_manager.stop():
        return {"status": "ok", "message": "Crawler stopped successfully"}

    proc = crawler_manager.process
    still_running = bool(proc) and proc.poll() is None
    if not still_running:
        # Nothing left to stop: report a client error instead of a server error.
        raise HTTPException(status_code=400, detail="No crawler is running")
    raise HTTPException(status_code=500, detail="Failed to stop crawler")
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/status", response_model=CrawlerStatusResponse)
async def get_crawler_status():
    """Return whatever ``crawler_manager.get_status()`` currently reports."""
    current_status = crawler_manager.get_status()
    return current_status
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/logs")
async def get_logs(limit: int = 100):
    """Return the most recent log entries.

    Args:
        limit: Number of trailing entries to return; zero or a negative value
            returns every stored entry.

    Returns:
        dict: ``{"logs": [...]}`` with each entry dumped via ``model_dump()``.
    """
    entries = crawler_manager.logs
    if limit > 0:
        entries = entries[-limit:]
    return {"logs": [entry.model_dump() for entry in entries]}
|
||||||
230
api/routers/data.py
Normal file
230
api/routers/data.py
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/data.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from fastapi.responses import FileResponse
|
||||||
|
|
||||||
|
# Router for the data-browsing endpoints; its routes are declared under the "/data" prefix.
router = APIRouter(prefix="/data", tags=["data"])
|
||||||
|
|
||||||
|
# 数据目录
|
||||||
|
# "data" directory located three directory levels above this file.
DATA_DIR = Path(__file__).parent.parent.parent / "data"
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_info(file_path: Path) -> dict:
    """Describe one data file for the file-listing API.

    Args:
        file_path: Path of a file located under ``DATA_DIR``.

    Returns:
        dict with ``name``, ``path`` (relative to ``DATA_DIR``), ``size``
        (bytes), ``modified_at`` (``st_mtime``), ``record_count`` and ``type``
        (suffix without the dot, or ``"unknown"``). ``record_count`` is the
        length of a top-level JSON list or the number of CSV data records;
        it is ``None`` for other content or when the file cannot be parsed.

    Raises:
        OSError: If the file cannot be stat'ed.
        ValueError: If ``file_path`` is not located under ``DATA_DIR``.
    """
    import csv  # local import: this module only imports csv inside functions

    stat = file_path.stat()
    # Compare case-insensitively so "X.JSON" is counted like "x.json",
    # matching how the listing code filters extensions.
    suffix = file_path.suffix.lower()
    record_count = None

    # Counting is best-effort: an unreadable or malformed file still gets
    # listed, just without a record count.
    try:
        if suffix == ".json":
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, list):
                record_count = len(data)
        elif suffix == ".csv":
            # Count parsed records rather than physical lines so quoted
            # fields containing newlines are not over-counted.
            with open(file_path, "r", encoding="utf-8", newline="") as f:
                rows = sum(1 for row in csv.reader(f) if row)
            # Drop the header row; an empty file yields 0, not -1.
            record_count = max(rows - 1, 0)
    except Exception:
        record_count = None

    return {
        "name": file_path.name,
        "path": str(file_path.relative_to(DATA_DIR)),
        "size": stat.st_size,
        "modified_at": stat.st_mtime,
        "record_count": record_count,
        "type": file_path.suffix[1:] if file_path.suffix else "unknown",
    }
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/files")
async def list_data_files(platform: Optional[str] = None, file_type: Optional[str] = None):
    """List the data files found under ``DATA_DIR``.

    Args:
        platform: When given, keep only files whose path relative to
            ``DATA_DIR`` contains this text (case-insensitive).
        file_type: When given, keep only files whose extension (without the
            dot) equals this value (case-insensitive).

    Returns:
        dict: ``{"files": [...]}`` sorted by modification time, newest first.
        The list is empty when ``DATA_DIR`` does not exist.
    """
    if not DATA_DIR.exists():
        return {"files": []}

    supported_extensions = {".json", ".csv", ".xlsx", ".xls"}
    wanted_platform = platform.lower() if platform else None
    wanted_type = file_type.lower() if file_type else None

    collected = []
    for current_dir, _subdirs, names in os.walk(DATA_DIR):
        base = Path(current_dir)
        for name in names:
            candidate = base / name
            extension = candidate.suffix.lower()
            if extension not in supported_extensions:
                continue

            # Platform filter: substring match on the relative path.
            if wanted_platform is not None:
                relative = str(candidate.relative_to(DATA_DIR))
                if wanted_platform not in relative.lower():
                    continue

            # Type filter: exact match on the extension without its dot.
            if wanted_type is not None and extension[1:] != wanted_type:
                continue

            try:
                collected.append(get_file_info(candidate))
            except Exception:
                # Files that cannot be described are left out of the listing.
                continue

    # Newest first.
    collected.sort(key=lambda info: info["modified_at"], reverse=True)
    return {"files": collected}
|
||||||
|
|
||||||
|
|
||||||
|
def _preview_json(path: Path, limit: int) -> dict:
    """Preview a JSON file: the first ``limit`` items of a top-level list, or the whole document."""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, list):
        return {"data": data[:limit], "total": len(data)}
    return {"data": data, "total": 1}


def _preview_csv(path: Path, limit: int) -> dict:
    """Preview a CSV file: the first ``limit`` records plus the total record count."""
    import csv

    rows = []
    total = 0
    with open(path, "r", encoding="utf-8", newline="") as f:
        # One pass: keep the first ``limit`` records and count all of them, so
        # the total is a record count (multi-line fields included correctly).
        for row in csv.DictReader(f):
            if total < limit:
                rows.append(row)
            total += 1
    return {"data": rows, "total": total}


def _preview_excel(path: Path, limit: int) -> dict:
    """Preview an Excel file: the first ``limit`` rows, the total row count and the column names."""
    import pandas as pd

    df = pd.read_excel(path, nrows=limit)
    # Row count from the first column only, to keep memory use down.
    total = len(pd.read_excel(path, usecols=[0]))
    # Cast to object before masking: in a float column ``where(..., None)``
    # leaves NaN in place, and NaN cannot be encoded in a JSON response.
    rows = df.astype(object).where(pd.notnull(df), None).to_dict(orient="records")
    return {"data": rows, "total": total, "columns": list(df.columns)}


@router.get("/files/{file_path:path}")
async def get_file_content(file_path: str, preview: bool = True, limit: int = 100):
    """Return a preview of a data file, or the file itself.

    Args:
        file_path: Path of the file relative to ``DATA_DIR``.
        preview: When true return parsed preview data; otherwise send the
            file as a download.
        limit: Maximum number of records in the preview; negative values are
            treated as 0.

    Returns:
        For previews, a dict with ``data`` and ``total`` (plus ``columns``
        for Excel files); otherwise a ``FileResponse``.

    Raises:
        HTTPException: 403 when the path resolves outside ``DATA_DIR``, 404
            when it does not exist, 400 when it is not a file, has an
            unsupported type or holds invalid JSON, 500 for any other
            preview failure.
    """
    full_path = DATA_DIR / file_path

    # Containment check first, so requests for paths outside DATA_DIR get the
    # same answer whether or not the target exists.
    try:
        full_path.resolve().relative_to(DATA_DIR.resolve())
    except ValueError:
        raise HTTPException(status_code=403, detail="Access denied")

    if not full_path.exists():
        raise HTTPException(status_code=404, detail="File not found")

    if not full_path.is_file():
        raise HTTPException(status_code=400, detail="Not a file")

    if not preview:
        return FileResponse(
            path=full_path,
            filename=full_path.name,
            media_type="application/octet-stream",
        )

    previewers = {
        ".json": _preview_json,
        ".csv": _preview_csv,
        ".xlsx": _preview_excel,
        ".xls": _preview_excel,
    }
    previewer = previewers.get(full_path.suffix.lower())
    if previewer is None:
        # Raised outside the try below so it is not rewritten into a 500.
        raise HTTPException(status_code=400, detail="Unsupported file type for preview")

    try:
        return previewer(full_path, max(limit, 0))
    except json.JSONDecodeError:
        raise HTTPException(status_code=400, detail="Invalid JSON file")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/download/{file_path:path}")
async def download_file(file_path: str):
    """Send a data file as a download.

    Args:
        file_path: Path of the file relative to ``DATA_DIR``.

    Returns:
        A ``FileResponse`` with media type ``application/octet-stream``.

    Raises:
        HTTPException: 403 when the path resolves outside ``DATA_DIR``, 404
            when it does not exist, 400 when it is not a file.
    """
    full_path = DATA_DIR / file_path

    # Containment check first, so requests for paths outside DATA_DIR get the
    # same answer whether or not the target exists.
    try:
        full_path.resolve().relative_to(DATA_DIR.resolve())
    except ValueError:
        raise HTTPException(status_code=403, detail="Access denied")

    if not full_path.exists():
        raise HTTPException(status_code=404, detail="File not found")

    if not full_path.is_file():
        raise HTTPException(status_code=400, detail="Not a file")

    return FileResponse(
        path=full_path,
        filename=full_path.name,
        media_type="application/octet-stream",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/stats")
async def get_data_stats():
    """Summarise the data files found under ``DATA_DIR``.

    Returns:
        dict: ``total_files``, ``total_size`` (bytes), ``by_platform`` and
        ``by_type`` counters. All are zero or empty when ``DATA_DIR`` does
        not exist.
    """
    if not DATA_DIR.exists():
        return {"total_files": 0, "total_size": 0, "by_platform": {}, "by_type": {}}

    supported_extensions = {".json", ".csv", ".xlsx", ".xls"}
    known_platforms = ("xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu")

    file_total = 0
    byte_total = 0
    per_platform = {}
    per_type = {}

    for current_dir, _subdirs, names in os.walk(DATA_DIR):
        base = Path(current_dir)
        for name in names:
            candidate = base / name
            if candidate.suffix.lower() not in supported_extensions:
                continue

            try:
                size = candidate.stat().st_size
                file_total += 1
                byte_total += size

                # Count by extension.
                kind = candidate.suffix[1:].lower()
                per_type[kind] = per_type.get(kind, 0) + 1

                # Count by platform: first known platform name that appears
                # in the relative path, if any.
                relative = str(candidate.relative_to(DATA_DIR)).lower()
                matched = next((p for p in known_platforms if p in relative), None)
                if matched is not None:
                    per_platform[matched] = per_platform.get(matched, 0) + 1
            except Exception:
                continue

    return {
        "total_files": file_total,
        "total_size": byte_total,
        "by_platform": per_platform,
        "by_type": per_type,
    }
|
||||||
151
api/routers/websocket.py
Normal file
151
api/routers/websocket.py
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/routers/websocket.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import Set, Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
|
||||||
|
|
||||||
|
from ..services import crawler_manager
|
||||||
|
|
||||||
|
# Router for the WebSocket endpoints; it declares no prefix of its own.
router = APIRouter(tags=["websocket"])
|
||||||
|
|
||||||
|
|
||||||
|
class ConnectionManager:
    """Keeps the set of accepted WebSocket connections and broadcasts to them."""

    def __init__(self):
        self.active_connections: Set[WebSocket] = set()

    async def connect(self, websocket: WebSocket):
        """Accept the handshake, then start tracking the connection."""
        await websocket.accept()
        self.active_connections.add(websocket)

    def disconnect(self, websocket: WebSocket):
        """Stop tracking a connection; unknown connections are ignored."""
        self.active_connections.discard(websocket)

    async def broadcast(self, message: dict):
        """Send ``message`` as JSON to every tracked connection.

        Connections whose send raises are dropped from the set afterwards.
        """
        failed = []
        # Iterate over a snapshot: the set may change while a send is awaited.
        for ws in tuple(self.active_connections):
            try:
                await ws.send_json(message)
            except Exception:
                failed.append(ws)

        # Forget the connections that could not be written to.
        for ws in failed:
            self.disconnect(ws)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level ConnectionManager instance used by the functions in this module.
manager = ConnectionManager()
|
||||||
|
|
||||||
|
|
||||||
|
async def log_broadcaster():
    """Background loop: take log entries from the queue and broadcast them.

    Ends when the task is cancelled. Any other error is printed and the loop
    continues after a 0.1 second pause.
    """
    log_queue = crawler_manager.get_log_queue()
    while True:
        try:
            # Wait for the next entry, then fan it out to every connection.
            item = await log_queue.get()
            await manager.broadcast(item.model_dump())
        except asyncio.CancelledError:
            return
        except Exception as e:
            print(f"Log broadcaster error: {e}")
            await asyncio.sleep(0.1)
|
||||||
|
|
||||||
|
|
||||||
|
# 全局广播任务
|
||||||
|
# Handle of the broadcaster task; None until start_broadcaster() creates one.
_broadcaster_task: Optional[asyncio.Task] = None
|
||||||
|
|
||||||
|
|
||||||
|
def start_broadcaster():
    """Create the broadcaster task unless one exists and has not finished."""
    global _broadcaster_task
    existing = _broadcaster_task
    if existing is not None and not existing.done():
        return
    _broadcaster_task = asyncio.create_task(log_broadcaster())
|
||||||
|
|
||||||
|
|
||||||
|
@router.websocket("/ws/logs")
async def websocket_logs(websocket: WebSocket):
    """WebSocket log stream.

    Registers the connection with ``manager``, replays the stored log
    entries, then keeps the socket open: a client ``"ping"`` is answered with
    ``"pong"`` and a ``"ping"`` is sent after 30 seconds without a message.
    The connection is always unregistered on exit.
    """
    print("[WS] New connection attempt")

    try:
        # Make sure the broadcaster task is running before registering.
        start_broadcaster()

        await manager.connect(websocket)
        print(f"[WS] Connected, active connections: {len(manager.active_connections)}")

        # Replay the entries stored so far; stop replaying on the first failed send.
        for stored in crawler_manager.logs:
            try:
                await websocket.send_json(stored.model_dump())
            except Exception as send_error:
                print(f"[WS] Error sending existing log: {send_error}")
                break

        print(f"[WS] Sent {len(crawler_manager.logs)} existing logs, entering main loop")

        while True:
            # Keep the connection alive: wait for a heartbeat or any message.
            try:
                incoming = await asyncio.wait_for(
                    websocket.receive_text(),
                    timeout=30.0,
                )
                if incoming == "ping":
                    await websocket.send_text("pong")
            except asyncio.TimeoutError:
                # Nothing received in time: send a ping of our own.
                try:
                    await websocket.send_text("ping")
                except Exception as ping_error:
                    print(f"[WS] Error sending ping: {ping_error}")
                    break

    except WebSocketDisconnect:
        print("[WS] Client disconnected")
    except Exception as error:
        print(f"[WS] Error: {type(error).__name__}: {error}")
    finally:
        manager.disconnect(websocket)
        print(f"[WS] Cleanup done, active connections: {len(manager.active_connections)}")
|
||||||
|
|
||||||
|
|
||||||
|
@router.websocket("/ws/status")
async def websocket_status(websocket: WebSocket):
    """WebSocket status stream: sends the crawler status once per second.

    The loop ends silently when the client disconnects or any send fails.
    """
    await websocket.accept()

    try:
        while True:
            # Send the current status, then wait one second.
            await websocket.send_json(crawler_manager.get_status())
            await asyncio.sleep(1)
    except WebSocketDisconnect:
        return
    except Exception:
        return
|
||||||
37
api/schemas/__init__.py
Normal file
37
api/schemas/__init__.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/schemas/__init__.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
from .crawler import (
|
||||||
|
PlatformEnum,
|
||||||
|
LoginTypeEnum,
|
||||||
|
CrawlerTypeEnum,
|
||||||
|
SaveDataOptionEnum,
|
||||||
|
CrawlerStartRequest,
|
||||||
|
CrawlerStatusResponse,
|
||||||
|
LogEntry,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"PlatformEnum",
|
||||||
|
"LoginTypeEnum",
|
||||||
|
"CrawlerTypeEnum",
|
||||||
|
"SaveDataOptionEnum",
|
||||||
|
"CrawlerStartRequest",
|
||||||
|
"CrawlerStatusResponse",
|
||||||
|
"LogEntry",
|
||||||
|
]
|
||||||
98
api/schemas/crawler.py
Normal file
98
api/schemas/crawler.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/schemas/crawler.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Optional, Literal
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class PlatformEnum(str, Enum):
    """Supported media platforms; each value is the platform's short code."""

    XHS = "xhs"
    DOUYIN = "dy"
    KUAISHOU = "ks"
    BILIBILI = "bili"
    WEIBO = "wb"
    TIEBA = "tieba"
    ZHIHU = "zhihu"
|
||||||
|
|
||||||
|
|
||||||
|
class LoginTypeEnum(str, Enum):
    """Login methods accepted in a start request."""

    QRCODE = "qrcode"
    PHONE = "phone"
    COOKIE = "cookie"
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlerTypeEnum(str, Enum):
    """Crawler modes: search, detail or creator."""

    SEARCH = "search"
    DETAIL = "detail"
    CREATOR = "creator"
|
||||||
|
|
||||||
|
|
||||||
|
class SaveDataOptionEnum(str, Enum):
    """Ways of saving crawled data."""

    CSV = "csv"
    DB = "db"
    JSON = "json"
    SQLITE = "sqlite"
    MONGODB = "mongodb"
    EXCEL = "excel"
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlerStartRequest(BaseModel):
    """Request body for starting a crawler task."""

    # Only the platform is required; every other field has a default.
    platform: PlatformEnum
    login_type: LoginTypeEnum = LoginTypeEnum.QRCODE
    crawler_type: CrawlerTypeEnum = CrawlerTypeEnum.SEARCH
    keywords: str = ""  # keywords for search mode
    specified_ids: str = ""  # post/video IDs for detail mode, comma-separated
    creator_ids: str = ""  # creator IDs for creator mode, comma-separated
    start_page: int = 1
    enable_comments: bool = True
    enable_sub_comments: bool = False
    save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSON
    cookies: str = ""
    headless: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlerStatusResponse(BaseModel):
    """Response body describing the crawler status."""

    # One of the four literal states; the remaining fields are optional details.
    status: Literal["idle", "running", "stopping", "error"]
    platform: Optional[str] = None
    crawler_type: Optional[str] = None
    started_at: Optional[str] = None
    error_message: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class LogEntry(BaseModel):
    """A single log entry."""

    id: int
    timestamp: str
    # Severity is restricted to these five literal values.
    level: Literal["info", "warning", "error", "success", "debug"]
    message: str
|
||||||
|
|
||||||
|
|
||||||
|
class DataFileInfo(BaseModel):
    """Information about one data file."""

    name: str
    path: str
    size: int
    # NOTE(review): typed as str here; confirm the format producers are expected to use.
    modified_at: str
    record_count: Optional[int] = None
|
||||||
21
api/services/__init__.py
Normal file
21
api/services/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/services/__init__.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
from .crawler_manager import CrawlerManager, crawler_manager
|
||||||
|
|
||||||
|
__all__ = ["CrawlerManager", "crawler_manager"]
|
||||||
281
api/services/crawler_manager.py
Normal file
281
api/services/crawler_manager.py
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (c) 2025 relakkes@gmail.com
|
||||||
|
#
|
||||||
|
# This file is part of MediaCrawler project.
|
||||||
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/api/services/crawler_manager.py
|
||||||
|
# GitHub: https://github.com/NanmiCoder
|
||||||
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||||
|
#
|
||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import subprocess
|
||||||
|
import signal
|
||||||
|
import os
|
||||||
|
from typing import Optional, List
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from ..schemas import CrawlerStartRequest, LogEntry
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlerManager:
|
||||||
|
"""爬虫进程管理器"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
self.process: Optional[subprocess.Popen] = None
|
||||||
|
self.status = "idle"
|
||||||
|
self.started_at: Optional[datetime] = None
|
||||||
|
self.current_config: Optional[CrawlerStartRequest] = None
|
||||||
|
self._log_id = 0
|
||||||
|
self._logs: List[LogEntry] = []
|
||||||
|
self._read_task: Optional[asyncio.Task] = None
|
||||||
|
# 项目根目录
|
||||||
|
self._project_root = Path(__file__).parent.parent.parent
|
||||||
|
# 日志队列 - 用于向 WebSocket 推送
|
||||||
|
self._log_queue: Optional[asyncio.Queue] = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def logs(self) -> List[LogEntry]:
|
||||||
|
return self._logs
|
||||||
|
|
||||||
|
def get_log_queue(self) -> asyncio.Queue:
|
||||||
|
"""获取或创建日志队列"""
|
||||||
|
if self._log_queue is None:
|
||||||
|
self._log_queue = asyncio.Queue()
|
||||||
|
return self._log_queue
|
||||||
|
|
||||||
|
def _create_log_entry(self, message: str, level: str = "info") -> LogEntry:
|
||||||
|
"""创建日志条目"""
|
||||||
|
self._log_id += 1
|
||||||
|
entry = LogEntry(
|
||||||
|
id=self._log_id,
|
||||||
|
timestamp=datetime.now().strftime("%H:%M:%S"),
|
||||||
|
level=level,
|
||||||
|
message=message
|
||||||
|
)
|
||||||
|
self._logs.append(entry)
|
||||||
|
# 保留最近 500 条日志
|
||||||
|
if len(self._logs) > 500:
|
||||||
|
self._logs = self._logs[-500:]
|
||||||
|
return entry
|
||||||
|
|
||||||
|
async def _push_log(self, entry: LogEntry):
|
||||||
|
"""推送日志到队列"""
|
||||||
|
if self._log_queue is not None:
|
||||||
|
try:
|
||||||
|
self._log_queue.put_nowait(entry)
|
||||||
|
except asyncio.QueueFull:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _parse_log_level(self, line: str) -> str:
|
||||||
|
"""解析日志级别"""
|
||||||
|
line_upper = line.upper()
|
||||||
|
if "ERROR" in line_upper or "FAILED" in line_upper:
|
||||||
|
return "error"
|
||||||
|
elif "WARNING" in line_upper or "WARN" in line_upper:
|
||||||
|
return "warning"
|
||||||
|
elif "SUCCESS" in line_upper or "完成" in line or "成功" in line:
|
||||||
|
return "success"
|
||||||
|
elif "DEBUG" in line_upper:
|
||||||
|
return "debug"
|
||||||
|
return "info"
|
||||||
|
|
||||||
|
async def start(self, config: CrawlerStartRequest) -> bool:
|
||||||
|
"""启动爬虫进程"""
|
||||||
|
async with self._lock:
|
||||||
|
if self.process and self.process.poll() is None:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# 清空旧日志
|
||||||
|
self._logs = []
|
||||||
|
self._log_id = 0
|
||||||
|
|
||||||
|
# 清空待推送队列(不要替换对象,避免 WebSocket 广播协程持有旧队列引用)
|
||||||
|
if self._log_queue is None:
|
||||||
|
self._log_queue = asyncio.Queue()
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
self._log_queue.get_nowait()
|
||||||
|
except asyncio.QueueEmpty:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 构建命令行参数
|
||||||
|
cmd = self._build_command(config)
|
||||||
|
|
||||||
|
# 记录启动日志
|
||||||
|
entry = self._create_log_entry(f"Starting crawler: {' '.join(cmd)}", "info")
|
||||||
|
await self._push_log(entry)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 启动子进程
|
||||||
|
self.process = subprocess.Popen(
|
||||||
|
cmd,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.STDOUT,
|
||||||
|
text=True,
|
||||||
|
bufsize=1,
|
||||||
|
cwd=str(self._project_root),
|
||||||
|
env={**os.environ, "PYTHONUNBUFFERED": "1"}
|
||||||
|
)
|
||||||
|
|
||||||
|
self.status = "running"
|
||||||
|
self.started_at = datetime.now()
|
||||||
|
self.current_config = config
|
||||||
|
|
||||||
|
entry = self._create_log_entry(
|
||||||
|
f"Crawler started on platform: {config.platform.value}, type: {config.crawler_type.value}",
|
||||||
|
"success"
|
||||||
|
)
|
||||||
|
await self._push_log(entry)
|
||||||
|
|
||||||
|
# 启动日志读取任务
|
||||||
|
self._read_task = asyncio.create_task(self._read_output())
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
self.status = "error"
|
||||||
|
entry = self._create_log_entry(f"Failed to start crawler: {str(e)}", "error")
|
||||||
|
await self._push_log(entry)
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def stop(self) -> bool:
|
||||||
|
"""停止爬虫进程"""
|
||||||
|
async with self._lock:
|
||||||
|
if not self.process or self.process.poll() is not None:
|
||||||
|
return False
|
||||||
|
|
||||||
|
self.status = "stopping"
|
||||||
|
entry = self._create_log_entry("Sending SIGTERM to crawler process...", "warning")
|
||||||
|
await self._push_log(entry)
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.process.send_signal(signal.SIGTERM)
|
||||||
|
|
||||||
|
# 等待优雅退出 (最多15秒)
|
||||||
|
for _ in range(30):
|
||||||
|
if self.process.poll() is not None:
|
||||||
|
break
|
||||||
|
await asyncio.sleep(0.5)
|
||||||
|
|
||||||
|
# 如果还没退出,强制杀死
|
||||||
|
if self.process.poll() is None:
|
||||||
|
entry = self._create_log_entry("Process not responding, sending SIGKILL...", "warning")
|
||||||
|
await self._push_log(entry)
|
||||||
|
self.process.kill()
|
||||||
|
|
||||||
|
entry = self._create_log_entry("Crawler process terminated", "info")
|
||||||
|
await self._push_log(entry)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
entry = self._create_log_entry(f"Error stopping crawler: {str(e)}", "error")
|
||||||
|
await self._push_log(entry)
|
||||||
|
|
||||||
|
self.status = "idle"
|
||||||
|
self.current_config = None
|
||||||
|
|
||||||
|
# 取消日志读取任务
|
||||||
|
if self._read_task:
|
||||||
|
self._read_task.cancel()
|
||||||
|
self._read_task = None
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
def get_status(self) -> dict:
|
||||||
|
"""获取当前状态"""
|
||||||
|
return {
|
||||||
|
"status": self.status,
|
||||||
|
"platform": self.current_config.platform.value if self.current_config else None,
|
||||||
|
"crawler_type": self.current_config.crawler_type.value if self.current_config else None,
|
||||||
|
"started_at": self.started_at.isoformat() if self.started_at else None,
|
||||||
|
"error_message": None
|
||||||
|
}
|
||||||
|
|
||||||
|
def _build_command(self, config: CrawlerStartRequest) -> list:
|
||||||
|
"""构建 main.py 命令行参数"""
|
||||||
|
cmd = ["python", "main.py"]
|
||||||
|
|
||||||
|
cmd.extend(["--platform", config.platform.value])
|
||||||
|
cmd.extend(["--lt", config.login_type.value])
|
||||||
|
cmd.extend(["--type", config.crawler_type.value])
|
||||||
|
cmd.extend(["--save_data_option", config.save_option.value])
|
||||||
|
|
||||||
|
# 根据爬虫类型传递不同的参数
|
||||||
|
if config.crawler_type.value == "search" and config.keywords:
|
||||||
|
cmd.extend(["--keywords", config.keywords])
|
||||||
|
elif config.crawler_type.value == "detail" and config.specified_ids:
|
||||||
|
cmd.extend(["--specified_id", config.specified_ids])
|
||||||
|
elif config.crawler_type.value == "creator" and config.creator_ids:
|
||||||
|
cmd.extend(["--creator_id", config.creator_ids])
|
||||||
|
|
||||||
|
if config.start_page != 1:
|
||||||
|
cmd.extend(["--start", str(config.start_page)])
|
||||||
|
|
||||||
|
cmd.extend(["--get_comment", "true" if config.enable_comments else "false"])
|
||||||
|
cmd.extend(["--get_sub_comment", "true" if config.enable_sub_comments else "false"])
|
||||||
|
|
||||||
|
if config.cookies:
|
||||||
|
cmd.extend(["--cookies", config.cookies])
|
||||||
|
|
||||||
|
cmd.extend(["--headless", "true" if config.headless else "false"])
|
||||||
|
|
||||||
|
return cmd
|
||||||
|
|
||||||
|
async def _read_output(self):
|
||||||
|
"""异步读取进程输出"""
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
|
try:
|
||||||
|
while self.process and self.process.poll() is None:
|
||||||
|
# 在线程池中读取一行
|
||||||
|
line = await loop.run_in_executor(
|
||||||
|
None, self.process.stdout.readline
|
||||||
|
)
|
||||||
|
if line:
|
||||||
|
line = line.strip()
|
||||||
|
if line:
|
||||||
|
level = self._parse_log_level(line)
|
||||||
|
entry = self._create_log_entry(line, level)
|
||||||
|
await self._push_log(entry)
|
||||||
|
|
||||||
|
# 读取剩余输出
|
||||||
|
if self.process and self.process.stdout:
|
||||||
|
remaining = await loop.run_in_executor(
|
||||||
|
None, self.process.stdout.read
|
||||||
|
)
|
||||||
|
if remaining:
|
||||||
|
for line in remaining.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
level = self._parse_log_level(line)
|
||||||
|
entry = self._create_log_entry(line.strip(), level)
|
||||||
|
await self._push_log(entry)
|
||||||
|
|
||||||
|
# 进程结束
|
||||||
|
if self.status == "running":
|
||||||
|
exit_code = self.process.returncode if self.process else -1
|
||||||
|
if exit_code == 0:
|
||||||
|
entry = self._create_log_entry("Crawler completed successfully", "success")
|
||||||
|
else:
|
||||||
|
entry = self._create_log_entry(f"Crawler exited with code: {exit_code}", "warning")
|
||||||
|
await self._push_log(entry)
|
||||||
|
self.status = "idle"
|
||||||
|
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
entry = self._create_log_entry(f"Error reading output: {str(e)}", "error")
|
||||||
|
await self._push_log(entry)
|
||||||
|
|
||||||
|
|
||||||
|
# Global singleton instance
|
||||||
|
crawler_manager = CrawlerManager()
|
||||||
1
api/webui/assets/index-BKWwy9pb.css
Normal file
1
api/webui/assets/index-BKWwy9pb.css
Normal file
File diff suppressed because one or more lines are too long
338
api/webui/assets/index-DQPd_23u.js
Normal file
338
api/webui/assets/index-DQPd_23u.js
Normal file
File diff suppressed because one or more lines are too long
17
api/webui/index.html
Normal file
17
api/webui/index.html
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="zh-CN">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<title>MediaCrawler - Command Center</title>
|
||||||
|
<link rel="preconnect" href="https://fonts.googleapis.com">
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||||
|
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
|
||||||
|
<script type="module" crossorigin src="/assets/index-DQPd_23u.js"></script>
|
||||||
|
<link rel="stylesheet" crossorigin href="/assets/index-BKWwy9pb.css">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="root"></div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
BIN
api/webui/logos/bilibili_logo.png
Normal file
BIN
api/webui/logos/bilibili_logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 42 KiB |
BIN
api/webui/logos/douyin.png
Normal file
BIN
api/webui/logos/douyin.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
BIN
api/webui/logos/github.png
Normal file
BIN
api/webui/logos/github.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 7.8 KiB |
BIN
api/webui/logos/my_logo.png
Normal file
BIN
api/webui/logos/my_logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 312 KiB |
BIN
api/webui/logos/xiaohongshu_logo.png
Normal file
BIN
api/webui/logos/xiaohongshu_logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 6.2 KiB |
1
api/webui/vite.svg
Normal file
1
api/webui/vite.svg
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><circle cx="50" cy="50" r="40" fill="#de283b"/></svg>
|
||||||
|
After Width: | Height: | Size: 116 B |
Reference in New Issue
Block a user