mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-03-03 20:50:47 +08:00
i18n: translate all Chinese comments, docstrings, and logger messages to English
Comprehensive translation of Chinese text to English across the entire codebase: - api/: FastAPI server documentation and logger messages - cache/: Cache abstraction layer comments and docstrings - database/: Database models and MongoDB store documentation - media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu) - model/: Data model documentation - proxy/: Proxy pool and provider documentation - store/: Data storage layer comments - tools/: Utility functions and browser automation - test/: Test file documentation Preserved: Chinese disclaimer header (lines 10-18) for legal compliance 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
84
api/main.py
84
api/main.py
@@ -18,8 +18,8 @@
|
||||
|
||||
"""
|
||||
MediaCrawler WebUI API Server
|
||||
启动命令: uvicorn api.main:app --port 8080 --reload
|
||||
或者: python -m api.main
|
||||
Start command: uvicorn api.main:app --port 8080 --reload
|
||||
Or: python -m api.main
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
@@ -38,15 +38,15 @@ app = FastAPI(
|
||||
version="1.0.0"
|
||||
)
|
||||
|
||||
# 获取 webui 静态文件目录
|
||||
# Get webui static files directory
|
||||
WEBUI_DIR = os.path.join(os.path.dirname(__file__), "webui")
|
||||
|
||||
# CORS 配置 - 允许前端开发服务器访问
|
||||
# CORS configuration - allow frontend dev server access
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=[
|
||||
"http://localhost:5173", # Vite dev server
|
||||
"http://localhost:3000", # 备用端口
|
||||
"http://localhost:3000", # Backup port
|
||||
"http://127.0.0.1:5173",
|
||||
"http://127.0.0.1:3000",
|
||||
],
|
||||
@@ -55,7 +55,7 @@ app.add_middleware(
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# 注册路由
|
||||
# Register routers
|
||||
app.include_router(crawler_router, prefix="/api")
|
||||
app.include_router(data_router, prefix="/api")
|
||||
app.include_router(websocket_router, prefix="/api")
|
||||
@@ -63,7 +63,7 @@ app.include_router(websocket_router, prefix="/api")
|
||||
|
||||
@app.get("/")
|
||||
async def serve_frontend():
|
||||
"""返回前端页面"""
|
||||
"""Return frontend page"""
|
||||
index_path = os.path.join(WEBUI_DIR, "index.html")
|
||||
if os.path.exists(index_path):
|
||||
return FileResponse(index_path)
|
||||
@@ -82,103 +82,103 @@ async def health_check():
|
||||
|
||||
@app.get("/api/env/check")
|
||||
async def check_environment():
|
||||
"""检测 MediaCrawler 环境是否配置正确"""
|
||||
"""Check if MediaCrawler environment is configured correctly"""
|
||||
try:
|
||||
# 运行 uv run main.py --help 命令检测环境
|
||||
# Run uv run main.py --help command to check environment
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
"uv", "run", "main.py", "--help",
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
cwd="." # 项目根目录
|
||||
cwd="." # Project root directory
|
||||
)
|
||||
stdout, stderr = await asyncio.wait_for(
|
||||
process.communicate(),
|
||||
timeout=30.0 # 30秒超时
|
||||
timeout=30.0 # 30 seconds timeout
|
||||
)
|
||||
|
||||
if process.returncode == 0:
|
||||
return {
|
||||
"success": True,
|
||||
"message": "MediaCrawler 环境配置正确",
|
||||
"output": stdout.decode("utf-8", errors="ignore")[:500] # 截取前500字符
|
||||
"message": "MediaCrawler environment configured correctly",
|
||||
"output": stdout.decode("utf-8", errors="ignore")[:500] # Truncate to first 500 characters
|
||||
}
|
||||
else:
|
||||
error_msg = stderr.decode("utf-8", errors="ignore") or stdout.decode("utf-8", errors="ignore")
|
||||
return {
|
||||
"success": False,
|
||||
"message": "环境检测失败",
|
||||
"message": "Environment check failed",
|
||||
"error": error_msg[:500]
|
||||
}
|
||||
except asyncio.TimeoutError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "环境检测超时",
|
||||
"error": "命令执行超过30秒"
|
||||
"message": "Environment check timeout",
|
||||
"error": "Command execution exceeded 30 seconds"
|
||||
}
|
||||
except FileNotFoundError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "未找到 uv 命令",
|
||||
"error": "请确保已安装 uv 并配置到系统 PATH"
|
||||
"message": "uv command not found",
|
||||
"error": "Please ensure uv is installed and configured in system PATH"
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "环境检测出错",
|
||||
"message": "Environment check error",
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/config/platforms")
|
||||
async def get_platforms():
|
||||
"""获取支持的平台列表"""
|
||||
"""Get list of supported platforms"""
|
||||
return {
|
||||
"platforms": [
|
||||
{"value": "xhs", "label": "小红书", "icon": "book-open"},
|
||||
{"value": "dy", "label": "抖音", "icon": "music"},
|
||||
{"value": "ks", "label": "快手", "icon": "video"},
|
||||
{"value": "bili", "label": "哔哩哔哩", "icon": "tv"},
|
||||
{"value": "wb", "label": "微博", "icon": "message-circle"},
|
||||
{"value": "tieba", "label": "百度贴吧", "icon": "messages-square"},
|
||||
{"value": "zhihu", "label": "知乎", "icon": "help-circle"},
|
||||
{"value": "xhs", "label": "Xiaohongshu", "icon": "book-open"},
|
||||
{"value": "dy", "label": "Douyin", "icon": "music"},
|
||||
{"value": "ks", "label": "Kuaishou", "icon": "video"},
|
||||
{"value": "bili", "label": "Bilibili", "icon": "tv"},
|
||||
{"value": "wb", "label": "Weibo", "icon": "message-circle"},
|
||||
{"value": "tieba", "label": "Baidu Tieba", "icon": "messages-square"},
|
||||
{"value": "zhihu", "label": "Zhihu", "icon": "help-circle"},
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/config/options")
|
||||
async def get_config_options():
|
||||
"""获取所有配置选项"""
|
||||
"""Get all configuration options"""
|
||||
return {
|
||||
"login_types": [
|
||||
{"value": "qrcode", "label": "二维码登录"},
|
||||
{"value": "cookie", "label": "Cookie登录"},
|
||||
{"value": "qrcode", "label": "QR Code Login"},
|
||||
{"value": "cookie", "label": "Cookie Login"},
|
||||
],
|
||||
"crawler_types": [
|
||||
{"value": "search", "label": "搜索模式"},
|
||||
{"value": "detail", "label": "详情模式"},
|
||||
{"value": "creator", "label": "创作者模式"},
|
||||
{"value": "search", "label": "Search Mode"},
|
||||
{"value": "detail", "label": "Detail Mode"},
|
||||
{"value": "creator", "label": "Creator Mode"},
|
||||
],
|
||||
"save_options": [
|
||||
{"value": "json", "label": "JSON 文件"},
|
||||
{"value": "csv", "label": "CSV 文件"},
|
||||
{"value": "excel", "label": "Excel 文件"},
|
||||
{"value": "sqlite", "label": "SQLite 数据库"},
|
||||
{"value": "db", "label": "MySQL 数据库"},
|
||||
{"value": "mongodb", "label": "MongoDB 数据库"},
|
||||
{"value": "json", "label": "JSON File"},
|
||||
{"value": "csv", "label": "CSV File"},
|
||||
{"value": "excel", "label": "Excel File"},
|
||||
{"value": "sqlite", "label": "SQLite Database"},
|
||||
{"value": "db", "label": "MySQL Database"},
|
||||
{"value": "mongodb", "label": "MongoDB Database"},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
# 挂载静态资源 - 必须放在所有路由之后
|
||||
# Mount static resources - must be placed after all routes
|
||||
if os.path.exists(WEBUI_DIR):
|
||||
assets_dir = os.path.join(WEBUI_DIR, "assets")
|
||||
if os.path.exists(assets_dir):
|
||||
app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")
|
||||
# 挂载 logos 目录
|
||||
# Mount logos directory
|
||||
logos_dir = os.path.join(WEBUI_DIR, "logos")
|
||||
if os.path.exists(logos_dir):
|
||||
app.mount("/logos", StaticFiles(directory=logos_dir), name="logos")
|
||||
# 挂载其他静态文件(如 vite.svg)
|
||||
# Mount other static files (e.g., vite.svg)
|
||||
app.mount("/static", StaticFiles(directory=WEBUI_DIR), name="webui-static")
|
||||
|
||||
|
||||
|
||||
@@ -26,10 +26,10 @@ router = APIRouter(prefix="/crawler", tags=["crawler"])
|
||||
|
||||
@router.post("/start")
|
||||
async def start_crawler(request: CrawlerStartRequest):
|
||||
"""启动爬虫任务"""
|
||||
"""Start crawler task"""
|
||||
success = await crawler_manager.start(request)
|
||||
if not success:
|
||||
# 处理并发/重复请求:如果进程已经在跑,返回 400 而不是 500
|
||||
# Handle concurrent/duplicate requests: if process is already running, return 400 instead of 500
|
||||
if crawler_manager.process and crawler_manager.process.poll() is None:
|
||||
raise HTTPException(status_code=400, detail="Crawler is already running")
|
||||
raise HTTPException(status_code=500, detail="Failed to start crawler")
|
||||
@@ -39,10 +39,10 @@ async def start_crawler(request: CrawlerStartRequest):
|
||||
|
||||
@router.post("/stop")
|
||||
async def stop_crawler():
|
||||
"""停止爬虫任务"""
|
||||
"""Stop crawler task"""
|
||||
success = await crawler_manager.stop()
|
||||
if not success:
|
||||
# 处理并发/重复请求:如果进程已退出/不存在,返回 400 而不是 500
|
||||
# Handle concurrent/duplicate requests: if process already exited/doesn't exist, return 400 instead of 500
|
||||
if not crawler_manager.process or crawler_manager.process.poll() is not None:
|
||||
raise HTTPException(status_code=400, detail="No crawler is running")
|
||||
raise HTTPException(status_code=500, detail="Failed to stop crawler")
|
||||
@@ -52,12 +52,12 @@ async def stop_crawler():
|
||||
|
||||
@router.get("/status", response_model=CrawlerStatusResponse)
|
||||
async def get_crawler_status():
|
||||
"""获取爬虫状态"""
|
||||
"""Get crawler status"""
|
||||
return crawler_manager.get_status()
|
||||
|
||||
|
||||
@router.get("/logs")
|
||||
async def get_logs(limit: int = 100):
|
||||
"""获取最近的日志"""
|
||||
"""Get recent logs"""
|
||||
logs = crawler_manager.logs[-limit:] if limit > 0 else crawler_manager.logs
|
||||
return {"logs": [log.model_dump() for log in logs]}
|
||||
|
||||
@@ -26,16 +26,16 @@ from fastapi.responses import FileResponse
|
||||
|
||||
router = APIRouter(prefix="/data", tags=["data"])
|
||||
|
||||
# 数据目录
|
||||
# Data directory
|
||||
DATA_DIR = Path(__file__).parent.parent.parent / "data"
|
||||
|
||||
|
||||
def get_file_info(file_path: Path) -> dict:
|
||||
"""获取文件信息"""
|
||||
"""Get file information"""
|
||||
stat = file_path.stat()
|
||||
record_count = None
|
||||
|
||||
# 尝试获取记录数
|
||||
# Try to get record count
|
||||
try:
|
||||
if file_path.suffix == ".json":
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
@@ -44,7 +44,7 @@ def get_file_info(file_path: Path) -> dict:
|
||||
record_count = len(data)
|
||||
elif file_path.suffix == ".csv":
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
record_count = sum(1 for _ in f) - 1 # 减去标题行
|
||||
record_count = sum(1 for _ in f) - 1 # Subtract header row
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -60,7 +60,7 @@ def get_file_info(file_path: Path) -> dict:
|
||||
|
||||
@router.get("/files")
|
||||
async def list_data_files(platform: Optional[str] = None, file_type: Optional[str] = None):
|
||||
"""获取数据文件列表"""
|
||||
"""Get data file list"""
|
||||
if not DATA_DIR.exists():
|
||||
return {"files": []}
|
||||
|
||||
@@ -74,13 +74,13 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
|
||||
if file_path.suffix.lower() not in supported_extensions:
|
||||
continue
|
||||
|
||||
# 平台过滤
|
||||
# Platform filter
|
||||
if platform:
|
||||
rel_path = str(file_path.relative_to(DATA_DIR))
|
||||
if platform.lower() not in rel_path.lower():
|
||||
continue
|
||||
|
||||
# 类型过滤
|
||||
# Type filter
|
||||
if file_type and file_path.suffix[1:].lower() != file_type.lower():
|
||||
continue
|
||||
|
||||
@@ -89,7 +89,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# 按修改时间排序(最新的在前)
|
||||
# Sort by modification time (newest first)
|
||||
files.sort(key=lambda x: x["modified_at"], reverse=True)
|
||||
|
||||
return {"files": files}
|
||||
@@ -97,7 +97,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
|
||||
|
||||
@router.get("/files/{file_path:path}")
|
||||
async def get_file_content(file_path: str, preview: bool = True, limit: int = 100):
|
||||
"""获取文件内容或预览"""
|
||||
"""Get file content or preview"""
|
||||
full_path = DATA_DIR / file_path
|
||||
|
||||
if not full_path.exists():
|
||||
@@ -106,14 +106,14 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
|
||||
if not full_path.is_file():
|
||||
raise HTTPException(status_code=400, detail="Not a file")
|
||||
|
||||
# 安全检查:确保在 DATA_DIR 内
|
||||
# Security check: ensure within DATA_DIR
|
||||
try:
|
||||
full_path.resolve().relative_to(DATA_DIR.resolve())
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=403, detail="Access denied")
|
||||
|
||||
if preview:
|
||||
# 返回预览数据
|
||||
# Return preview data
|
||||
try:
|
||||
if full_path.suffix == ".json":
|
||||
with open(full_path, "r", encoding="utf-8") as f:
|
||||
@@ -130,18 +130,18 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
|
||||
if i >= limit:
|
||||
break
|
||||
rows.append(row)
|
||||
# 重新读取获取总数
|
||||
# Re-read to get total count
|
||||
f.seek(0)
|
||||
total = sum(1 for _ in f) - 1
|
||||
return {"data": rows, "total": total}
|
||||
elif full_path.suffix.lower() in (".xlsx", ".xls"):
|
||||
import pandas as pd
|
||||
# 读取前 limit 行
|
||||
# Read first limit rows
|
||||
df = pd.read_excel(full_path, nrows=limit)
|
||||
# 获取总行数(只读取第一列来节省内存)
|
||||
# Get total row count (only read first column to save memory)
|
||||
df_count = pd.read_excel(full_path, usecols=[0])
|
||||
total = len(df_count)
|
||||
# 转换为字典列表,处理 NaN 值
|
||||
# Convert to list of dictionaries, handle NaN values
|
||||
rows = df.where(pd.notnull(df), None).to_dict(orient='records')
|
||||
return {
|
||||
"data": rows,
|
||||
@@ -155,7 +155,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
else:
|
||||
# 返回文件下载
|
||||
# Return file download
|
||||
return FileResponse(
|
||||
path=full_path,
|
||||
filename=full_path.name,
|
||||
@@ -165,7 +165,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
|
||||
|
||||
@router.get("/download/{file_path:path}")
|
||||
async def download_file(file_path: str):
|
||||
"""下载文件"""
|
||||
"""Download file"""
|
||||
full_path = DATA_DIR / file_path
|
||||
|
||||
if not full_path.exists():
|
||||
@@ -174,7 +174,7 @@ async def download_file(file_path: str):
|
||||
if not full_path.is_file():
|
||||
raise HTTPException(status_code=400, detail="Not a file")
|
||||
|
||||
# 安全检查
|
||||
# Security check
|
||||
try:
|
||||
full_path.resolve().relative_to(DATA_DIR.resolve())
|
||||
except ValueError:
|
||||
@@ -189,7 +189,7 @@ async def download_file(file_path: str):
|
||||
|
||||
@router.get("/stats")
|
||||
async def get_data_stats():
|
||||
"""获取数据统计"""
|
||||
"""Get data statistics"""
|
||||
if not DATA_DIR.exists():
|
||||
return {"total_files": 0, "total_size": 0, "by_platform": {}, "by_type": {}}
|
||||
|
||||
@@ -214,11 +214,11 @@ async def get_data_stats():
|
||||
stats["total_files"] += 1
|
||||
stats["total_size"] += stat.st_size
|
||||
|
||||
# 按类型统计
|
||||
# Statistics by type
|
||||
file_type = file_path.suffix[1:].lower()
|
||||
stats["by_type"][file_type] = stats["by_type"].get(file_type, 0) + 1
|
||||
|
||||
# 按平台统计(从路径推断)
|
||||
# Statistics by platform (inferred from path)
|
||||
rel_path = str(file_path.relative_to(DATA_DIR))
|
||||
for platform in ["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"]:
|
||||
if platform in rel_path.lower():
|
||||
|
||||
@@ -27,7 +27,7 @@ router = APIRouter(tags=["websocket"])
|
||||
|
||||
|
||||
class ConnectionManager:
|
||||
"""WebSocket 连接管理器"""
|
||||
"""WebSocket connection manager"""
|
||||
|
||||
def __init__(self):
|
||||
self.active_connections: Set[WebSocket] = set()
|
||||
@@ -40,7 +40,7 @@ class ConnectionManager:
|
||||
self.active_connections.discard(websocket)
|
||||
|
||||
async def broadcast(self, message: dict):
|
||||
"""广播消息到所有连接"""
|
||||
"""Broadcast message to all connections"""
|
||||
if not self.active_connections:
|
||||
return
|
||||
|
||||
@@ -51,7 +51,7 @@ class ConnectionManager:
|
||||
except Exception:
|
||||
disconnected.append(connection)
|
||||
|
||||
# 清理断开的连接
|
||||
# Clean up disconnected connections
|
||||
for conn in disconnected:
|
||||
self.disconnect(conn)
|
||||
|
||||
@@ -60,13 +60,13 @@ manager = ConnectionManager()
|
||||
|
||||
|
||||
async def log_broadcaster():
|
||||
"""后台任务:从队列读取日志并广播"""
|
||||
"""Background task: read logs from queue and broadcast"""
|
||||
queue = crawler_manager.get_log_queue()
|
||||
while True:
|
||||
try:
|
||||
# 从队列获取日志条目
|
||||
# Get log entry from queue
|
||||
entry = await queue.get()
|
||||
# 广播到所有 WebSocket 连接
|
||||
# Broadcast to all WebSocket connections
|
||||
await manager.broadcast(entry.model_dump())
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
@@ -75,12 +75,12 @@ async def log_broadcaster():
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
|
||||
# 全局广播任务
|
||||
# Global broadcast task
|
||||
_broadcaster_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
def start_broadcaster():
|
||||
"""启动广播任务"""
|
||||
"""Start broadcast task"""
|
||||
global _broadcaster_task
|
||||
if _broadcaster_task is None or _broadcaster_task.done():
|
||||
_broadcaster_task = asyncio.create_task(log_broadcaster())
|
||||
@@ -88,17 +88,17 @@ def start_broadcaster():
|
||||
|
||||
@router.websocket("/ws/logs")
|
||||
async def websocket_logs(websocket: WebSocket):
|
||||
"""WebSocket 日志流"""
|
||||
"""WebSocket log stream"""
|
||||
print("[WS] New connection attempt")
|
||||
|
||||
try:
|
||||
# 确保广播任务在运行
|
||||
# Ensure broadcast task is running
|
||||
start_broadcaster()
|
||||
|
||||
await manager.connect(websocket)
|
||||
print(f"[WS] Connected, active connections: {len(manager.active_connections)}")
|
||||
|
||||
# 发送现有日志
|
||||
# Send existing logs
|
||||
for log in crawler_manager.logs:
|
||||
try:
|
||||
await websocket.send_json(log.model_dump())
|
||||
@@ -109,7 +109,7 @@ async def websocket_logs(websocket: WebSocket):
|
||||
print(f"[WS] Sent {len(crawler_manager.logs)} existing logs, entering main loop")
|
||||
|
||||
while True:
|
||||
# 保持连接活跃,接收心跳或任意消息
|
||||
# Keep connection alive, receive heartbeat or any message
|
||||
try:
|
||||
data = await asyncio.wait_for(
|
||||
websocket.receive_text(),
|
||||
@@ -118,7 +118,7 @@ async def websocket_logs(websocket: WebSocket):
|
||||
if data == "ping":
|
||||
await websocket.send_text("pong")
|
||||
except asyncio.TimeoutError:
|
||||
# 发送 ping 保持连接
|
||||
# Send ping to keep connection alive
|
||||
try:
|
||||
await websocket.send_text("ping")
|
||||
except Exception as e:
|
||||
@@ -136,12 +136,12 @@ async def websocket_logs(websocket: WebSocket):
|
||||
|
||||
@router.websocket("/ws/status")
|
||||
async def websocket_status(websocket: WebSocket):
|
||||
"""WebSocket 状态流"""
|
||||
"""WebSocket status stream"""
|
||||
await websocket.accept()
|
||||
|
||||
try:
|
||||
while True:
|
||||
# 每秒发送一次状态
|
||||
# Send status every second
|
||||
status = crawler_manager.get_status()
|
||||
await websocket.send_json(status)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
@@ -22,7 +22,7 @@ from pydantic import BaseModel
|
||||
|
||||
|
||||
class PlatformEnum(str, Enum):
|
||||
"""支持的媒体平台"""
|
||||
"""Supported media platforms"""
|
||||
XHS = "xhs"
|
||||
DOUYIN = "dy"
|
||||
KUAISHOU = "ks"
|
||||
@@ -33,21 +33,21 @@ class PlatformEnum(str, Enum):
|
||||
|
||||
|
||||
class LoginTypeEnum(str, Enum):
|
||||
"""登录方式"""
|
||||
"""Login method"""
|
||||
QRCODE = "qrcode"
|
||||
PHONE = "phone"
|
||||
COOKIE = "cookie"
|
||||
|
||||
|
||||
class CrawlerTypeEnum(str, Enum):
|
||||
"""爬虫类型"""
|
||||
"""Crawler type"""
|
||||
SEARCH = "search"
|
||||
DETAIL = "detail"
|
||||
CREATOR = "creator"
|
||||
|
||||
|
||||
class SaveDataOptionEnum(str, Enum):
|
||||
"""数据保存方式"""
|
||||
"""Data save option"""
|
||||
CSV = "csv"
|
||||
DB = "db"
|
||||
JSON = "json"
|
||||
@@ -57,13 +57,13 @@ class SaveDataOptionEnum(str, Enum):
|
||||
|
||||
|
||||
class CrawlerStartRequest(BaseModel):
|
||||
"""启动爬虫请求"""
|
||||
"""Crawler start request"""
|
||||
platform: PlatformEnum
|
||||
login_type: LoginTypeEnum = LoginTypeEnum.QRCODE
|
||||
crawler_type: CrawlerTypeEnum = CrawlerTypeEnum.SEARCH
|
||||
keywords: str = "" # 搜索模式下的关键词
|
||||
specified_ids: str = "" # 详情模式下的帖子/视频ID列表,逗号分隔
|
||||
creator_ids: str = "" # 创作者模式下的创作者ID列表,逗号分隔
|
||||
keywords: str = "" # Keywords for search mode
|
||||
specified_ids: str = "" # Post/video ID list for detail mode, comma-separated
|
||||
creator_ids: str = "" # Creator ID list for creator mode, comma-separated
|
||||
start_page: int = 1
|
||||
enable_comments: bool = True
|
||||
enable_sub_comments: bool = False
|
||||
@@ -73,7 +73,7 @@ class CrawlerStartRequest(BaseModel):
|
||||
|
||||
|
||||
class CrawlerStatusResponse(BaseModel):
|
||||
"""爬虫状态响应"""
|
||||
"""Crawler status response"""
|
||||
status: Literal["idle", "running", "stopping", "error"]
|
||||
platform: Optional[str] = None
|
||||
crawler_type: Optional[str] = None
|
||||
@@ -82,7 +82,7 @@ class CrawlerStatusResponse(BaseModel):
|
||||
|
||||
|
||||
class LogEntry(BaseModel):
|
||||
"""日志条目"""
|
||||
"""Log entry"""
|
||||
id: int
|
||||
timestamp: str
|
||||
level: Literal["info", "warning", "error", "success", "debug"]
|
||||
@@ -90,7 +90,7 @@ class LogEntry(BaseModel):
|
||||
|
||||
|
||||
class DataFileInfo(BaseModel):
|
||||
"""数据文件信息"""
|
||||
"""Data file information"""
|
||||
name: str
|
||||
path: str
|
||||
size: int
|
||||
|
||||
@@ -28,7 +28,7 @@ from ..schemas import CrawlerStartRequest, LogEntry
|
||||
|
||||
|
||||
class CrawlerManager:
|
||||
"""爬虫进程管理器"""
|
||||
"""Crawler process manager"""
|
||||
|
||||
def __init__(self):
|
||||
self._lock = asyncio.Lock()
|
||||
@@ -39,9 +39,9 @@ class CrawlerManager:
|
||||
self._log_id = 0
|
||||
self._logs: List[LogEntry] = []
|
||||
self._read_task: Optional[asyncio.Task] = None
|
||||
# 项目根目录
|
||||
# Project root directory
|
||||
self._project_root = Path(__file__).parent.parent.parent
|
||||
# 日志队列 - 用于向 WebSocket 推送
|
||||
# Log queue - for pushing to WebSocket
|
||||
self._log_queue: Optional[asyncio.Queue] = None
|
||||
|
||||
@property
|
||||
@@ -49,13 +49,13 @@ class CrawlerManager:
|
||||
return self._logs
|
||||
|
||||
def get_log_queue(self) -> asyncio.Queue:
|
||||
"""获取或创建日志队列"""
|
||||
"""Get or create log queue"""
|
||||
if self._log_queue is None:
|
||||
self._log_queue = asyncio.Queue()
|
||||
return self._log_queue
|
||||
|
||||
def _create_log_entry(self, message: str, level: str = "info") -> LogEntry:
|
||||
"""创建日志条目"""
|
||||
"""Create log entry"""
|
||||
self._log_id += 1
|
||||
entry = LogEntry(
|
||||
id=self._log_id,
|
||||
@@ -64,13 +64,13 @@ class CrawlerManager:
|
||||
message=message
|
||||
)
|
||||
self._logs.append(entry)
|
||||
# 保留最近 500 条日志
|
||||
# Keep last 500 logs
|
||||
if len(self._logs) > 500:
|
||||
self._logs = self._logs[-500:]
|
||||
return entry
|
||||
|
||||
async def _push_log(self, entry: LogEntry):
|
||||
"""推送日志到队列"""
|
||||
"""Push log to queue"""
|
||||
if self._log_queue is not None:
|
||||
try:
|
||||
self._log_queue.put_nowait(entry)
|
||||
@@ -78,7 +78,7 @@ class CrawlerManager:
|
||||
pass
|
||||
|
||||
def _parse_log_level(self, line: str) -> str:
|
||||
"""解析日志级别"""
|
||||
"""Parse log level"""
|
||||
line_upper = line.upper()
|
||||
if "ERROR" in line_upper or "FAILED" in line_upper:
|
||||
return "error"
|
||||
@@ -91,16 +91,16 @@ class CrawlerManager:
|
||||
return "info"
|
||||
|
||||
async def start(self, config: CrawlerStartRequest) -> bool:
|
||||
"""启动爬虫进程"""
|
||||
"""Start crawler process"""
|
||||
async with self._lock:
|
||||
if self.process and self.process.poll() is None:
|
||||
return False
|
||||
|
||||
# 清空旧日志
|
||||
# Clear old logs
|
||||
self._logs = []
|
||||
self._log_id = 0
|
||||
|
||||
# 清空待推送队列(不要替换对象,避免 WebSocket 广播协程持有旧队列引用)
|
||||
# Clear pending queue (don't replace object to avoid WebSocket broadcast coroutine holding old queue reference)
|
||||
if self._log_queue is None:
|
||||
self._log_queue = asyncio.Queue()
|
||||
else:
|
||||
@@ -110,15 +110,15 @@ class CrawlerManager:
|
||||
except asyncio.QueueEmpty:
|
||||
pass
|
||||
|
||||
# 构建命令行参数
|
||||
# Build command line arguments
|
||||
cmd = self._build_command(config)
|
||||
|
||||
# 记录启动日志
|
||||
# Log start information
|
||||
entry = self._create_log_entry(f"Starting crawler: {' '.join(cmd)}", "info")
|
||||
await self._push_log(entry)
|
||||
|
||||
try:
|
||||
# 启动子进程
|
||||
# Start subprocess
|
||||
self.process = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
@@ -139,7 +139,7 @@ class CrawlerManager:
|
||||
)
|
||||
await self._push_log(entry)
|
||||
|
||||
# 启动日志读取任务
|
||||
# Start log reading task
|
||||
self._read_task = asyncio.create_task(self._read_output())
|
||||
|
||||
return True
|
||||
@@ -150,7 +150,7 @@ class CrawlerManager:
|
||||
return False
|
||||
|
||||
async def stop(self) -> bool:
|
||||
"""停止爬虫进程"""
|
||||
"""Stop crawler process"""
|
||||
async with self._lock:
|
||||
if not self.process or self.process.poll() is not None:
|
||||
return False
|
||||
@@ -162,13 +162,13 @@ class CrawlerManager:
|
||||
try:
|
||||
self.process.send_signal(signal.SIGTERM)
|
||||
|
||||
# 等待优雅退出 (最多15秒)
|
||||
# Wait for graceful exit (up to 15 seconds)
|
||||
for _ in range(30):
|
||||
if self.process.poll() is not None:
|
||||
break
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# 如果还没退出,强制杀死
|
||||
# If still not exited, force kill
|
||||
if self.process.poll() is None:
|
||||
entry = self._create_log_entry("Process not responding, sending SIGKILL...", "warning")
|
||||
await self._push_log(entry)
|
||||
@@ -184,7 +184,7 @@ class CrawlerManager:
|
||||
self.status = "idle"
|
||||
self.current_config = None
|
||||
|
||||
# 取消日志读取任务
|
||||
# Cancel log reading task
|
||||
if self._read_task:
|
||||
self._read_task.cancel()
|
||||
self._read_task = None
|
||||
@@ -192,7 +192,7 @@ class CrawlerManager:
|
||||
return True
|
||||
|
||||
def get_status(self) -> dict:
|
||||
"""获取当前状态"""
|
||||
"""Get current status"""
|
||||
return {
|
||||
"status": self.status,
|
||||
"platform": self.current_config.platform.value if self.current_config else None,
|
||||
@@ -202,7 +202,7 @@ class CrawlerManager:
|
||||
}
|
||||
|
||||
def _build_command(self, config: CrawlerStartRequest) -> list:
|
||||
"""构建 main.py 命令行参数"""
|
||||
"""Build main.py command line arguments"""
|
||||
cmd = ["uv", "run", "python", "main.py"]
|
||||
|
||||
cmd.extend(["--platform", config.platform.value])
|
||||
@@ -210,7 +210,7 @@ class CrawlerManager:
|
||||
cmd.extend(["--type", config.crawler_type.value])
|
||||
cmd.extend(["--save_data_option", config.save_option.value])
|
||||
|
||||
# 根据爬虫类型传递不同的参数
|
||||
# Pass different arguments based on crawler type
|
||||
if config.crawler_type.value == "search" and config.keywords:
|
||||
cmd.extend(["--keywords", config.keywords])
|
||||
elif config.crawler_type.value == "detail" and config.specified_ids:
|
||||
@@ -232,12 +232,12 @@ class CrawlerManager:
|
||||
return cmd
|
||||
|
||||
async def _read_output(self):
|
||||
"""异步读取进程输出"""
|
||||
"""Asynchronously read process output"""
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
try:
|
||||
while self.process and self.process.poll() is None:
|
||||
# 在线程池中读取一行
|
||||
# Read a line in thread pool
|
||||
line = await loop.run_in_executor(
|
||||
None, self.process.stdout.readline
|
||||
)
|
||||
@@ -248,7 +248,7 @@ class CrawlerManager:
|
||||
entry = self._create_log_entry(line, level)
|
||||
await self._push_log(entry)
|
||||
|
||||
# 读取剩余输出
|
||||
# Read remaining output
|
||||
if self.process and self.process.stdout:
|
||||
remaining = await loop.run_in_executor(
|
||||
None, self.process.stdout.read
|
||||
@@ -260,7 +260,7 @@ class CrawlerManager:
|
||||
entry = self._create_log_entry(line.strip(), level)
|
||||
await self._push_log(entry)
|
||||
|
||||
# 进程结束
|
||||
# Process ended
|
||||
if self.status == "running":
|
||||
exit_code = self.process.returncode if self.process else -1
|
||||
if exit_code == 0:
|
||||
@@ -277,5 +277,5 @@ class CrawlerManager:
|
||||
await self._push_log(entry)
|
||||
|
||||
|
||||
# 全局单例
|
||||
# Global singleton
|
||||
crawler_manager = CrawlerManager()
|
||||
|
||||
Reference in New Issue
Block a user