Mirror of https://github.com/NanmiCoder/MediaCrawler.git, synced 2026-05-09 03:57:43 +08:00
i18n: translate all Chinese comments, docstrings, and logger messages to English
Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: the Chinese disclaimer header (lines 10-18) for legal compliance.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -26,10 +26,10 @@ router = APIRouter(prefix="/crawler", tags=["crawler"])

 @router.post("/start")
 async def start_crawler(request: CrawlerStartRequest):
-    """启动爬虫任务"""
+    """Start crawler task"""
     success = await crawler_manager.start(request)
     if not success:
-        # 处理并发/重复请求:如果进程已经在跑,返回 400 而不是 500
+        # Handle concurrent/duplicate requests: if process is already running, return 400 instead of 500
         if crawler_manager.process and crawler_manager.process.poll() is None:
             raise HTTPException(status_code=400, detail="Crawler is already running")
         raise HTTPException(status_code=500, detail="Failed to start crawler")
@@ -39,10 +39,10 @@ async def start_crawler(request: CrawlerStartRequest):

 @router.post("/stop")
 async def stop_crawler():
-    """停止爬虫任务"""
+    """Stop crawler task"""
     success = await crawler_manager.stop()
     if not success:
-        # 处理并发/重复请求:如果进程已退出/不存在,返回 400 而不是 500
+        # Handle concurrent/duplicate requests: if process already exited/doesn't exist, return 400 instead of 500
         if not crawler_manager.process or crawler_manager.process.poll() is not None:
             raise HTTPException(status_code=400, detail="No crawler is running")
         raise HTTPException(status_code=500, detail="Failed to stop crawler")
@@ -52,12 +52,12 @@ async def stop_crawler():

 @router.get("/status", response_model=CrawlerStatusResponse)
 async def get_crawler_status():
-    """获取爬虫状态"""
+    """Get crawler status"""
     return crawler_manager.get_status()


 @router.get("/logs")
 async def get_logs(limit: int = 100):
-    """获取最近的日志"""
+    """Get recent logs"""
     logs = crawler_manager.logs[-limit:] if limit > 0 else crawler_manager.logs
     return {"logs": [log.model_dump() for log in logs]}
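A minimal client sketch for the three endpoints above, assuming the API serves on http://127.0.0.1:8000 under the /crawler prefix; the start-request body fields are hypothetical placeholders, so check CrawlerStartRequest for the real schema:

```python
import requests

BASE = "http://127.0.0.1:8000"  # assumed host/port; adjust to your deployment

# Body fields are illustrative only -- CrawlerStartRequest defines the real ones
resp = requests.post(f"{BASE}/crawler/start", json={"platform": "xhs", "keywords": "coffee"})
if resp.status_code == 400:
    # Duplicate start: the process is already running (see the hunk above)
    print("already running:", resp.json()["detail"])

print(requests.get(f"{BASE}/crawler/status").json())
print(requests.get(f"{BASE}/crawler/logs", params={"limit": 10}).json())
requests.post(f"{BASE}/crawler/stop")
```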
@@ -26,16 +26,16 @@ from fastapi.responses import FileResponse

 router = APIRouter(prefix="/data", tags=["data"])

-# 数据目录
+# Data directory
 DATA_DIR = Path(__file__).parent.parent.parent / "data"


 def get_file_info(file_path: Path) -> dict:
-    """获取文件信息"""
+    """Get file information"""
     stat = file_path.stat()
     record_count = None

-    # 尝试获取记录数
+    # Try to get record count
     try:
         if file_path.suffix == ".json":
             with open(file_path, "r", encoding="utf-8") as f:
@@ -44,7 +44,7 @@ def get_file_info(file_path: Path) -> dict:
                 record_count = len(data)
         elif file_path.suffix == ".csv":
             with open(file_path, "r", encoding="utf-8") as f:
-                record_count = sum(1 for _ in f) - 1  # 减去标题行
+                record_count = sum(1 for _ in f) - 1  # Subtract header row
     except Exception:
         pass

@@ -60,7 +60,7 @@ def get_file_info(file_path: Path) -> dict:

 @router.get("/files")
 async def list_data_files(platform: Optional[str] = None, file_type: Optional[str] = None):
-    """获取数据文件列表"""
+    """Get data file list"""
     if not DATA_DIR.exists():
         return {"files": []}

@@ -74,13 +74,13 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
         if file_path.suffix.lower() not in supported_extensions:
             continue

-        # 平台过滤
+        # Platform filter
         if platform:
             rel_path = str(file_path.relative_to(DATA_DIR))
             if platform.lower() not in rel_path.lower():
                 continue

-        # 类型过滤
+        # Type filter
         if file_type and file_path.suffix[1:].lower() != file_type.lower():
             continue
@@ -89,7 +89,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
         except Exception:
             continue

-    # 按修改时间排序(最新的在前)
+    # Sort by modification time (newest first)
     files.sort(key=lambda x: x["modified_at"], reverse=True)

     return {"files": files}
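The platform filter above is a substring match on the path relative to DATA_DIR, and the type filter compares the bare extension. A sketch of querying this listing, assuming the same local server; only record_count and modified_at are confirmed by the diff, any other response keys are assumptions:

```python
import requests

BASE = "http://127.0.0.1:8000"  # assumed host/port

# Ask for JSON files whose path mentions "xhs" (Xiaohongshu)
files = requests.get(
    f"{BASE}/data/files",
    params={"platform": "xhs", "file_type": "json"},
).json()["files"]

# Entries arrive sorted by modification time, newest first
for info in files[:5]:
    print(info.get("modified_at"), info.get("record_count"))
```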
@@ -97,7 +97,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st

 @router.get("/files/{file_path:path}")
 async def get_file_content(file_path: str, preview: bool = True, limit: int = 100):
-    """获取文件内容或预览"""
+    """Get file content or preview"""
     full_path = DATA_DIR / file_path

     if not full_path.exists():
@@ -106,14 +106,14 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
     if not full_path.is_file():
         raise HTTPException(status_code=400, detail="Not a file")

-    # 安全检查:确保在 DATA_DIR 内
+    # Security check: ensure within DATA_DIR
     try:
         full_path.resolve().relative_to(DATA_DIR.resolve())
     except ValueError:
         raise HTTPException(status_code=403, detail="Access denied")

     if preview:
-        # 返回预览数据
+        # Return preview data
         try:
             if full_path.suffix == ".json":
                 with open(full_path, "r", encoding="utf-8") as f:
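The resolve()/relative_to() pair is the path-traversal guard: resolve() canonicalizes ".." segments and symlinks, and relative_to() raises ValueError when the result lands outside the data root. A standalone sketch of the same check (the DATA_DIR value here is a hypothetical stand-in):

```python
from pathlib import Path

DATA_DIR = Path("/srv/mediacrawler/data")  # hypothetical data root

def is_inside_data_dir(requested: str) -> bool:
    """Return True only if the resolved path stays inside DATA_DIR."""
    full_path = DATA_DIR / requested
    try:
        # resolve() collapses ".." and follows symlinks;
        # relative_to() raises ValueError if the path escaped
        full_path.resolve().relative_to(DATA_DIR.resolve())
        return True
    except ValueError:
        return False

assert is_inside_data_dir("xhs/notes.json")
assert not is_inside_data_dir("../../etc/passwd")
```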
@@ -130,18 +130,18 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
                         if i >= limit:
                             break
                         rows.append(row)
-                    # 重新读取获取总数
+                    # Re-read to get total count
                     f.seek(0)
                     total = sum(1 for _ in f) - 1
                     return {"data": rows, "total": total}
             elif full_path.suffix.lower() in (".xlsx", ".xls"):
                 import pandas as pd
-                # 读取前 limit 行
+                # Read first limit rows
                 df = pd.read_excel(full_path, nrows=limit)
-                # 获取总行数(只读取第一列来节省内存)
+                # Get total row count (only read first column to save memory)
                 df_count = pd.read_excel(full_path, usecols=[0])
                 total = len(df_count)
-                # 转换为字典列表,处理 NaN 值
+                # Convert to list of dictionaries, handle NaN values
                 rows = df.where(pd.notnull(df), None).to_dict(orient='records')
                 return {
                     "data": rows,
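Two details in the Excel branch are worth spelling out: reading usecols=[0] obtains a row count without loading every column, and df.where(pd.notnull(df), None) swaps NaN for None because NaN is not valid JSON. A small self-contained illustration of the NaN handling (the frame is made up):

```python
import pandas as pd

# Stand-in for a sheet with a missing cell
df = pd.DataFrame({"title": ["a", None], "likes": [10, 3]})

# NaN would break JSON serialization; None serializes as null
rows = df.where(pd.notnull(df), None).to_dict(orient="records")
print(rows)  # [{'title': 'a', 'likes': 10}, {'title': None, 'likes': 3}]
```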
@@ -155,7 +155,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
         except Exception as e:
             raise HTTPException(status_code=500, detail=str(e))
     else:
-        # 返回文件下载
+        # Return file download
         return FileResponse(
             path=full_path,
             filename=full_path.name,
@@ -165,7 +165,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10

 @router.get("/download/{file_path:path}")
 async def download_file(file_path: str):
-    """下载文件"""
+    """Download file"""
     full_path = DATA_DIR / file_path

     if not full_path.exists():
@@ -174,7 +174,7 @@ async def download_file(file_path: str):
     if not full_path.is_file():
         raise HTTPException(status_code=400, detail="Not a file")

-    # 安全检查
+    # Security check
     try:
         full_path.resolve().relative_to(DATA_DIR.resolve())
     except ValueError:
@@ -189,7 +189,7 @@ async def download_file(file_path: str):

 @router.get("/stats")
 async def get_data_stats():
-    """获取数据统计"""
+    """Get data statistics"""
     if not DATA_DIR.exists():
         return {"total_files": 0, "total_size": 0, "by_platform": {}, "by_type": {}}

@@ -214,11 +214,11 @@ async def get_data_stats():
         stats["total_files"] += 1
         stats["total_size"] += stat.st_size

-        # 按类型统计
+        # Statistics by type
         file_type = file_path.suffix[1:].lower()
         stats["by_type"][file_type] = stats["by_type"].get(file_type, 0) + 1

-        # 按平台统计(从路径推断)
+        # Statistics by platform (inferred from path)
         rel_path = str(file_path.relative_to(DATA_DIR))
         for platform in ["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"]:
             if platform in rel_path.lower():
@@ -27,7 +27,7 @@ router = APIRouter(tags=["websocket"])


 class ConnectionManager:
-    """WebSocket 连接管理器"""
+    """WebSocket connection manager"""

     def __init__(self):
         self.active_connections: Set[WebSocket] = set()
@@ -40,7 +40,7 @@ class ConnectionManager:
         self.active_connections.discard(websocket)

     async def broadcast(self, message: dict):
-        """广播消息到所有连接"""
+        """Broadcast message to all connections"""
         if not self.active_connections:
             return

@@ -51,7 +51,7 @@ class ConnectionManager:
             except Exception:
                 disconnected.append(connection)

-        # 清理断开的连接
+        # Clean up disconnected connections
         for conn in disconnected:
             self.disconnect(conn)

@@ -60,13 +60,13 @@ manager = ConnectionManager()


 async def log_broadcaster():
-    """后台任务:从队列读取日志并广播"""
+    """Background task: read logs from queue and broadcast"""
     queue = crawler_manager.get_log_queue()
     while True:
         try:
-            # 从队列获取日志条目
+            # Get log entry from queue
             entry = await queue.get()
-            # 广播到所有 WebSocket 连接
+            # Broadcast to all WebSocket connections
             await manager.broadcast(entry.model_dump())
         except asyncio.CancelledError:
             break
@@ -75,12 +75,12 @@ async def log_broadcaster():
             await asyncio.sleep(0.1)


-# 全局广播任务
+# Global broadcast task
 _broadcaster_task: Optional[asyncio.Task] = None


 def start_broadcaster():
-    """启动广播任务"""
+    """Start broadcast task"""
     global _broadcaster_task
     if _broadcaster_task is None or _broadcaster_task.done():
         _broadcaster_task = asyncio.create_task(log_broadcaster())
@@ -88,17 +88,17 @@ def start_broadcaster():

 @router.websocket("/ws/logs")
 async def websocket_logs(websocket: WebSocket):
-    """WebSocket 日志流"""
+    """WebSocket log stream"""
     print("[WS] New connection attempt")

     try:
-        # 确保广播任务在运行
+        # Ensure broadcast task is running
         start_broadcaster()

         await manager.connect(websocket)
         print(f"[WS] Connected, active connections: {len(manager.active_connections)}")

-        # 发送现有日志
+        # Send existing logs
         for log in crawler_manager.logs:
             try:
                 await websocket.send_json(log.model_dump())
@@ -109,7 +109,7 @@ async def websocket_logs(websocket: WebSocket):
         print(f"[WS] Sent {len(crawler_manager.logs)} existing logs, entering main loop")

         while True:
-            # 保持连接活跃,接收心跳或任意消息
+            # Keep connection alive, receive heartbeat or any message
             try:
                 data = await asyncio.wait_for(
                     websocket.receive_text(),
@@ -118,7 +118,7 @@ async def websocket_logs(websocket: WebSocket):
                 if data == "ping":
                     await websocket.send_text("pong")
             except asyncio.TimeoutError:
-                # 发送 ping 保持连接
+                # Send ping to keep connection alive
                 try:
                     await websocket.send_text("ping")
                 except Exception as e:
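The loop above implements a two-way heartbeat: the server answers a client's "ping" with "pong", and after a receive timeout it sends its own "ping" to verify the socket is still alive. A client-side sketch that tails /ws/logs, assuming the third-party websockets package and a local server; the heartbeat frames can simply be skipped, since the server does not disconnect on a silent client:

```python
import asyncio
import json
import websockets  # pip install websockets

async def tail_logs():
    # assumed host/port; the /ws/logs path comes from the diff above
    async with websockets.connect("ws://127.0.0.1:8000/ws/logs") as ws:
        while True:
            msg = await ws.recv()
            if msg in ("ping", "pong"):
                continue  # server keepalive frames; safe to ignore
            print(json.loads(msg))  # one log entry per message

asyncio.run(tail_logs())
```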
@@ -136,12 +136,12 @@ async def websocket_logs(websocket: WebSocket):

 @router.websocket("/ws/status")
 async def websocket_status(websocket: WebSocket):
-    """WebSocket 状态流"""
+    """WebSocket status stream"""
     await websocket.accept()

     try:
         while True:
-            # 每秒发送一次状态
+            # Send status every second
             status = crawler_manager.get_status()
             await websocket.send_json(status)
             await asyncio.sleep(1)