mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-09 11:27:26 +08:00
i18n: translate all Chinese comments, docstrings, and logger messages to English
Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: Chinese disclaimer header (lines 10-18) for legal compliance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
84
api/main.py
84
api/main.py
@@ -18,8 +18,8 @@
|
||||
|
||||
"""
|
||||
MediaCrawler WebUI API Server
|
||||
启动命令: uvicorn api.main:app --port 8080 --reload
|
||||
或者: python -m api.main
|
||||
Start command: uvicorn api.main:app --port 8080 --reload
|
||||
Or: python -m api.main
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
@@ -38,15 +38,15 @@ app = FastAPI(
|
||||
version="1.0.0"
|
||||
)
|
||||
|
||||
# 获取 webui 静态文件目录
|
||||
# Get webui static files directory
|
||||
WEBUI_DIR = os.path.join(os.path.dirname(__file__), "webui")
|
||||
|
||||
# CORS 配置 - 允许前端开发服务器访问
|
||||
# CORS configuration - allow frontend dev server access
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=[
|
||||
"http://localhost:5173", # Vite dev server
|
||||
"http://localhost:3000", # 备用端口
|
||||
"http://localhost:3000", # Backup port
|
||||
"http://127.0.0.1:5173",
|
||||
"http://127.0.0.1:3000",
|
||||
],
|
||||
@@ -55,7 +55,7 @@ app.add_middleware(
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# 注册路由
|
||||
# Register routers
|
||||
app.include_router(crawler_router, prefix="/api")
|
||||
app.include_router(data_router, prefix="/api")
|
||||
app.include_router(websocket_router, prefix="/api")
|
||||
@@ -63,7 +63,7 @@ app.include_router(websocket_router, prefix="/api")
|
||||
|
||||
@app.get("/")
|
||||
async def serve_frontend():
|
||||
"""返回前端页面"""
|
||||
"""Return frontend page"""
|
||||
index_path = os.path.join(WEBUI_DIR, "index.html")
|
||||
if os.path.exists(index_path):
|
||||
return FileResponse(index_path)
|
||||
@@ -82,103 +82,103 @@ async def health_check():
|
||||
|
||||
@app.get("/api/env/check")
|
||||
async def check_environment():
|
||||
"""检测 MediaCrawler 环境是否配置正确"""
|
||||
"""Check if MediaCrawler environment is configured correctly"""
|
||||
try:
|
||||
# 运行 uv run main.py --help 命令检测环境
|
||||
# Run uv run main.py --help command to check environment
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
"uv", "run", "main.py", "--help",
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
cwd="." # 项目根目录
|
||||
cwd="." # Project root directory
|
||||
)
|
||||
stdout, stderr = await asyncio.wait_for(
|
||||
process.communicate(),
|
||||
timeout=30.0 # 30秒超时
|
||||
timeout=30.0 # 30 seconds timeout
|
||||
)
|
||||
|
||||
if process.returncode == 0:
|
||||
return {
|
||||
"success": True,
|
||||
"message": "MediaCrawler 环境配置正确",
|
||||
"output": stdout.decode("utf-8", errors="ignore")[:500] # 截取前500字符
|
||||
"message": "MediaCrawler environment configured correctly",
|
||||
"output": stdout.decode("utf-8", errors="ignore")[:500] # Truncate to first 500 characters
|
||||
}
|
||||
else:
|
||||
error_msg = stderr.decode("utf-8", errors="ignore") or stdout.decode("utf-8", errors="ignore")
|
||||
return {
|
||||
"success": False,
|
||||
"message": "环境检测失败",
|
||||
"message": "Environment check failed",
|
||||
"error": error_msg[:500]
|
||||
}
|
||||
except asyncio.TimeoutError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "环境检测超时",
|
||||
"error": "命令执行超过30秒"
|
||||
"message": "Environment check timeout",
|
||||
"error": "Command execution exceeded 30 seconds"
|
||||
}
|
||||
except FileNotFoundError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "未找到 uv 命令",
|
||||
"error": "请确保已安装 uv 并配置到系统 PATH"
|
||||
"message": "uv command not found",
|
||||
"error": "Please ensure uv is installed and configured in system PATH"
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "环境检测出错",
|
||||
"message": "Environment check error",
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/config/platforms")
|
||||
async def get_platforms():
|
||||
"""获取支持的平台列表"""
|
||||
"""Get list of supported platforms"""
|
||||
return {
|
||||
"platforms": [
|
||||
{"value": "xhs", "label": "小红书", "icon": "book-open"},
|
||||
{"value": "dy", "label": "抖音", "icon": "music"},
|
||||
{"value": "ks", "label": "快手", "icon": "video"},
|
||||
{"value": "bili", "label": "哔哩哔哩", "icon": "tv"},
|
||||
{"value": "wb", "label": "微博", "icon": "message-circle"},
|
||||
{"value": "tieba", "label": "百度贴吧", "icon": "messages-square"},
|
||||
{"value": "zhihu", "label": "知乎", "icon": "help-circle"},
|
||||
{"value": "xhs", "label": "Xiaohongshu", "icon": "book-open"},
|
||||
{"value": "dy", "label": "Douyin", "icon": "music"},
|
||||
{"value": "ks", "label": "Kuaishou", "icon": "video"},
|
||||
{"value": "bili", "label": "Bilibili", "icon": "tv"},
|
||||
{"value": "wb", "label": "Weibo", "icon": "message-circle"},
|
||||
{"value": "tieba", "label": "Baidu Tieba", "icon": "messages-square"},
|
||||
{"value": "zhihu", "label": "Zhihu", "icon": "help-circle"},
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/config/options")
|
||||
async def get_config_options():
|
||||
"""获取所有配置选项"""
|
||||
"""Get all configuration options"""
|
||||
return {
|
||||
"login_types": [
|
||||
{"value": "qrcode", "label": "二维码登录"},
|
||||
{"value": "cookie", "label": "Cookie登录"},
|
||||
{"value": "qrcode", "label": "QR Code Login"},
|
||||
{"value": "cookie", "label": "Cookie Login"},
|
||||
],
|
||||
"crawler_types": [
|
||||
{"value": "search", "label": "搜索模式"},
|
||||
{"value": "detail", "label": "详情模式"},
|
||||
{"value": "creator", "label": "创作者模式"},
|
||||
{"value": "search", "label": "Search Mode"},
|
||||
{"value": "detail", "label": "Detail Mode"},
|
||||
{"value": "creator", "label": "Creator Mode"},
|
||||
],
|
||||
"save_options": [
|
||||
{"value": "json", "label": "JSON 文件"},
|
||||
{"value": "csv", "label": "CSV 文件"},
|
||||
{"value": "excel", "label": "Excel 文件"},
|
||||
{"value": "sqlite", "label": "SQLite 数据库"},
|
||||
{"value": "db", "label": "MySQL 数据库"},
|
||||
{"value": "mongodb", "label": "MongoDB 数据库"},
|
||||
{"value": "json", "label": "JSON File"},
|
||||
{"value": "csv", "label": "CSV File"},
|
||||
{"value": "excel", "label": "Excel File"},
|
||||
{"value": "sqlite", "label": "SQLite Database"},
|
||||
{"value": "db", "label": "MySQL Database"},
|
||||
{"value": "mongodb", "label": "MongoDB Database"},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
# 挂载静态资源 - 必须放在所有路由之后
|
||||
# Mount static resources - must be placed after all routes
|
||||
if os.path.exists(WEBUI_DIR):
|
||||
assets_dir = os.path.join(WEBUI_DIR, "assets")
|
||||
if os.path.exists(assets_dir):
|
||||
app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")
|
||||
# 挂载 logos 目录
|
||||
# Mount logos directory
|
||||
logos_dir = os.path.join(WEBUI_DIR, "logos")
|
||||
if os.path.exists(logos_dir):
|
||||
app.mount("/logos", StaticFiles(directory=logos_dir), name="logos")
|
||||
# 挂载其他静态文件(如 vite.svg)
|
||||
# Mount other static files (e.g., vite.svg)
|
||||
app.mount("/static", StaticFiles(directory=WEBUI_DIR), name="webui-static")
|
||||
|
||||
|
||||
|
||||
@@ -26,10 +26,10 @@ router = APIRouter(prefix="/crawler", tags=["crawler"])
|
||||
|
||||
@router.post("/start")
|
||||
async def start_crawler(request: CrawlerStartRequest):
|
||||
"""启动爬虫任务"""
|
||||
"""Start crawler task"""
|
||||
success = await crawler_manager.start(request)
|
||||
if not success:
|
||||
# 处理并发/重复请求:如果进程已经在跑,返回 400 而不是 500
|
||||
# Handle concurrent/duplicate requests: if process is already running, return 400 instead of 500
|
||||
if crawler_manager.process and crawler_manager.process.poll() is None:
|
||||
raise HTTPException(status_code=400, detail="Crawler is already running")
|
||||
raise HTTPException(status_code=500, detail="Failed to start crawler")
|
||||
@@ -39,10 +39,10 @@ async def start_crawler(request: CrawlerStartRequest):
|
||||
|
||||
@router.post("/stop")
|
||||
async def stop_crawler():
|
||||
"""停止爬虫任务"""
|
||||
"""Stop crawler task"""
|
||||
success = await crawler_manager.stop()
|
||||
if not success:
|
||||
# 处理并发/重复请求:如果进程已退出/不存在,返回 400 而不是 500
|
||||
# Handle concurrent/duplicate requests: if process already exited/doesn't exist, return 400 instead of 500
|
||||
if not crawler_manager.process or crawler_manager.process.poll() is not None:
|
||||
raise HTTPException(status_code=400, detail="No crawler is running")
|
||||
raise HTTPException(status_code=500, detail="Failed to stop crawler")
|
||||
@@ -52,12 +52,12 @@ async def stop_crawler():
|
||||
|
||||
@router.get("/status", response_model=CrawlerStatusResponse)
|
||||
async def get_crawler_status():
|
||||
"""获取爬虫状态"""
|
||||
"""Get crawler status"""
|
||||
return crawler_manager.get_status()
|
||||
|
||||
|
||||
@router.get("/logs")
|
||||
async def get_logs(limit: int = 100):
|
||||
"""获取最近的日志"""
|
||||
"""Get recent logs"""
|
||||
logs = crawler_manager.logs[-limit:] if limit > 0 else crawler_manager.logs
|
||||
return {"logs": [log.model_dump() for log in logs]}
|
||||
|
||||
@@ -26,16 +26,16 @@ from fastapi.responses import FileResponse
|
||||
|
||||
router = APIRouter(prefix="/data", tags=["data"])
|
||||
|
||||
# 数据目录
|
||||
# Data directory
|
||||
DATA_DIR = Path(__file__).parent.parent.parent / "data"
|
||||
|
||||
|
||||
def get_file_info(file_path: Path) -> dict:
|
||||
"""获取文件信息"""
|
||||
"""Get file information"""
|
||||
stat = file_path.stat()
|
||||
record_count = None
|
||||
|
||||
# 尝试获取记录数
|
||||
# Try to get record count
|
||||
try:
|
||||
if file_path.suffix == ".json":
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
@@ -44,7 +44,7 @@ def get_file_info(file_path: Path) -> dict:
|
||||
record_count = len(data)
|
||||
elif file_path.suffix == ".csv":
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
record_count = sum(1 for _ in f) - 1 # 减去标题行
|
||||
record_count = sum(1 for _ in f) - 1 # Subtract header row
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -60,7 +60,7 @@ def get_file_info(file_path: Path) -> dict:
|
||||
|
||||
@router.get("/files")
|
||||
async def list_data_files(platform: Optional[str] = None, file_type: Optional[str] = None):
|
||||
"""获取数据文件列表"""
|
||||
"""Get data file list"""
|
||||
if not DATA_DIR.exists():
|
||||
return {"files": []}
|
||||
|
||||
@@ -74,13 +74,13 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
|
||||
if file_path.suffix.lower() not in supported_extensions:
|
||||
continue
|
||||
|
||||
# 平台过滤
|
||||
# Platform filter
|
||||
if platform:
|
||||
rel_path = str(file_path.relative_to(DATA_DIR))
|
||||
if platform.lower() not in rel_path.lower():
|
||||
continue
|
||||
|
||||
# 类型过滤
|
||||
# Type filter
|
||||
if file_type and file_path.suffix[1:].lower() != file_type.lower():
|
||||
continue
|
||||
|
||||
@@ -89,7 +89,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# 按修改时间排序(最新的在前)
|
||||
# Sort by modification time (newest first)
|
||||
files.sort(key=lambda x: x["modified_at"], reverse=True)
|
||||
|
||||
return {"files": files}
|
||||
@@ -97,7 +97,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
|
||||
|
||||
@router.get("/files/{file_path:path}")
|
||||
async def get_file_content(file_path: str, preview: bool = True, limit: int = 100):
|
||||
"""获取文件内容或预览"""
|
||||
"""Get file content or preview"""
|
||||
full_path = DATA_DIR / file_path
|
||||
|
||||
if not full_path.exists():
|
||||
@@ -106,14 +106,14 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
|
||||
if not full_path.is_file():
|
||||
raise HTTPException(status_code=400, detail="Not a file")
|
||||
|
||||
# 安全检查:确保在 DATA_DIR 内
|
||||
# Security check: ensure within DATA_DIR
|
||||
try:
|
||||
full_path.resolve().relative_to(DATA_DIR.resolve())
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=403, detail="Access denied")
|
||||
|
||||
if preview:
|
||||
# 返回预览数据
|
||||
# Return preview data
|
||||
try:
|
||||
if full_path.suffix == ".json":
|
||||
with open(full_path, "r", encoding="utf-8") as f:
|
||||
@@ -130,18 +130,18 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
|
||||
if i >= limit:
|
||||
break
|
||||
rows.append(row)
|
||||
# 重新读取获取总数
|
||||
# Re-read to get total count
|
||||
f.seek(0)
|
||||
total = sum(1 for _ in f) - 1
|
||||
return {"data": rows, "total": total}
|
||||
elif full_path.suffix.lower() in (".xlsx", ".xls"):
|
||||
import pandas as pd
|
||||
# 读取前 limit 行
|
||||
# Read first limit rows
|
||||
df = pd.read_excel(full_path, nrows=limit)
|
||||
# 获取总行数(只读取第一列来节省内存)
|
||||
# Get total row count (only read first column to save memory)
|
||||
df_count = pd.read_excel(full_path, usecols=[0])
|
||||
total = len(df_count)
|
||||
# 转换为字典列表,处理 NaN 值
|
||||
# Convert to list of dictionaries, handle NaN values
|
||||
rows = df.where(pd.notnull(df), None).to_dict(orient='records')
|
||||
return {
|
||||
"data": rows,
|
||||
@@ -155,7 +155,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
else:
|
||||
# 返回文件下载
|
||||
# Return file download
|
||||
return FileResponse(
|
||||
path=full_path,
|
||||
filename=full_path.name,
|
||||
@@ -165,7 +165,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
|
||||
|
||||
@router.get("/download/{file_path:path}")
|
||||
async def download_file(file_path: str):
|
||||
"""下载文件"""
|
||||
"""Download file"""
|
||||
full_path = DATA_DIR / file_path
|
||||
|
||||
if not full_path.exists():
|
||||
@@ -174,7 +174,7 @@ async def download_file(file_path: str):
|
||||
if not full_path.is_file():
|
||||
raise HTTPException(status_code=400, detail="Not a file")
|
||||
|
||||
# 安全检查
|
||||
# Security check
|
||||
try:
|
||||
full_path.resolve().relative_to(DATA_DIR.resolve())
|
||||
except ValueError:
|
||||
@@ -189,7 +189,7 @@ async def download_file(file_path: str):
|
||||
|
||||
@router.get("/stats")
|
||||
async def get_data_stats():
|
||||
"""获取数据统计"""
|
||||
"""Get data statistics"""
|
||||
if not DATA_DIR.exists():
|
||||
return {"total_files": 0, "total_size": 0, "by_platform": {}, "by_type": {}}
|
||||
|
||||
@@ -214,11 +214,11 @@ async def get_data_stats():
|
||||
stats["total_files"] += 1
|
||||
stats["total_size"] += stat.st_size
|
||||
|
||||
# 按类型统计
|
||||
# Statistics by type
|
||||
file_type = file_path.suffix[1:].lower()
|
||||
stats["by_type"][file_type] = stats["by_type"].get(file_type, 0) + 1
|
||||
|
||||
# 按平台统计(从路径推断)
|
||||
# Statistics by platform (inferred from path)
|
||||
rel_path = str(file_path.relative_to(DATA_DIR))
|
||||
for platform in ["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"]:
|
||||
if platform in rel_path.lower():
|
||||
|
||||
@@ -27,7 +27,7 @@ router = APIRouter(tags=["websocket"])
|
||||
|
||||
|
||||
class ConnectionManager:
|
||||
"""WebSocket 连接管理器"""
|
||||
"""WebSocket connection manager"""
|
||||
|
||||
def __init__(self):
|
||||
self.active_connections: Set[WebSocket] = set()
|
||||
@@ -40,7 +40,7 @@ class ConnectionManager:
|
||||
self.active_connections.discard(websocket)
|
||||
|
||||
async def broadcast(self, message: dict):
|
||||
"""广播消息到所有连接"""
|
||||
"""Broadcast message to all connections"""
|
||||
if not self.active_connections:
|
||||
return
|
||||
|
||||
@@ -51,7 +51,7 @@ class ConnectionManager:
|
||||
except Exception:
|
||||
disconnected.append(connection)
|
||||
|
||||
# 清理断开的连接
|
||||
# Clean up disconnected connections
|
||||
for conn in disconnected:
|
||||
self.disconnect(conn)
|
||||
|
||||
@@ -60,13 +60,13 @@ manager = ConnectionManager()
|
||||
|
||||
|
||||
async def log_broadcaster():
|
||||
"""后台任务:从队列读取日志并广播"""
|
||||
"""Background task: read logs from queue and broadcast"""
|
||||
queue = crawler_manager.get_log_queue()
|
||||
while True:
|
||||
try:
|
||||
# 从队列获取日志条目
|
||||
# Get log entry from queue
|
||||
entry = await queue.get()
|
||||
# 广播到所有 WebSocket 连接
|
||||
# Broadcast to all WebSocket connections
|
||||
await manager.broadcast(entry.model_dump())
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
@@ -75,12 +75,12 @@ async def log_broadcaster():
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
|
||||
# 全局广播任务
|
||||
# Global broadcast task
|
||||
_broadcaster_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
def start_broadcaster():
|
||||
"""启动广播任务"""
|
||||
"""Start broadcast task"""
|
||||
global _broadcaster_task
|
||||
if _broadcaster_task is None or _broadcaster_task.done():
|
||||
_broadcaster_task = asyncio.create_task(log_broadcaster())
|
||||
@@ -88,17 +88,17 @@ def start_broadcaster():
|
||||
|
||||
@router.websocket("/ws/logs")
|
||||
async def websocket_logs(websocket: WebSocket):
|
||||
"""WebSocket 日志流"""
|
||||
"""WebSocket log stream"""
|
||||
print("[WS] New connection attempt")
|
||||
|
||||
try:
|
||||
# 确保广播任务在运行
|
||||
# Ensure broadcast task is running
|
||||
start_broadcaster()
|
||||
|
||||
await manager.connect(websocket)
|
||||
print(f"[WS] Connected, active connections: {len(manager.active_connections)}")
|
||||
|
||||
# 发送现有日志
|
||||
# Send existing logs
|
||||
for log in crawler_manager.logs:
|
||||
try:
|
||||
await websocket.send_json(log.model_dump())
|
||||
@@ -109,7 +109,7 @@ async def websocket_logs(websocket: WebSocket):
|
||||
print(f"[WS] Sent {len(crawler_manager.logs)} existing logs, entering main loop")
|
||||
|
||||
while True:
|
||||
# 保持连接活跃,接收心跳或任意消息
|
||||
# Keep connection alive, receive heartbeat or any message
|
||||
try:
|
||||
data = await asyncio.wait_for(
|
||||
websocket.receive_text(),
|
||||
@@ -118,7 +118,7 @@ async def websocket_logs(websocket: WebSocket):
|
||||
if data == "ping":
|
||||
await websocket.send_text("pong")
|
||||
except asyncio.TimeoutError:
|
||||
# 发送 ping 保持连接
|
||||
# Send ping to keep connection alive
|
||||
try:
|
||||
await websocket.send_text("ping")
|
||||
except Exception as e:
|
||||
@@ -136,12 +136,12 @@ async def websocket_logs(websocket: WebSocket):
|
||||
|
||||
@router.websocket("/ws/status")
|
||||
async def websocket_status(websocket: WebSocket):
|
||||
"""WebSocket 状态流"""
|
||||
"""WebSocket status stream"""
|
||||
await websocket.accept()
|
||||
|
||||
try:
|
||||
while True:
|
||||
# 每秒发送一次状态
|
||||
# Send status every second
|
||||
status = crawler_manager.get_status()
|
||||
await websocket.send_json(status)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
@@ -22,7 +22,7 @@ from pydantic import BaseModel
|
||||
|
||||
|
||||
class PlatformEnum(str, Enum):
|
||||
"""支持的媒体平台"""
|
||||
"""Supported media platforms"""
|
||||
XHS = "xhs"
|
||||
DOUYIN = "dy"
|
||||
KUAISHOU = "ks"
|
||||
@@ -33,21 +33,21 @@ class PlatformEnum(str, Enum):
|
||||
|
||||
|
||||
class LoginTypeEnum(str, Enum):
|
||||
"""登录方式"""
|
||||
"""Login method"""
|
||||
QRCODE = "qrcode"
|
||||
PHONE = "phone"
|
||||
COOKIE = "cookie"
|
||||
|
||||
|
||||
class CrawlerTypeEnum(str, Enum):
|
||||
"""爬虫类型"""
|
||||
"""Crawler type"""
|
||||
SEARCH = "search"
|
||||
DETAIL = "detail"
|
||||
CREATOR = "creator"
|
||||
|
||||
|
||||
class SaveDataOptionEnum(str, Enum):
|
||||
"""数据保存方式"""
|
||||
"""Data save option"""
|
||||
CSV = "csv"
|
||||
DB = "db"
|
||||
JSON = "json"
|
||||
@@ -57,13 +57,13 @@ class SaveDataOptionEnum(str, Enum):
|
||||
|
||||
|
||||
class CrawlerStartRequest(BaseModel):
|
||||
"""启动爬虫请求"""
|
||||
"""Crawler start request"""
|
||||
platform: PlatformEnum
|
||||
login_type: LoginTypeEnum = LoginTypeEnum.QRCODE
|
||||
crawler_type: CrawlerTypeEnum = CrawlerTypeEnum.SEARCH
|
||||
keywords: str = "" # 搜索模式下的关键词
|
||||
specified_ids: str = "" # 详情模式下的帖子/视频ID列表,逗号分隔
|
||||
creator_ids: str = "" # 创作者模式下的创作者ID列表,逗号分隔
|
||||
keywords: str = "" # Keywords for search mode
|
||||
specified_ids: str = "" # Post/video ID list for detail mode, comma-separated
|
||||
creator_ids: str = "" # Creator ID list for creator mode, comma-separated
|
||||
start_page: int = 1
|
||||
enable_comments: bool = True
|
||||
enable_sub_comments: bool = False
|
||||
@@ -73,7 +73,7 @@ class CrawlerStartRequest(BaseModel):
|
||||
|
||||
|
||||
class CrawlerStatusResponse(BaseModel):
|
||||
"""爬虫状态响应"""
|
||||
"""Crawler status response"""
|
||||
status: Literal["idle", "running", "stopping", "error"]
|
||||
platform: Optional[str] = None
|
||||
crawler_type: Optional[str] = None
|
||||
@@ -82,7 +82,7 @@ class CrawlerStatusResponse(BaseModel):
|
||||
|
||||
|
||||
class LogEntry(BaseModel):
|
||||
"""日志条目"""
|
||||
"""Log entry"""
|
||||
id: int
|
||||
timestamp: str
|
||||
level: Literal["info", "warning", "error", "success", "debug"]
|
||||
@@ -90,7 +90,7 @@ class LogEntry(BaseModel):
|
||||
|
||||
|
||||
class DataFileInfo(BaseModel):
|
||||
"""数据文件信息"""
|
||||
"""Data file information"""
|
||||
name: str
|
||||
path: str
|
||||
size: int
|
||||
|
||||
@@ -28,7 +28,7 @@ from ..schemas import CrawlerStartRequest, LogEntry
|
||||
|
||||
|
||||
class CrawlerManager:
|
||||
"""爬虫进程管理器"""
|
||||
"""Crawler process manager"""
|
||||
|
||||
def __init__(self):
|
||||
self._lock = asyncio.Lock()
|
||||
@@ -39,9 +39,9 @@ class CrawlerManager:
|
||||
self._log_id = 0
|
||||
self._logs: List[LogEntry] = []
|
||||
self._read_task: Optional[asyncio.Task] = None
|
||||
# 项目根目录
|
||||
# Project root directory
|
||||
self._project_root = Path(__file__).parent.parent.parent
|
||||
# 日志队列 - 用于向 WebSocket 推送
|
||||
# Log queue - for pushing to WebSocket
|
||||
self._log_queue: Optional[asyncio.Queue] = None
|
||||
|
||||
@property
|
||||
@@ -49,13 +49,13 @@ class CrawlerManager:
|
||||
return self._logs
|
||||
|
||||
def get_log_queue(self) -> asyncio.Queue:
|
||||
"""获取或创建日志队列"""
|
||||
"""Get or create log queue"""
|
||||
if self._log_queue is None:
|
||||
self._log_queue = asyncio.Queue()
|
||||
return self._log_queue
|
||||
|
||||
def _create_log_entry(self, message: str, level: str = "info") -> LogEntry:
|
||||
"""创建日志条目"""
|
||||
"""Create log entry"""
|
||||
self._log_id += 1
|
||||
entry = LogEntry(
|
||||
id=self._log_id,
|
||||
@@ -64,13 +64,13 @@ class CrawlerManager:
|
||||
message=message
|
||||
)
|
||||
self._logs.append(entry)
|
||||
# 保留最近 500 条日志
|
||||
# Keep last 500 logs
|
||||
if len(self._logs) > 500:
|
||||
self._logs = self._logs[-500:]
|
||||
return entry
|
||||
|
||||
async def _push_log(self, entry: LogEntry):
|
||||
"""推送日志到队列"""
|
||||
"""Push log to queue"""
|
||||
if self._log_queue is not None:
|
||||
try:
|
||||
self._log_queue.put_nowait(entry)
|
||||
@@ -78,7 +78,7 @@ class CrawlerManager:
|
||||
pass
|
||||
|
||||
def _parse_log_level(self, line: str) -> str:
|
||||
"""解析日志级别"""
|
||||
"""Parse log level"""
|
||||
line_upper = line.upper()
|
||||
if "ERROR" in line_upper or "FAILED" in line_upper:
|
||||
return "error"
|
||||
@@ -91,16 +91,16 @@ class CrawlerManager:
|
||||
return "info"
|
||||
|
||||
async def start(self, config: CrawlerStartRequest) -> bool:
|
||||
"""启动爬虫进程"""
|
||||
"""Start crawler process"""
|
||||
async with self._lock:
|
||||
if self.process and self.process.poll() is None:
|
||||
return False
|
||||
|
||||
# 清空旧日志
|
||||
# Clear old logs
|
||||
self._logs = []
|
||||
self._log_id = 0
|
||||
|
||||
# 清空待推送队列(不要替换对象,避免 WebSocket 广播协程持有旧队列引用)
|
||||
# Clear pending queue (don't replace object to avoid WebSocket broadcast coroutine holding old queue reference)
|
||||
if self._log_queue is None:
|
||||
self._log_queue = asyncio.Queue()
|
||||
else:
|
||||
@@ -110,15 +110,15 @@ class CrawlerManager:
|
||||
except asyncio.QueueEmpty:
|
||||
pass
|
||||
|
||||
# 构建命令行参数
|
||||
# Build command line arguments
|
||||
cmd = self._build_command(config)
|
||||
|
||||
# 记录启动日志
|
||||
# Log start information
|
||||
entry = self._create_log_entry(f"Starting crawler: {' '.join(cmd)}", "info")
|
||||
await self._push_log(entry)
|
||||
|
||||
try:
|
||||
# 启动子进程
|
||||
# Start subprocess
|
||||
self.process = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
@@ -139,7 +139,7 @@ class CrawlerManager:
|
||||
)
|
||||
await self._push_log(entry)
|
||||
|
||||
# 启动日志读取任务
|
||||
# Start log reading task
|
||||
self._read_task = asyncio.create_task(self._read_output())
|
||||
|
||||
return True
|
||||
@@ -150,7 +150,7 @@ class CrawlerManager:
|
||||
return False
|
||||
|
||||
async def stop(self) -> bool:
|
||||
"""停止爬虫进程"""
|
||||
"""Stop crawler process"""
|
||||
async with self._lock:
|
||||
if not self.process or self.process.poll() is not None:
|
||||
return False
|
||||
@@ -162,13 +162,13 @@ class CrawlerManager:
|
||||
try:
|
||||
self.process.send_signal(signal.SIGTERM)
|
||||
|
||||
# 等待优雅退出 (最多15秒)
|
||||
# Wait for graceful exit (up to 15 seconds)
|
||||
for _ in range(30):
|
||||
if self.process.poll() is not None:
|
||||
break
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# 如果还没退出,强制杀死
|
||||
# If still not exited, force kill
|
||||
if self.process.poll() is None:
|
||||
entry = self._create_log_entry("Process not responding, sending SIGKILL...", "warning")
|
||||
await self._push_log(entry)
|
||||
@@ -184,7 +184,7 @@ class CrawlerManager:
|
||||
self.status = "idle"
|
||||
self.current_config = None
|
||||
|
||||
# 取消日志读取任务
|
||||
# Cancel log reading task
|
||||
if self._read_task:
|
||||
self._read_task.cancel()
|
||||
self._read_task = None
|
||||
@@ -192,7 +192,7 @@ class CrawlerManager:
|
||||
return True
|
||||
|
||||
def get_status(self) -> dict:
|
||||
"""获取当前状态"""
|
||||
"""Get current status"""
|
||||
return {
|
||||
"status": self.status,
|
||||
"platform": self.current_config.platform.value if self.current_config else None,
|
||||
@@ -202,7 +202,7 @@ class CrawlerManager:
|
||||
}
|
||||
|
||||
def _build_command(self, config: CrawlerStartRequest) -> list:
|
||||
"""构建 main.py 命令行参数"""
|
||||
"""Build main.py command line arguments"""
|
||||
cmd = ["uv", "run", "python", "main.py"]
|
||||
|
||||
cmd.extend(["--platform", config.platform.value])
|
||||
@@ -210,7 +210,7 @@ class CrawlerManager:
|
||||
cmd.extend(["--type", config.crawler_type.value])
|
||||
cmd.extend(["--save_data_option", config.save_option.value])
|
||||
|
||||
# 根据爬虫类型传递不同的参数
|
||||
# Pass different arguments based on crawler type
|
||||
if config.crawler_type.value == "search" and config.keywords:
|
||||
cmd.extend(["--keywords", config.keywords])
|
||||
elif config.crawler_type.value == "detail" and config.specified_ids:
|
||||
@@ -232,12 +232,12 @@ class CrawlerManager:
|
||||
return cmd
|
||||
|
||||
async def _read_output(self):
|
||||
"""异步读取进程输出"""
|
||||
"""Asynchronously read process output"""
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
try:
|
||||
while self.process and self.process.poll() is None:
|
||||
# 在线程池中读取一行
|
||||
# Read a line in thread pool
|
||||
line = await loop.run_in_executor(
|
||||
None, self.process.stdout.readline
|
||||
)
|
||||
@@ -248,7 +248,7 @@ class CrawlerManager:
|
||||
entry = self._create_log_entry(line, level)
|
||||
await self._push_log(entry)
|
||||
|
||||
# 读取剩余输出
|
||||
# Read remaining output
|
||||
if self.process and self.process.stdout:
|
||||
remaining = await loop.run_in_executor(
|
||||
None, self.process.stdout.read
|
||||
@@ -260,7 +260,7 @@ class CrawlerManager:
|
||||
entry = self._create_log_entry(line.strip(), level)
|
||||
await self._push_log(entry)
|
||||
|
||||
# 进程结束
|
||||
# Process ended
|
||||
if self.status == "running":
|
||||
exit_code = self.process.returncode if self.process else -1
|
||||
if exit_code == 0:
|
||||
@@ -277,5 +277,5 @@ class CrawlerManager:
|
||||
await self._push_log(entry)
|
||||
|
||||
|
||||
# 全局单例
|
||||
# Global singleton
|
||||
crawler_manager = CrawlerManager()
|
||||
|
||||
@@ -53,14 +53,14 @@ class AbstractCrawler(ABC):
|
||||
|
||||
async def launch_browser_with_cdp(self, playwright: Playwright, playwright_proxy: Optional[Dict], user_agent: Optional[str], headless: bool = True) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器(可选实现)
|
||||
:param playwright: playwright实例
|
||||
:param playwright_proxy: playwright代理配置
|
||||
:param user_agent: 用户代理
|
||||
:param headless: 无头模式
|
||||
:return: 浏览器上下文
|
||||
Launch browser using CDP mode (optional implementation)
|
||||
:param playwright: playwright instance
|
||||
:param playwright_proxy: playwright proxy configuration
|
||||
:param user_agent: user agent
|
||||
:param headless: headless mode
|
||||
:return: browser context
|
||||
"""
|
||||
# 默认实现:回退到标准模式
|
||||
# Default implementation: fallback to standard mode
|
||||
return await self.launch_browser(playwright.chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
|
||||
|
||||
24
cache/abs_cache.py
vendored
24
cache/abs_cache.py
vendored
@@ -20,9 +20,9 @@
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Name : Programmer AJiang-Relakkes
|
||||
# @Time : 2024/6/2 11:06
|
||||
# @Desc : 抽象类
|
||||
# @Desc : Abstract class
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, List, Optional
|
||||
@@ -33,9 +33,9 @@ class AbstractCache(ABC):
|
||||
@abstractmethod
|
||||
def get(self, key: str) -> Optional[Any]:
|
||||
"""
|
||||
从缓存中获取键的值。
|
||||
这是一个抽象方法。子类必须实现这个方法。
|
||||
:param key: 键
|
||||
Get the value of a key from the cache.
|
||||
This is an abstract method. Subclasses must implement this method.
|
||||
:param key: The key
|
||||
:return:
|
||||
"""
|
||||
raise NotImplementedError
|
||||
@@ -43,11 +43,11 @@ class AbstractCache(ABC):
|
||||
@abstractmethod
|
||||
def set(self, key: str, value: Any, expire_time: int) -> None:
|
||||
"""
|
||||
将键的值设置到缓存中。
|
||||
这是一个抽象方法。子类必须实现这个方法。
|
||||
:param key: 键
|
||||
:param value: 值
|
||||
:param expire_time: 过期时间
|
||||
Set the value of a key in the cache.
|
||||
This is an abstract method. Subclasses must implement this method.
|
||||
:param key: The key
|
||||
:param value: The value
|
||||
:param expire_time: Expiration time
|
||||
:return:
|
||||
"""
|
||||
raise NotImplementedError
|
||||
@@ -55,8 +55,8 @@ class AbstractCache(ABC):
|
||||
@abstractmethod
|
||||
def keys(self, pattern: str) -> List[str]:
|
||||
"""
|
||||
获取所有符合pattern的key
|
||||
:param pattern: 匹配模式
|
||||
Get all keys matching the pattern
|
||||
:param pattern: Matching pattern
|
||||
:return:
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
12
cache/cache_factory.py
vendored
12
cache/cache_factory.py
vendored
@@ -20,23 +20,23 @@
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Name : Programmer AJiang-Relakkes
|
||||
# @Time : 2024/6/2 11:23
|
||||
# @Desc :
|
||||
|
||||
|
||||
class CacheFactory:
|
||||
"""
|
||||
缓存工厂类
|
||||
Cache factory class
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def create_cache(cache_type: str, *args, **kwargs):
|
||||
"""
|
||||
创建缓存对象
|
||||
:param cache_type: 缓存类型
|
||||
:param args: 参数
|
||||
:param kwargs: 关键字参数
|
||||
Create cache object
|
||||
:param cache_type: Cache type
|
||||
:param args: Arguments
|
||||
:param kwargs: Keyword arguments
|
||||
:return:
|
||||
"""
|
||||
if cache_type == 'memory':
|
||||
|
||||
32
cache/local_cache.py
vendored
32
cache/local_cache.py
vendored
@@ -20,9 +20,9 @@
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Name : Programmer AJiang-Relakkes
|
||||
# @Time : 2024/6/2 11:05
|
||||
# @Desc : 本地缓存
|
||||
# @Desc : Local cache
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
@@ -35,19 +35,19 @@ class ExpiringLocalCache(AbstractCache):
|
||||
|
||||
def __init__(self, cron_interval: int = 10):
|
||||
"""
|
||||
初始化本地缓存
|
||||
:param cron_interval: 定时清楚cache的时间间隔
|
||||
Initialize local cache
|
||||
:param cron_interval: Time interval for scheduled cache cleanup
|
||||
:return:
|
||||
"""
|
||||
self._cron_interval = cron_interval
|
||||
self._cache_container: Dict[str, Tuple[Any, float]] = {}
|
||||
self._cron_task: Optional[asyncio.Task] = None
|
||||
# 开启定时清理任务
|
||||
# Start scheduled cleanup task
|
||||
self._schedule_clear()
|
||||
|
||||
def __del__(self):
|
||||
"""
|
||||
析构函数,清理定时任务
|
||||
Destructor: cleans up the scheduled task
|
||||
:return:
|
||||
"""
|
||||
if self._cron_task is not None:
|
||||
@@ -55,7 +55,7 @@ class ExpiringLocalCache(AbstractCache):
|
||||
|
||||
def get(self, key: str) -> Optional[Any]:
|
||||
"""
|
||||
从缓存中获取键的值
|
||||
Get the value of a key from the cache
|
||||
:param key:
|
||||
:return:
|
||||
"""
|
||||
@@ -63,7 +63,7 @@ class ExpiringLocalCache(AbstractCache):
|
||||
if value is None:
|
||||
return None
|
||||
|
||||
# 如果键已过期,则删除键并返回None
|
||||
# If the key has expired, delete it and return None
|
||||
if expire_time < time.time():
|
||||
del self._cache_container[key]
|
||||
return None
|
||||
@@ -72,7 +72,7 @@ class ExpiringLocalCache(AbstractCache):
|
||||
|
||||
def set(self, key: str, value: Any, expire_time: int) -> None:
|
||||
"""
|
||||
将键的值设置到缓存中
|
||||
Set the value of a key in the cache
|
||||
:param key:
|
||||
:param value:
|
||||
:param expire_time:
|
||||
@@ -82,14 +82,14 @@ class ExpiringLocalCache(AbstractCache):
|
||||
|
||||
def keys(self, pattern: str) -> List[str]:
|
||||
"""
|
||||
获取所有符合pattern的key
|
||||
:param pattern: 匹配模式
|
||||
Get all keys matching the pattern
|
||||
:param pattern: Matching pattern
|
||||
:return:
|
||||
"""
|
||||
if pattern == '*':
|
||||
return list(self._cache_container.keys())
|
||||
|
||||
# 本地缓存通配符暂时将*替换为空
|
||||
# For local cache wildcard, temporarily replace * with empty string
|
||||
if '*' in pattern:
|
||||
pattern = pattern.replace('*', '')
|
||||
|
||||
@@ -97,7 +97,7 @@ class ExpiringLocalCache(AbstractCache):
|
||||
|
||||
def _schedule_clear(self):
|
||||
"""
|
||||
开启定时清理任务,
|
||||
Start scheduled cleanup task
|
||||
:return:
|
||||
"""
|
||||
|
||||
@@ -111,7 +111,7 @@ class ExpiringLocalCache(AbstractCache):
|
||||
|
||||
def _clear(self):
|
||||
"""
|
||||
根据过期时间清理缓存
|
||||
Clean up cache based on expiration time
|
||||
:return:
|
||||
"""
|
||||
for key, (value, expire_time) in self._cache_container.items():
|
||||
@@ -120,7 +120,7 @@ class ExpiringLocalCache(AbstractCache):
|
||||
|
||||
async def _start_clear_cron(self):
|
||||
"""
|
||||
开启定时清理任务
|
||||
Start scheduled cleanup task
|
||||
:return:
|
||||
"""
|
||||
while True:
|
||||
@@ -130,7 +130,7 @@ class ExpiringLocalCache(AbstractCache):
|
||||
|
||||
if __name__ == '__main__':
|
||||
cache = ExpiringLocalCache(cron_interval=2)
|
||||
cache.set('name', '程序员阿江-Relakkes', 3)
|
||||
cache.set('name', 'Programmer AJiang-Relakkes', 3)
|
||||
print(cache.get('key'))
|
||||
print(cache.keys("*"))
|
||||
time.sleep(4)
|
||||
|
||||
16
cache/redis_cache.py
vendored
16
cache/redis_cache.py
vendored
@@ -20,9 +20,9 @@
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Name : Programmer AJiang-Relakkes
|
||||
# @Time : 2024/5/29 22:57
|
||||
# @Desc : RedisCache实现
|
||||
# @Desc : RedisCache implementation
|
||||
import pickle
|
||||
import time
|
||||
from typing import Any, List
|
||||
@@ -36,13 +36,13 @@ from config import db_config
|
||||
class RedisCache(AbstractCache):
|
||||
|
||||
def __init__(self) -> None:
|
||||
# 连接redis, 返回redis客户端
|
||||
# Connect to redis, return redis client
|
||||
self._redis_client = self._connet_redis()
|
||||
|
||||
@staticmethod
|
||||
def _connet_redis() -> Redis:
|
||||
"""
|
||||
连接redis, 返回redis客户端, 这里按需配置redis连接信息
|
||||
Connect to redis, return redis client, configure redis connection information as needed
|
||||
:return:
|
||||
"""
|
||||
return Redis(
|
||||
@@ -54,7 +54,7 @@ class RedisCache(AbstractCache):
|
||||
|
||||
def get(self, key: str) -> Any:
|
||||
"""
|
||||
从缓存中获取键的值, 并且反序列化
|
||||
Get the value of a key from the cache and deserialize it
|
||||
:param key:
|
||||
:return:
|
||||
"""
|
||||
@@ -65,7 +65,7 @@ class RedisCache(AbstractCache):
|
||||
|
||||
def set(self, key: str, value: Any, expire_time: int) -> None:
|
||||
"""
|
||||
将键的值设置到缓存中, 并且序列化
|
||||
Set the value of a key in the cache and serialize it
|
||||
:param key:
|
||||
:param value:
|
||||
:param expire_time:
|
||||
@@ -75,7 +75,7 @@ class RedisCache(AbstractCache):
|
||||
|
||||
def keys(self, pattern: str) -> List[str]:
|
||||
"""
|
||||
获取所有符合pattern的key
|
||||
Get all keys matching the pattern
|
||||
"""
|
||||
return [key.decode() for key in self._redis_client.keys(pattern)]
|
||||
|
||||
@@ -83,7 +83,7 @@ class RedisCache(AbstractCache):
|
||||
if __name__ == '__main__':
|
||||
redis_cache = RedisCache()
|
||||
# basic usage
|
||||
redis_cache.set("name", "程序员阿江-Relakkes", 1)
|
||||
redis_cache.set("name", "Programmer AJiang-Relakkes", 1)
|
||||
print(redis_cache.get("name")) # Programmer AJiang-Relakkes
|
||||
print(redis_cache.keys("*")) # ['name']
|
||||
time.sleep(2)
|
||||
|
||||
@@ -37,7 +37,7 @@ EnumT = TypeVar("EnumT", bound=Enum)
|
||||
|
||||
|
||||
class PlatformEnum(str, Enum):
|
||||
"""支持的媒体平台枚举"""
|
||||
"""Supported media platform enumeration"""
|
||||
|
||||
XHS = "xhs"
|
||||
DOUYIN = "dy"
|
||||
@@ -49,7 +49,7 @@ class PlatformEnum(str, Enum):
|
||||
|
||||
|
||||
class LoginTypeEnum(str, Enum):
|
||||
"""登录方式枚举"""
|
||||
"""Login type enumeration"""
|
||||
|
||||
QRCODE = "qrcode"
|
||||
PHONE = "phone"
|
||||
@@ -57,7 +57,7 @@ class LoginTypeEnum(str, Enum):
|
||||
|
||||
|
||||
class CrawlerTypeEnum(str, Enum):
|
||||
"""爬虫类型枚举"""
|
||||
"""Crawler type enumeration"""
|
||||
|
||||
SEARCH = "search"
|
||||
DETAIL = "detail"
|
||||
@@ -65,7 +65,7 @@ class CrawlerTypeEnum(str, Enum):
|
||||
|
||||
|
||||
class SaveDataOptionEnum(str, Enum):
|
||||
"""数据保存方式枚举"""
|
||||
"""Data save option enumeration"""
|
||||
|
||||
CSV = "csv"
|
||||
DB = "db"
|
||||
@@ -76,7 +76,7 @@ class SaveDataOptionEnum(str, Enum):
|
||||
|
||||
|
||||
class InitDbOptionEnum(str, Enum):
|
||||
"""数据库初始化选项"""
|
||||
"""Database initialization option"""
|
||||
|
||||
SQLITE = "sqlite"
|
||||
MYSQL = "mysql"
|
||||
@@ -102,7 +102,7 @@ def _coerce_enum(
|
||||
return enum_cls(value)
|
||||
except ValueError:
|
||||
typer.secho(
|
||||
f"⚠️ 配置值 '{value}' 不在 {enum_cls.__name__} 支持的范围内,已回退到默认值 '{default.value}'.",
|
||||
f"⚠️ Config value '{value}' is not within the supported range of {enum_cls.__name__}, falling back to default value '{default.value}'.",
|
||||
fg=typer.colors.YELLOW,
|
||||
)
|
||||
return default
|
||||
@@ -133,7 +133,7 @@ def _inject_init_db_default(args: Sequence[str]) -> list[str]:
|
||||
|
||||
|
||||
async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
"""使用 Typer 解析命令行参数。"""
|
||||
"""Parse command line arguments using Typer."""
|
||||
|
||||
app = typer.Typer(add_completion=False)
|
||||
|
||||
@@ -143,48 +143,48 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
PlatformEnum,
|
||||
typer.Option(
|
||||
"--platform",
|
||||
help="媒体平台选择 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)",
|
||||
rich_help_panel="基础配置",
|
||||
help="Media platform selection (xhs=XiaoHongShu | dy=Douyin | ks=Kuaishou | bili=Bilibili | wb=Weibo | tieba=Baidu Tieba | zhihu=Zhihu)",
|
||||
rich_help_panel="Basic Configuration",
|
||||
),
|
||||
] = _coerce_enum(PlatformEnum, config.PLATFORM, PlatformEnum.XHS),
|
||||
lt: Annotated[
|
||||
LoginTypeEnum,
|
||||
typer.Option(
|
||||
"--lt",
|
||||
help="登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)",
|
||||
rich_help_panel="账号配置",
|
||||
help="Login type (qrcode=QR Code | phone=Phone | cookie=Cookie)",
|
||||
rich_help_panel="Account Configuration",
|
||||
),
|
||||
] = _coerce_enum(LoginTypeEnum, config.LOGIN_TYPE, LoginTypeEnum.QRCODE),
|
||||
crawler_type: Annotated[
|
||||
CrawlerTypeEnum,
|
||||
typer.Option(
|
||||
"--type",
|
||||
help="爬取类型 (search=搜索 | detail=详情 | creator=创作者)",
|
||||
rich_help_panel="基础配置",
|
||||
help="Crawler type (search=Search | detail=Detail | creator=Creator)",
|
||||
rich_help_panel="Basic Configuration",
|
||||
),
|
||||
] = _coerce_enum(CrawlerTypeEnum, config.CRAWLER_TYPE, CrawlerTypeEnum.SEARCH),
|
||||
start: Annotated[
|
||||
int,
|
||||
typer.Option(
|
||||
"--start",
|
||||
help="起始页码",
|
||||
rich_help_panel="基础配置",
|
||||
help="Starting page number",
|
||||
rich_help_panel="Basic Configuration",
|
||||
),
|
||||
] = config.START_PAGE,
|
||||
keywords: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
"--keywords",
|
||||
help="请输入关键词,多个关键词用逗号分隔",
|
||||
rich_help_panel="基础配置",
|
||||
help="Enter keywords, multiple keywords separated by commas",
|
||||
rich_help_panel="Basic Configuration",
|
||||
),
|
||||
] = config.KEYWORDS,
|
||||
get_comment: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
"--get_comment",
|
||||
help="是否爬取一级评论,支持 yes/true/t/y/1 或 no/false/f/n/0",
|
||||
rich_help_panel="评论配置",
|
||||
help="Whether to crawl first-level comments, supports yes/true/t/y/1 or no/false/f/n/0",
|
||||
rich_help_panel="Comment Configuration",
|
||||
show_default=True,
|
||||
),
|
||||
] = str(config.ENABLE_GET_COMMENTS),
|
||||
@@ -192,8 +192,8 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
str,
|
||||
typer.Option(
|
||||
"--get_sub_comment",
|
||||
help="是否爬取二级评论,支持 yes/true/t/y/1 或 no/false/f/n/0",
|
||||
rich_help_panel="评论配置",
|
||||
help="Whether to crawl second-level comments, supports yes/true/t/y/1 or no/false/f/n/0",
|
||||
rich_help_panel="Comment Configuration",
|
||||
show_default=True,
|
||||
),
|
||||
] = str(config.ENABLE_GET_SUB_COMMENTS),
|
||||
@@ -201,8 +201,8 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
str,
|
||||
typer.Option(
|
||||
"--headless",
|
||||
help="是否启用无头模式(对 Playwright 和 CDP 均生效),支持 yes/true/t/y/1 或 no/false/f/n/0",
|
||||
rich_help_panel="运行配置",
|
||||
help="Whether to enable headless mode (applies to both Playwright and CDP), supports yes/true/t/y/1 or no/false/f/n/0",
|
||||
rich_help_panel="Runtime Configuration",
|
||||
show_default=True,
|
||||
),
|
||||
] = str(config.HEADLESS),
|
||||
@@ -210,8 +210,8 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
SaveDataOptionEnum,
|
||||
typer.Option(
|
||||
"--save_data_option",
|
||||
help="数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库 | mongodb=MongoDB数据库 | excel=Excel文件)",
|
||||
rich_help_panel="存储配置",
|
||||
help="Data save option (csv=CSV file | db=MySQL database | json=JSON file | sqlite=SQLite database | mongodb=MongoDB database | excel=Excel file)",
|
||||
rich_help_panel="Storage Configuration",
|
||||
),
|
||||
] = _coerce_enum(
|
||||
SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSON
|
||||
@@ -220,32 +220,32 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
Optional[InitDbOptionEnum],
|
||||
typer.Option(
|
||||
"--init_db",
|
||||
help="初始化数据库表结构 (sqlite | mysql)",
|
||||
rich_help_panel="存储配置",
|
||||
help="Initialize database table structure (sqlite | mysql)",
|
||||
rich_help_panel="Storage Configuration",
|
||||
),
|
||||
] = None,
|
||||
cookies: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
"--cookies",
|
||||
help="Cookie 登录方式使用的 Cookie 值",
|
||||
rich_help_panel="账号配置",
|
||||
help="Cookie value used for Cookie login method",
|
||||
rich_help_panel="Account Configuration",
|
||||
),
|
||||
] = config.COOKIES,
|
||||
specified_id: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
"--specified_id",
|
||||
help="详情模式下的帖子/视频ID列表,多个ID用逗号分隔(支持完整URL或ID)",
|
||||
rich_help_panel="基础配置",
|
||||
help="Post/video ID list in detail mode, multiple IDs separated by commas (supports full URL or ID)",
|
||||
rich_help_panel="Basic Configuration",
|
||||
),
|
||||
] = "",
|
||||
creator_id: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
"--creator_id",
|
||||
help="创作者模式下的创作者ID列表,多个ID用逗号分隔(支持完整URL或ID)",
|
||||
rich_help_panel="基础配置",
|
||||
help="Creator ID list in creator mode, multiple IDs separated by commas (supports full URL or ID)",
|
||||
rich_help_panel="Basic Configuration",
|
||||
),
|
||||
] = "",
|
||||
) -> SimpleNamespace:
|
||||
|
||||
@@ -17,9 +17,9 @@
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# persist-1<persist1@126.com>
|
||||
# 原因:将 db.py 改造为模块,移除直接执行入口,修复相对导入问题。
|
||||
# 副作用:无
|
||||
# 回滚策略:还原此文件。
|
||||
# Reason: Refactored db.py into a module, removed direct execution entry point, fixed relative import issues.
|
||||
# Side effects: None
|
||||
# Rollback strategy: Restore this file.
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
@@ -406,9 +406,9 @@ class ZhihuContent(Base):
|
||||
last_modify_ts = Column(BigInteger)
|
||||
|
||||
# persist-1<persist1@126.com>
|
||||
# 原因:修复 ORM 模型定义错误,确保与数据库表结构一致。
|
||||
# 副作用:无
|
||||
# 回滚策略:还原此行
|
||||
# Reason: Fixed ORM model definition error, ensuring consistency with database table structure.
|
||||
# Side effects: None
|
||||
# Rollback strategy: Restore this line
|
||||
|
||||
class ZhihuComment(Base):
|
||||
__tablename__ = 'zhihu_comment'
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
"""MongoDB存储基类:提供连接管理和通用存储方法"""
|
||||
"""MongoDB storage base class: Provides connection management and common storage methods"""
|
||||
import asyncio
|
||||
from typing import Dict, List, Optional
|
||||
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection
|
||||
@@ -25,7 +25,7 @@ from tools import utils
|
||||
|
||||
|
||||
class MongoDBConnection:
|
||||
"""MongoDB连接管理(单例模式)"""
|
||||
"""MongoDB connection management (singleton pattern)"""
|
||||
_instance = None
|
||||
_client: Optional[AsyncIOMotorClient] = None
|
||||
_db: Optional[AsyncIOMotorDatabase] = None
|
||||
@@ -37,7 +37,7 @@ class MongoDBConnection:
|
||||
return cls._instance
|
||||
|
||||
async def get_client(self) -> AsyncIOMotorClient:
|
||||
"""获取客户端"""
|
||||
"""Get client"""
|
||||
if self._client is None:
|
||||
async with self._lock:
|
||||
if self._client is None:
|
||||
@@ -45,7 +45,7 @@ class MongoDBConnection:
|
||||
return self._client
|
||||
|
||||
async def get_db(self) -> AsyncIOMotorDatabase:
|
||||
"""获取数据库"""
|
||||
"""Get database"""
|
||||
if self._db is None:
|
||||
async with self._lock:
|
||||
if self._db is None:
|
||||
@@ -53,7 +53,7 @@ class MongoDBConnection:
|
||||
return self._db
|
||||
|
||||
async def _connect(self):
|
||||
"""建立连接"""
|
||||
"""Establish connection"""
|
||||
try:
|
||||
mongo_config = db_config.mongodb_config
|
||||
host = mongo_config["host"]
|
||||
@@ -62,14 +62,14 @@ class MongoDBConnection:
|
||||
password = mongo_config["password"]
|
||||
db_name = mongo_config["db_name"]
|
||||
|
||||
# 构建连接URL(有认证/无认证)
|
||||
# Build connection URL (with/without authentication)
|
||||
if user and password:
|
||||
connection_url = f"mongodb://{user}:{password}@{host}:{port}/"
|
||||
else:
|
||||
connection_url = f"mongodb://{host}:{port}/"
|
||||
|
||||
self._client = AsyncIOMotorClient(connection_url, serverSelectionTimeoutMS=5000)
|
||||
await self._client.server_info() # 测试连接
|
||||
await self._client.server_info() # Test connection
|
||||
self._db = self._client[db_name]
|
||||
utils.logger.info(f"[MongoDBConnection] Connected to {host}:{port}/{db_name}")
|
||||
except Exception as e:
|
||||
@@ -77,7 +77,7 @@ class MongoDBConnection:
|
||||
raise
|
||||
|
||||
async def close(self):
|
||||
"""关闭连接"""
|
||||
"""Close connection"""
|
||||
if self._client is not None:
|
||||
self._client.close()
|
||||
self._client = None
|
||||
@@ -86,24 +86,24 @@ class MongoDBConnection:
|
||||
|
||||
|
||||
class MongoDBStoreBase:
|
||||
"""MongoDB存储基类:提供通用的CRUD操作"""
|
||||
"""MongoDB storage base class: Provides common CRUD operations"""
|
||||
|
||||
def __init__(self, collection_prefix: str):
|
||||
"""初始化存储基类
|
||||
"""Initialize storage base class
|
||||
Args:
|
||||
collection_prefix: 平台前缀(xhs/douyin/bilibili等)
|
||||
collection_prefix: Platform prefix (xhs/douyin/bilibili, etc.)
|
||||
"""
|
||||
self.collection_prefix = collection_prefix
|
||||
self._connection = MongoDBConnection()
|
||||
|
||||
async def get_collection(self, collection_suffix: str) -> AsyncIOMotorCollection:
|
||||
"""获取集合:{prefix}_{suffix}"""
|
||||
"""Get collection: {prefix}_{suffix}"""
|
||||
db = await self._connection.get_db()
|
||||
collection_name = f"{self.collection_prefix}_{collection_suffix}"
|
||||
return db[collection_name]
|
||||
|
||||
async def save_or_update(self, collection_suffix: str, query: Dict, data: Dict) -> bool:
|
||||
"""保存或更新数据(upsert)"""
|
||||
"""Save or update data (upsert)"""
|
||||
try:
|
||||
collection = await self.get_collection(collection_suffix)
|
||||
await collection.update_one(query, {"$set": data}, upsert=True)
|
||||
@@ -113,7 +113,7 @@ class MongoDBStoreBase:
|
||||
return False
|
||||
|
||||
async def find_one(self, collection_suffix: str, query: Dict) -> Optional[Dict]:
|
||||
"""查询单条数据"""
|
||||
"""Query a single record"""
|
||||
try:
|
||||
collection = await self.get_collection(collection_suffix)
|
||||
return await collection.find_one(query)
|
||||
@@ -122,7 +122,7 @@ class MongoDBStoreBase:
|
||||
return None
|
||||
|
||||
async def find_many(self, collection_suffix: str, query: Dict, limit: int = 0) -> List[Dict]:
|
||||
"""查询多条数据(limit=0表示不限制)"""
|
||||
"""Query multiple records (limit=0 means no limit)"""
|
||||
try:
|
||||
collection = await self.get_collection(collection_suffix)
|
||||
cursor = collection.find(query)
|
||||
@@ -134,7 +134,7 @@ class MongoDBStoreBase:
|
||||
return []
|
||||
|
||||
async def create_index(self, collection_suffix: str, keys: List[tuple], unique: bool = False):
|
||||
"""创建索引:keys=[("field", 1)]"""
|
||||
"""Create index: keys=[("field", 1)]"""
|
||||
try:
|
||||
collection = await self.get_collection(collection_suffix)
|
||||
await collection.create_index(keys, unique=unique)
|
||||
|
||||
4
main.py
4
main.py
@@ -114,7 +114,7 @@ async def async_cleanup() -> None:
|
||||
except Exception as e:
|
||||
error_msg = str(e).lower()
|
||||
if "closed" not in error_msg and "disconnected" not in error_msg:
|
||||
print(f"[Main] 清理CDP浏览器时出错: {e}")
|
||||
print(f"[Main] Error cleaning up CDP browser: {e}")
|
||||
|
||||
elif getattr(crawler, "browser_context", None):
|
||||
try:
|
||||
@@ -122,7 +122,7 @@ async def async_cleanup() -> None:
|
||||
except Exception as e:
|
||||
error_msg = str(e).lower()
|
||||
if "closed" not in error_msg and "disconnected" not in error_msg:
|
||||
print(f"[Main] 关闭浏览器上下文时出错: {e}")
|
||||
print(f"[Main] Error closing browser context: {e}")
|
||||
|
||||
if config.SAVE_DATA_OPTION in ("db", "sqlite"):
|
||||
await db.close()
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : bilibili 请求客户端
|
||||
# @Desc : bilibili request client
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
@@ -47,7 +47,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,b 站的长视频需要更久的超时时间
|
||||
timeout=60, # For media crawling, Bilibili long videos need a longer timeout
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
@@ -61,11 +61,11 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self._host = "https://api.bilibili.com"
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Any:
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check if proxy has expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
@@ -82,8 +82,8 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def pre_request_data(self, req_data: Dict) -> Dict:
|
||||
"""
|
||||
发送请求进行请求参数签名
|
||||
需要从 localStorage 拿 wbi_img_urls 这参数,值如下:
|
||||
Send request to sign request parameters
|
||||
Need to get wbi_img_urls parameter from localStorage, value as follows:
|
||||
https://i0.hdslb.com/bfs/wbi/7cd084941338484aae1ad9425b84077c.png-https://i0.hdslb.com/bfs/wbi/4932caff0ff746eab6f01bf08b70ac45.png
|
||||
:param req_data:
|
||||
:return:
|
||||
@@ -95,7 +95,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_wbi_keys(self) -> Tuple[str, str]:
|
||||
"""
|
||||
获取最新的 img_key 和 sub_key
|
||||
Get the latest img_key and sub_key
|
||||
:return:
|
||||
"""
|
||||
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
||||
@@ -160,12 +160,12 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
Bilibili web search api
|
||||
:param keyword: 搜索关键词
|
||||
:param page: 分页参数具体第几页
|
||||
:param page_size: 每一页参数的数量
|
||||
:param order: 搜索结果排序,默认位综合排序
|
||||
:param pubtime_begin_s: 发布时间开始时间戳
|
||||
:param pubtime_end_s: 发布时间结束时间戳
|
||||
:param keyword: Search keyword
|
||||
:param page: Page number for pagination
|
||||
:param page_size: Number of items per page
|
||||
:param order: Sort order for search results, default is comprehensive sorting
|
||||
:param pubtime_begin_s: Publish time start timestamp
|
||||
:param pubtime_end_s: Publish time end timestamp
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/web-interface/wbi/search/type"
|
||||
@@ -182,13 +182,13 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict:
|
||||
"""
|
||||
Bilibli web video detail api, aid 和 bvid任选一个参数
|
||||
:param aid: 稿件avid
|
||||
:param bvid: 稿件bvid
|
||||
Bilibili web video detail api, provide either aid or bvid
|
||||
:param aid: Video aid
|
||||
:param bvid: Video bvid
|
||||
:return:
|
||||
"""
|
||||
if not aid and not bvid:
|
||||
raise ValueError("请提供 aid 或 bvid 中的至少一个参数")
|
||||
raise ValueError("Please provide at least one parameter: aid or bvid")
|
||||
|
||||
uri = "/x/web-interface/view/detail"
|
||||
params = dict()
|
||||
@@ -201,12 +201,12 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
async def get_video_play_url(self, aid: int, cid: int) -> Dict:
|
||||
"""
|
||||
Bilibili web video play url api
|
||||
:param aid: 稿件avid
|
||||
:param aid: Video aid
|
||||
:param cid: cid
|
||||
:return:
|
||||
"""
|
||||
if not aid or not cid or aid <= 0 or cid <= 0:
|
||||
raise ValueError("aid 和 cid 必须存在")
|
||||
raise ValueError("aid and cid must exist")
|
||||
uri = "/x/player/wbi/playurl"
|
||||
qn_value = getattr(config, "BILI_QN", 80)
|
||||
params = {
|
||||
@@ -233,7 +233,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
)
|
||||
return None
|
||||
except httpx.HTTPError as exc: # something went wrong when calling httpx.request, such as a connection error, client error, server error, or a non-2xx response status code
|
||||
utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # Keep original exception type name for developer debugging
|
||||
return None
|
||||
|
||||
async def get_video_comments(
|
||||
@@ -243,9 +243,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
next: int = 0,
|
||||
) -> Dict:
|
||||
"""get video comments
|
||||
:param video_id: 视频 ID
|
||||
:param order_mode: 排序方式
|
||||
:param next: 评论页选择
|
||||
:param video_id: Video ID
|
||||
:param order_mode: Sort order
|
||||
:param next: Comment page selection
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/v2/reply/wbi/main"
|
||||
@@ -266,7 +266,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
:param crawl_interval:
|
||||
:param is_fetch_sub_comments:
|
||||
:param callback:
|
||||
max_count: 一次笔记爬取的最大评论数量
|
||||
:param max_count: Maximum number of comments to crawl per video
|
||||
|
||||
:return:
|
||||
"""
|
||||
@@ -299,7 +299,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
comment_list: List[Dict] = comments_res.get("replies", [])
|
||||
|
||||
# 检查 is_end 和 next 是否存在
|
||||
# Check if is_end and next exist
|
||||
if "is_end" not in cursor_info or "next" not in cursor_info:
|
||||
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
|
||||
is_end = True
|
||||
@@ -317,7 +317,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
{await self.get_video_all_level_two_comments(video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)}
|
||||
if len(result) + len(comment_list) > max_count:
|
||||
comment_list = comment_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute it
|
||||
await callback(video_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not is_fetch_sub_comments:
|
||||
@@ -336,10 +336,10 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
get video all level two comments for a level one comment
|
||||
:param video_id: 视频 ID
|
||||
:param level_one_comment_id: 一级评论 ID
|
||||
:param video_id: Video ID
|
||||
:param level_one_comment_id: Level one comment ID
|
||||
:param order_mode:
|
||||
:param ps: 一页评论数
|
||||
:param ps: Number of comments per page
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:return:
|
||||
@@ -349,7 +349,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
while True:
|
||||
result = await self.get_video_level_two_comments(video_id, level_one_comment_id, pn, ps, order_mode)
|
||||
comment_list: List[Dict] = result.get("replies", [])
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute it
|
||||
await callback(video_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if (int(result["page"]["count"]) <= pn * ps):
|
||||
@@ -366,9 +366,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
order_mode: CommentOrderType,
|
||||
) -> Dict:
|
||||
"""get video level two comments
|
||||
:param video_id: 视频 ID
|
||||
:param level_one_comment_id: 一级评论 ID
|
||||
:param order_mode: 排序方式
|
||||
:param video_id: Video ID
|
||||
:param level_one_comment_id: Level one comment ID
|
||||
:param order_mode: Sort order
|
||||
|
||||
:return:
|
||||
"""
|
||||
@@ -386,10 +386,10 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
|
||||
"""get all videos for a creator
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 页数
|
||||
:param ps: 一页视频数
|
||||
:param order_mode: 排序方式
|
||||
:param creator_id: Creator ID
|
||||
:param pn: Page number
|
||||
:param ps: Number of videos per page
|
||||
:param order_mode: Sort order
|
||||
|
||||
:return:
|
||||
"""
|
||||
@@ -405,7 +405,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
async def get_creator_info(self, creator_id: int) -> Dict:
|
||||
"""
|
||||
get creator info
|
||||
:param creator_id: 作者 ID
|
||||
:param creator_id: Creator ID
|
||||
"""
|
||||
uri = "/x/space/wbi/acc/info"
|
||||
post_data = {
|
||||
@@ -421,9 +421,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
get creator fans
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 开始页数
|
||||
:param ps: 每页数量
|
||||
:param creator_id: Creator ID
|
||||
:param pn: Start page number
|
||||
:param ps: Number of items per page
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/relation/fans"
|
||||
@@ -443,9 +443,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
get creator followings
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 开始页数
|
||||
:param ps: 每页数量
|
||||
:param creator_id: Creator ID
|
||||
:param pn: Start page number
|
||||
:param ps: Number of items per page
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/relation/followings"
|
||||
@@ -460,8 +460,8 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
async def get_creator_dynamics(self, creator_id: int, offset: str = ""):
|
||||
"""
|
||||
get creator dynamics
|
||||
:param creator_id: 创作者 ID
|
||||
:param offset: 发送请求所需参数
|
||||
:param creator_id: Creator ID
|
||||
:param offset: Parameter required for sending request
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/polymer/web-dynamic/v1/feed/space"
|
||||
@@ -485,9 +485,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count: 一个up主爬取的最大粉丝数量
|
||||
:param max_count: Maximum number of fans to crawl for a creator
|
||||
|
||||
:return: up主粉丝数列表
|
||||
:return: List of creator fans
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
result = []
|
||||
@@ -499,7 +499,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
pn += 1
|
||||
if len(result) + len(fans_list) > max_count:
|
||||
fans_list = fans_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute it
|
||||
await callback(creator_info, fans_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not fans_list:
|
||||
@@ -519,9 +519,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count: 一个up主爬取的最大关注者数量
|
||||
:param max_count: Maximum number of followings to crawl for a creator
|
||||
|
||||
:return: up主关注者列表
|
||||
:return: List of creator followings
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
result = []
|
||||
@@ -533,7 +533,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
pn += 1
|
||||
if len(result) + len(followings_list) > max_count:
|
||||
followings_list = followings_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute it
|
||||
await callback(creator_info, followings_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not followings_list:
|
||||
@@ -553,9 +553,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count: 一个up主爬取的最大动态数量
|
||||
:param max_count: Maximum number of dynamics to crawl for a creator
|
||||
|
||||
:return: up主关注者列表
|
||||
:return: List of creator dynamics
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
result = []
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : B站爬虫
|
||||
# @Desc : Bilibili Crawler
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
@@ -64,7 +64,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
self.index_url = "https://www.bilibili.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
@@ -74,9 +74,9 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Choose launch mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[BilibiliCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[BilibiliCrawler] Launching browser using CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -84,7 +84,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[BilibiliCrawler] Launching browser using standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS)
|
||||
@@ -149,31 +149,31 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
end: str = config.END_DAY,
|
||||
) -> Tuple[str, str]:
|
||||
"""
|
||||
获取 bilibili 作品发布日期起始时间戳 pubtime_begin_s 与发布日期结束时间戳 pubtime_end_s
|
||||
Get bilibili publish start timestamp pubtime_begin_s and publish end timestamp pubtime_end_s
|
||||
---
|
||||
:param start: 发布日期起始时间,YYYY-MM-DD
|
||||
:param end: 发布日期结束时间,YYYY-MM-DD
|
||||
:param start: Publish date start time, YYYY-MM-DD
|
||||
:param end: Publish date end time, YYYY-MM-DD
|
||||
|
||||
Note
|
||||
---
|
||||
- 搜索的时间范围为 start 至 end,包含 start 和 end
|
||||
- 若要搜索同一天的内容,为了包含 start 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_begin_s 的值加上一天再减去一秒,即 start 当天的最后一秒
|
||||
- 如仅搜索 2024-01-05 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704470399
|
||||
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
|
||||
- 若要搜索 start 至 end 的内容,为了包含 end 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_end_s 的值加上一天再减去一秒,即 end 当天的最后一秒
|
||||
- 如搜索 2024-01-05 - 2024-01-06 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704556799
|
||||
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
|
||||
- Search time range is from start to end, including both start and end
|
||||
- To search content from the same day, to include search content from that day, pubtime_end_s should be pubtime_begin_s plus one day minus one second, i.e., the last second of start day
|
||||
- For example, searching only 2024-01-05 content, pubtime_begin_s = 1704384000, pubtime_end_s = 1704470399
|
||||
Converted to readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
|
||||
- To search content from start to end, to include search content from end day, pubtime_end_s should be pubtime_end_s plus one day minus one second, i.e., the last second of end day
|
||||
- For example, searching 2024-01-05 - 2024-01-06 content, pubtime_begin_s = 1704384000, pubtime_end_s = 1704556799
|
||||
Converted to readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
|
||||
"""
|
||||
# 转换 start 与 end 为 datetime 对象
|
||||
# Convert start and end to datetime objects
|
||||
start_day: datetime = datetime.strptime(start, "%Y-%m-%d")
|
||||
end_day: datetime = datetime.strptime(end, "%Y-%m-%d")
|
||||
if start_day > end_day:
|
||||
raise ValueError("Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end")
|
||||
elif start_day == end_day: # 搜索同一天的内容
|
||||
end_day = (start_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 start_day + 1 day - 1 second
|
||||
else: # 搜索 start 至 end
|
||||
end_day = (end_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 end_day + 1 day - 1 second
|
||||
# 将其重新转换为时间戳
|
||||
elif start_day == end_day: # Searching content from the same day
|
||||
end_day = (start_day + timedelta(days=1) - timedelta(seconds=1)) # Set end_day to start_day + 1 day - 1 second
|
||||
else: # Searching from start to end
|
||||
end_day = (end_day + timedelta(days=1) - timedelta(seconds=1)) # Set end_day to end_day + 1 day - 1 second
|
||||
# Convert back to timestamps
|
||||
return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
|
||||
|
||||
async def search_by_keywords(self):
|
||||
@@ -203,8 +203,8 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
page=page,
|
||||
page_size=bili_limit_count,
|
||||
order=SearchOrderType.DEFAULT,
|
||||
pubtime_begin_s=0, # 作品发布日期起始时间戳
|
||||
pubtime_end_s=0, # 作品发布日期结束日期时间戳
|
||||
pubtime_begin_s=0, # Publish date start timestamp
|
||||
pubtime_end_s=0, # Publish date end timestamp
|
||||
)
|
||||
video_list: List[Dict] = videos_res.get("result")
|
||||
|
||||
@@ -508,7 +508,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
channel="chrome", # 使用系统的Chrome稳定版
|
||||
channel="chrome", # Use system's stable Chrome version
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
@@ -525,7 +525,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
Launch browser using CDP mode
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
@@ -536,22 +536,22 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[BilibiliCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[BilibiliCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
utils.logger.error(f"[BilibiliCrawler] CDP mode launch failed, fallback to standard mode: {e}")
|
||||
# Fallback to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# If using CDP mode, special handling is required
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
|
||||
@@ -27,28 +27,28 @@ from enum import Enum
|
||||
|
||||
|
||||
class SearchOrderType(Enum):
|
||||
# 综合排序
|
||||
# Comprehensive sorting
|
||||
DEFAULT = ""
|
||||
|
||||
# 最多点击
|
||||
# Most clicks
|
||||
MOST_CLICK = "click"
|
||||
|
||||
# 最新发布
|
||||
# Latest published
|
||||
LAST_PUBLISH = "pubdate"
|
||||
|
||||
# 最多弹幕
|
||||
# Most danmu (comments)
|
||||
MOST_DANMU = "dm"
|
||||
|
||||
# 最多收藏
|
||||
# Most bookmarks
|
||||
MOST_MARK = "stow"
|
||||
|
||||
|
||||
class CommentOrderType(Enum):
|
||||
# 仅按热度
|
||||
# By popularity only
|
||||
DEFAULT = 0
|
||||
|
||||
# 按热度+按时间
|
||||
# By popularity + time
|
||||
MIXED = 1
|
||||
|
||||
# 按时间
|
||||
# By time
|
||||
TIME = 2
|
||||
|
||||
@@ -21,8 +21,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 23:26
|
||||
# @Desc : bilibili 请求参数签名
|
||||
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
|
||||
# @Desc : bilibili request parameter signing
|
||||
# Reverse engineering implementation reference: https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
|
||||
import re
|
||||
import urllib.parse
|
||||
from hashlib import md5
|
||||
@@ -45,7 +45,7 @@ class BilibiliSign:
|
||||
|
||||
def get_salt(self) -> str:
|
||||
"""
|
||||
获取加盐的 key
|
||||
Get the salted key
|
||||
:return:
|
||||
"""
|
||||
salt = ""
|
||||
@@ -56,8 +56,8 @@ class BilibiliSign:
|
||||
|
||||
def sign(self, req_data: Dict) -> Dict:
|
||||
"""
|
||||
请求参数中加上当前时间戳对请求参数中的key进行字典序排序
|
||||
再将请求参数进行 url 编码集合 salt 进行 md5 就可以生成w_rid参数了
|
||||
Add current timestamp to request parameters, sort keys in dictionary order,
|
||||
then URL encode the parameters and combine with salt to generate md5 for w_rid parameter
|
||||
:param req_data:
|
||||
:return:
|
||||
"""
|
||||
@@ -65,35 +65,35 @@ class BilibiliSign:
|
||||
req_data.update({"wts": current_ts})
|
||||
req_data = dict(sorted(req_data.items()))
|
||||
req_data = {
|
||||
# 过滤 value 中的 "!'()*" 字符
|
||||
# Filter "!'()*" characters from values
|
||||
k: ''.join(filter(lambda ch: ch not in "!'()*", str(v)))
|
||||
for k, v
|
||||
in req_data.items()
|
||||
}
|
||||
query = urllib.parse.urlencode(req_data)
|
||||
salt = self.get_salt()
|
||||
wbi_sign = md5((query + salt).encode()).hexdigest() # 计算 w_rid
|
||||
wbi_sign = md5((query + salt).encode()).hexdigest() # Calculate w_rid
|
||||
req_data['w_rid'] = wbi_sign
|
||||
return req_data
|
||||
|
||||
|
||||
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
|
||||
"""
|
||||
从B站视频URL中解析出视频ID
|
||||
Parse video ID from Bilibili video URL
|
||||
Args:
|
||||
url: B站视频链接
|
||||
url: Bilibili video link
|
||||
- https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click
|
||||
- https://www.bilibili.com/video/BV1d54y1g7db
|
||||
- BV1d54y1g7db (直接传入BV号)
|
||||
- BV1d54y1g7db (directly pass BV number)
|
||||
Returns:
|
||||
VideoUrlInfo: 包含视频ID的对象
|
||||
VideoUrlInfo: Object containing video ID
|
||||
"""
|
||||
# 如果传入的已经是BV号,直接返回
|
||||
# If the input is already a BV number, return directly
|
||||
if url.startswith("BV"):
|
||||
return VideoUrlInfo(video_id=url)
|
||||
|
||||
# 使用正则表达式提取BV号
|
||||
# 匹配 /video/BV... 或 /video/av... 格式
|
||||
# Use regex to extract BV number
|
||||
# Match /video/BV... or /video/av... format
|
||||
bv_pattern = r'/video/(BV[a-zA-Z0-9]+)'
|
||||
match = re.search(bv_pattern, url)
|
||||
|
||||
@@ -101,26 +101,26 @@ def parse_video_info_from_url(url: str) -> VideoUrlInfo:
|
||||
video_id = match.group(1)
|
||||
return VideoUrlInfo(video_id=video_id)
|
||||
|
||||
raise ValueError(f"无法从URL中解析出视频ID: {url}")
|
||||
raise ValueError(f"Unable to parse video ID from URL: {url}")
|
||||
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
|
||||
"""
|
||||
从B站创作者空间URL中解析出创作者ID
|
||||
Parse creator ID from Bilibili creator space URL
|
||||
Args:
|
||||
url: B站创作者空间链接
|
||||
url: Bilibili creator space link
|
||||
- https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0
|
||||
- https://space.bilibili.com/20813884
|
||||
- 434377496 (直接传入UID)
|
||||
- 434377496 (directly pass UID)
|
||||
Returns:
|
||||
CreatorUrlInfo: 包含创作者ID的对象
|
||||
CreatorUrlInfo: Object containing creator ID
|
||||
"""
|
||||
# 如果传入的已经是纯数字ID,直接返回
|
||||
# If the input is already a numeric ID, return directly
|
||||
if url.isdigit():
|
||||
return CreatorUrlInfo(creator_id=url)
|
||||
|
||||
# 使用正则表达式提取UID
|
||||
# 匹配 /space.bilibili.com/数字 格式
|
||||
# Use regex to extract UID
|
||||
# Match /space.bilibili.com/number format
|
||||
uid_pattern = r'space\.bilibili\.com/(\d+)'
|
||||
match = re.search(uid_pattern, url)
|
||||
|
||||
@@ -128,20 +128,20 @@ def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
|
||||
creator_id = match.group(1)
|
||||
return CreatorUrlInfo(creator_id=creator_id)
|
||||
|
||||
raise ValueError(f"无法从URL中解析出创作者ID: {url}")
|
||||
raise ValueError(f"Unable to parse creator ID from URL: {url}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 测试视频URL解析
|
||||
# Test video URL parsing
|
||||
video_url1 = "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
|
||||
video_url2 = "BV1d54y1g7db"
|
||||
print("视频URL解析测试:")
|
||||
print("Video URL parsing test:")
|
||||
print(f"URL1: {video_url1} -> {parse_video_info_from_url(video_url1)}")
|
||||
print(f"URL2: {video_url2} -> {parse_video_info_from_url(video_url2)}")
|
||||
|
||||
# 测试创作者URL解析
|
||||
# Test creator URL parsing
|
||||
creator_url1 = "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
|
||||
creator_url2 = "20813884"
|
||||
print("\n创作者URL解析测试:")
|
||||
print("\nCreator URL parsing test:")
|
||||
print(f"URL1: {creator_url1} -> {parse_creator_info_from_url(creator_url1)}")
|
||||
print(f"URL2: {creator_url2} -> {parse_creator_info_from_url(creator_url2)}")
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : bilibli登录实现类
|
||||
# @Desc : bilibili login implementation class
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
|
||||
@@ -23,21 +23,21 @@ from enum import Enum
|
||||
|
||||
class SearchChannelType(Enum):
|
||||
"""search channel type"""
|
||||
GENERAL = "aweme_general" # 综合
|
||||
VIDEO = "aweme_video_web" # 视频
|
||||
USER = "aweme_user_web" # 用户
|
||||
LIVE = "aweme_live" # 直播
|
||||
GENERAL = "aweme_general" # General
|
||||
VIDEO = "aweme_video_web" # Video
|
||||
USER = "aweme_user_web" # User
|
||||
LIVE = "aweme_live" # Live
|
||||
|
||||
|
||||
class SearchSortType(Enum):
|
||||
"""search sort type"""
|
||||
GENERAL = 0 # 综合排序
|
||||
MOST_LIKE = 1 # 最多点赞
|
||||
LATEST = 2 # 最新发布
|
||||
GENERAL = 0 # Comprehensive sorting
|
||||
MOST_LIKE = 1 # Most likes
|
||||
LATEST = 2 # Latest published
|
||||
|
||||
class PublishTimeType(Enum):
|
||||
"""publish time type"""
|
||||
UNLIMITED = 0 # 不限
|
||||
ONE_DAY = 1 # 一天内
|
||||
ONE_WEEK = 7 # 一周内
|
||||
SIX_MONTH = 180 # 半年内
|
||||
UNLIMITED = 0 # Unlimited
|
||||
ONE_DAY = 1 # Within one day
|
||||
ONE_WEEK = 7 # Within one week
|
||||
SIX_MONTH = 180 # Within six months
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/10 02:24
|
||||
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
|
||||
# @Desc : Get a_bogus parameter, for learning and communication only, do not use for commercial purposes, contact author to delete if infringement
|
||||
|
||||
import random
|
||||
import re
|
||||
@@ -38,7 +38,7 @@ douyin_sign_obj = execjs.compile(open('libs/douyin.js', encoding='utf-8-sig').re
|
||||
|
||||
def get_web_id():
|
||||
"""
|
||||
生成随机的webid
|
||||
Generate random webid
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -60,13 +60,13 @@ def get_web_id():
|
||||
|
||||
async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Page = None):
|
||||
"""
|
||||
获取 a_bogus 参数, 目前不支持post请求类型的签名
|
||||
Get a_bogus parameter, currently does not support POST request type signature
|
||||
"""
|
||||
return get_a_bogus_from_js(url, params, user_agent)
|
||||
|
||||
def get_a_bogus_from_js(url: str, params: str, user_agent: str):
|
||||
"""
|
||||
通过js获取 a_bogus 参数
|
||||
Get a_bogus parameter through js
|
||||
Args:
|
||||
url:
|
||||
params:
|
||||
@@ -84,8 +84,8 @@ def get_a_bogus_from_js(url: str, params: str, user_agent: str):
|
||||
|
||||
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
|
||||
"""
|
||||
通过playright获取 a_bogus 参数
|
||||
playwright版本已失效
|
||||
Get a_bogus parameter through playwright
|
||||
playwright version is deprecated
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -100,73 +100,73 @@ async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: s
|
||||
|
||||
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
|
||||
"""
|
||||
从抖音视频URL中解析出视频ID
|
||||
支持以下格式:
|
||||
1. 普通视频链接: https://www.douyin.com/video/7525082444551310602
|
||||
2. 带modal_id参数的链接:
|
||||
Parse video ID from Douyin video URL
|
||||
Supports the following formats:
|
||||
1. Normal video link: https://www.douyin.com/video/7525082444551310602
|
||||
2. Link with modal_id parameter:
|
||||
- https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?modal_id=7525082444551310602
|
||||
- https://www.douyin.com/root/search/python?modal_id=7471165520058862848
|
||||
3. 短链接: https://v.douyin.com/iF12345ABC/ (需要client解析)
|
||||
4. 纯ID: 7525082444551310602
|
||||
3. Short link: https://v.douyin.com/iF12345ABC/ (requires client parsing)
|
||||
4. Pure ID: 7525082444551310602
|
||||
|
||||
Args:
|
||||
url: 抖音视频链接或ID
|
||||
url: Douyin video link or ID
|
||||
Returns:
|
||||
VideoUrlInfo: 包含视频ID的对象
|
||||
VideoUrlInfo: Object containing video ID
|
||||
"""
|
||||
# 如果是纯数字ID,直接返回
|
||||
# If it's a pure numeric ID, return directly
|
||||
if url.isdigit():
|
||||
return VideoUrlInfo(aweme_id=url, url_type="normal")
|
||||
|
||||
# 检查是否是短链接 (v.douyin.com)
|
||||
# Check if it's a short link (v.douyin.com)
|
||||
if "v.douyin.com" in url or url.startswith("http") and len(url) < 50 and "video" not in url:
|
||||
return VideoUrlInfo(aweme_id="", url_type="short") # 需要通过client解析
|
||||
return VideoUrlInfo(aweme_id="", url_type="short") # Requires client parsing
|
||||
|
||||
# 尝试从URL参数中提取modal_id
|
||||
# Try to extract modal_id from URL parameters
|
||||
params = extract_url_params_to_dict(url)
|
||||
modal_id = params.get("modal_id")
|
||||
if modal_id:
|
||||
return VideoUrlInfo(aweme_id=modal_id, url_type="modal")
|
||||
|
||||
# 从标准视频URL中提取ID: /video/数字
|
||||
# Extract ID from standard video URL: /video/number
|
||||
video_pattern = r'/video/(\d+)'
|
||||
match = re.search(video_pattern, url)
|
||||
if match:
|
||||
aweme_id = match.group(1)
|
||||
return VideoUrlInfo(aweme_id=aweme_id, url_type="normal")
|
||||
|
||||
raise ValueError(f"无法从URL中解析出视频ID: {url}")
|
||||
raise ValueError(f"Unable to parse video ID from URL: {url}")
|
||||
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
|
||||
"""
|
||||
从抖音创作者主页URL中解析出创作者ID (sec_user_id)
|
||||
支持以下格式:
|
||||
1. 创作者主页: https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main
|
||||
2. 纯ID: MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE
|
||||
Parse creator ID (sec_user_id) from Douyin creator homepage URL
|
||||
Supports the following formats:
|
||||
1. Creator homepage: https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main
|
||||
2. Pure ID: MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE
|
||||
|
||||
Args:
|
||||
url: 抖音创作者主页链接或sec_user_id
|
||||
url: Douyin creator homepage link or sec_user_id
|
||||
Returns:
|
||||
CreatorUrlInfo: 包含创作者ID的对象
|
||||
CreatorUrlInfo: Object containing creator ID
|
||||
"""
|
||||
# 如果是纯ID格式(通常以MS4wLjABAAAA开头),直接返回
|
||||
# If it's a pure ID format (usually starts with MS4wLjABAAAA), return directly
|
||||
if url.startswith("MS4wLjABAAAA") or (not url.startswith("http") and "douyin.com" not in url):
|
||||
return CreatorUrlInfo(sec_user_id=url)
|
||||
|
||||
# 从创作者主页URL中提取sec_user_id: /user/xxx
|
||||
# Extract sec_user_id from creator homepage URL: /user/xxx
|
||||
user_pattern = r'/user/([^/?]+)'
|
||||
match = re.search(user_pattern, url)
|
||||
if match:
|
||||
sec_user_id = match.group(1)
|
||||
return CreatorUrlInfo(sec_user_id=sec_user_id)
|
||||
|
||||
raise ValueError(f"无法从URL中解析出创作者ID: {url}")
|
||||
raise ValueError(f"Unable to parse creator ID from URL: {url}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 测试视频URL解析
|
||||
print("=== 视频URL解析测试 ===")
|
||||
# Test video URL parsing
|
||||
print("=== Video URL Parsing Test ===")
|
||||
test_urls = [
|
||||
"https://www.douyin.com/video/7525082444551310602",
|
||||
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525082444551310602",
|
||||
@@ -177,13 +177,13 @@ if __name__ == '__main__':
|
||||
try:
|
||||
result = parse_video_info_from_url(url)
|
||||
print(f"✓ URL: {url[:80]}...")
|
||||
print(f" 结果: {result}\n")
|
||||
print(f" Result: {result}\n")
|
||||
except Exception as e:
|
||||
print(f"✗ URL: {url}")
|
||||
print(f" 错误: {e}\n")
|
||||
print(f" Error: {e}\n")
|
||||
|
||||
# 测试创作者URL解析
|
||||
print("=== 创作者URL解析测试 ===")
|
||||
# Test creator URL parsing
|
||||
print("=== Creator URL Parsing Test ===")
|
||||
test_creator_urls = [
|
||||
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",
|
||||
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
|
||||
@@ -192,7 +192,7 @@ if __name__ == '__main__':
|
||||
try:
|
||||
result = parse_creator_info_from_url(url)
|
||||
print(f"✓ URL: {url[:80]}...")
|
||||
print(f" 结果: {result}\n")
|
||||
print(f" Result: {result}\n")
|
||||
except Exception as e:
|
||||
print(f"✗ URL: {url}")
|
||||
print(f" 错误: {e}\n")
|
||||
print(f" Error: {e}\n")
|
||||
|
||||
@@ -53,7 +53,7 @@ class DouYinLogin(AbstractLogin):
|
||||
async def begin(self):
|
||||
"""
|
||||
Start login douyin website
|
||||
滑块中间页面的验证准确率不太OK... 如果没有特俗要求,建议不开抖音登录,或者使用cookies登录
|
||||
The verification accuracy of the slider verification is not very good... If there are no special requirements, it is recommended not to use Douyin login, or use cookie login
|
||||
"""
|
||||
|
||||
# popup login dialog
|
||||
@@ -69,7 +69,7 @@ class DouYinLogin(AbstractLogin):
|
||||
else:
|
||||
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||
|
||||
# 如果页面重定向到滑动验证码页面,需要再次滑动滑块
|
||||
# If the page redirects to the slider verification page, need to slide again
|
||||
await asyncio.sleep(6)
|
||||
current_page_title = await self.context_page.title()
|
||||
if "验证码中间页" in current_page_title:
|
||||
@@ -147,10 +147,10 @@ class DouYinLogin(AbstractLogin):
|
||||
send_sms_code_btn = self.context_page.locator("xpath=//span[text() = '获取验证码']")
|
||||
await send_sms_code_btn.click()
|
||||
|
||||
# 检查是否有滑动验证码
|
||||
# Check if there is slider verification
|
||||
await self.check_page_display_slider(move_step=10, slider_level="easy")
|
||||
cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
max_get_sms_code_time = 60 * 2 # Maximum time to get verification code is 2 minutes
|
||||
while max_get_sms_code_time > 0:
|
||||
utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
@@ -164,20 +164,20 @@ class DouYinLogin(AbstractLogin):
|
||||
await sms_code_input_ele.fill(value=sms_code_value.decode())
|
||||
await asyncio.sleep(0.5)
|
||||
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
|
||||
await submit_btn_ele.click() # 点击登录
|
||||
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
|
||||
await submit_btn_ele.click() # Click login
|
||||
# todo ... should also check the correctness of the verification code, it may be incorrect
|
||||
break
|
||||
|
||||
async def check_page_display_slider(self, move_step: int = 10, slider_level: str = "easy"):
|
||||
"""
|
||||
检查页面是否出现滑动验证码
|
||||
Check if slider verification appears on the page
|
||||
:return:
|
||||
"""
|
||||
# 等待滑动验证码的出现
|
||||
# Wait for slider verification to appear
|
||||
back_selector = "#captcha-verify-image"
|
||||
try:
|
||||
await self.context_page.wait_for_selector(selector=back_selector, state="visible", timeout=30 * 1000)
|
||||
except PlaywrightTimeoutError: # 没有滑动验证码,直接返回
|
||||
except PlaywrightTimeoutError: # No slider verification, return directly
|
||||
return
|
||||
|
||||
gap_selector = 'xpath=//*[@id="captcha_container"]/div/div[2]/img[2]'
|
||||
@@ -191,16 +191,16 @@ class DouYinLogin(AbstractLogin):
|
||||
await self.move_slider(back_selector, gap_selector, move_step, slider_level)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮
|
||||
# If the slider is too slow or verification failed, it will prompt "操作过慢", click the refresh button here
|
||||
page_content = await self.context_page.content()
|
||||
if "操作过慢" in page_content or "提示重新操作" in page_content:
|
||||
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
|
||||
await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
|
||||
continue
|
||||
|
||||
# 滑动成功后,等待滑块消失
|
||||
# After successful sliding, wait for the slider to disappear
|
||||
await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
|
||||
# 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码
|
||||
# If the slider disappears, it means the verification is successful, break the loop. If not, it means the verification failed, the above line will throw an exception and be caught to continue the loop
|
||||
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
|
||||
slider_verify_success = True
|
||||
except Exception as e:
|
||||
@@ -213,10 +213,10 @@ class DouYinLogin(AbstractLogin):
|
||||
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
|
||||
"""
|
||||
Move the slider to the right to complete the verification
|
||||
:param back_selector: 滑动验证码背景图片的选择器
|
||||
:param gap_selector: 滑动验证码的滑块选择器
|
||||
:param move_step: 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
|
||||
:param slider_level: 滑块难度 easy hard,分别对应手机验证码的滑块和验证码中间的滑块
|
||||
:param back_selector: Selector for the slider verification background image
|
||||
:param gap_selector: Selector for the slider verification slider
|
||||
:param move_step: Controls the ratio of single movement speed, default is 1, meaning the distance moves in 0.1 seconds no matter how far, larger value means slower
|
||||
:param slider_level: Slider difficulty easy hard, corresponding to the slider for mobile verification code and the slider in the middle of verification code
|
||||
:return:
|
||||
"""
|
||||
|
||||
@@ -234,31 +234,31 @@ class DouYinLogin(AbstractLogin):
|
||||
)
|
||||
gap_src = str(await gap_elements.get_property("src")) # type: ignore
|
||||
|
||||
# 识别滑块位置
|
||||
# Identify slider position
|
||||
slide_app = utils.Slide(gap=gap_src, bg=slide_back)
|
||||
distance = slide_app.discern()
|
||||
|
||||
# 获取移动轨迹
|
||||
# Get movement trajectory
|
||||
tracks = utils.get_tracks(distance, slider_level)
|
||||
new_1 = tracks[-1] - (sum(tracks) - distance)
|
||||
tracks.pop()
|
||||
tracks.append(new_1)
|
||||
|
||||
# 根据轨迹拖拽滑块到指定位置
|
||||
# Drag slider to specified position according to trajectory
|
||||
element = await self.context_page.query_selector(gap_selector)
|
||||
bounding_box = await element.bounding_box() # type: ignore
|
||||
|
||||
await self.context_page.mouse.move(bounding_box["x"] + bounding_box["width"] / 2, # type: ignore
|
||||
bounding_box["y"] + bounding_box["height"] / 2) # type: ignore
|
||||
# 这里获取到x坐标中心点位置
|
||||
# Get x coordinate center position
|
||||
x = bounding_box["x"] + bounding_box["width"] / 2 # type: ignore
|
||||
# 模拟滑动操作
|
||||
# Simulate sliding operation
|
||||
await element.hover() # type: ignore
|
||||
await self.context_page.mouse.down()
|
||||
|
||||
for track in tracks:
|
||||
# 循环鼠标按照轨迹移动
|
||||
# steps 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
|
||||
# Loop mouse movement according to trajectory
|
||||
# steps controls the ratio of single movement speed, default is 1, meaning the distance moves in 0.1 seconds no matter how far, larger value means slower
|
||||
await self.context_page.mouse.move(x + track, 0, steps=move_step)
|
||||
x += track
|
||||
await self.context_page.mouse.up()
|
||||
|
||||
@@ -57,11 +57,11 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
self.graphql = KuaiShouGraphQL()
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Any:
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check if proxy is expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
@@ -222,7 +222,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
comments = vision_commen_list.get("rootComments", [])
|
||||
if len(result) + len(comments) > max_count:
|
||||
comments = comments[: max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If there is a callback function, execute the callback function
|
||||
await callback(photo_id, comments)
|
||||
result.extend(comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
@@ -240,12 +240,12 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
|
||||
Get all second-level comments under specified first-level comments, this method will continue to find all second-level comment information under first-level comments
|
||||
Args:
|
||||
comments: 评论列表
|
||||
photo_id: 视频id
|
||||
crawl_interval: 爬取一次评论的延迟单位(秒)
|
||||
callback: 一次评论爬取结束后
|
||||
comments: Comment list
|
||||
photo_id: Video ID
|
||||
crawl_interval: Delay unit for crawling comments once (seconds)
|
||||
callback: Callback after one comment crawl ends
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -285,7 +285,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
async def get_creator_info(self, user_id: str) -> Dict:
|
||||
"""
|
||||
eg: https://www.kuaishou.com/profile/3x4jtnbfter525a
|
||||
快手用户主页
|
||||
Kuaishou user homepage
|
||||
"""
|
||||
|
||||
visionProfile = await self.get_creator_profile(user_id)
|
||||
@@ -298,11 +298,11 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
||||
Get all posts published by the specified user, this method will continue to find all post information under a user
|
||||
Args:
|
||||
user_id: 用户ID
|
||||
crawl_interval: 爬取一次的延迟单位(秒)
|
||||
callback: 一次分页爬取结束后的更新回调函数
|
||||
user_id: User ID
|
||||
crawl_interval: Delay unit for crawling once (seconds)
|
||||
callback: Update callback function after one page crawl ends
|
||||
Returns:
|
||||
|
||||
"""
|
||||
|
||||
@@ -58,7 +58,7 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
self.index_url = "https://www.kuaishou.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool, used for automatic proxy refresh
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
@@ -72,9 +72,9 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Select startup mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[KuaishouCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[KuaishouCrawler] Launching browser using CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -82,7 +82,7 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[KuaishouCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[KuaishouCrawler] Launching browser using standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
@@ -318,7 +318,7 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
|
||||
proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh
|
||||
)
|
||||
return ks_client_obj
|
||||
|
||||
@@ -344,7 +344,7 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=user_agent,
|
||||
channel="chrome", # 使用系统的Chrome稳定版
|
||||
channel="chrome", # Use system's stable Chrome version
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
@@ -362,7 +362,7 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
Launch browser using CDP mode
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
@@ -373,17 +373,17 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[KuaishouCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[KuaishouCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(
|
||||
f"[KuaishouCrawler] CDP模式启动失败,回退到标准模式: {e}"
|
||||
f"[KuaishouCrawler] CDP mode launch failed, fallback to standard mode: {e}"
|
||||
)
|
||||
# 回退到标准模式
|
||||
# Fallback to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(
|
||||
chromium, playwright_proxy, user_agent, headless
|
||||
@@ -438,7 +438,7 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# If using CDP mode, need special handling
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
|
||||
@@ -18,8 +18,8 @@
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# 快手的数据传输是基于GraphQL实现的
|
||||
# 这个类负责获取一些GraphQL的schema
|
||||
# Kuaishou's data transmission is based on GraphQL
|
||||
# This class is responsible for obtaining some GraphQL schemas
|
||||
from typing import Dict
|
||||
|
||||
|
||||
|
||||
@@ -26,59 +26,59 @@ from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
|
||||
|
||||
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
    """
    Resolve a Kuaishou video reference into a VideoUrlInfo.

    Accepted inputs:
    1. Full video URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
    2. Bare video ID: "3x3zxz4mjrsc8ke"

    Args:
        url: Kuaishou video link or bare video ID
    Returns:
        VideoUrlInfo: Object carrying the video ID (url_type is always "normal")
    Raises:
        ValueError: The input looks like a URL but has no /short-video/<id> segment
    """
    # Anything that neither starts with "http" nor mentions kuaishou.com is taken verbatim as the ID
    looks_like_url = url.startswith("http") or "kuaishou.com" in url
    if not looks_like_url:
        return VideoUrlInfo(video_id=url, url_type="normal")

    # Standard video URL layout: /short-video/<video_id>
    found = re.search(r'/short-video/([a-zA-Z0-9_-]+)', url)
    if found is None:
        raise ValueError(f"Unable to parse video ID from URL: {url}")

    return VideoUrlInfo(video_id=found.group(1), url_type="normal")
|
||||
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Resolve a Kuaishou creator reference into a CreatorUrlInfo.

    Accepted inputs:
    1. Creator homepage: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
    2. Bare ID: "3x4sm73aye7jq7i"

    Args:
        url: Kuaishou creator homepage link or bare user_id
    Returns:
        CreatorUrlInfo: Object carrying the creator's user_id
    Raises:
        ValueError: The input looks like a URL but has no /profile/<id> segment
    """
    # Anything that neither starts with "http" nor mentions kuaishou.com is taken verbatim as the ID
    looks_like_url = url.startswith("http") or "kuaishou.com" in url
    if not looks_like_url:
        return CreatorUrlInfo(user_id=url)

    # Creator homepage layout: /profile/<user_id>
    found = re.search(r'/profile/([a-zA-Z0-9_-]+)', url)
    if found is None:
        raise ValueError(f"Unable to parse creator ID from URL: {url}")

    return CreatorUrlInfo(user_id=found.group(1))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 测试视频URL解析
|
||||
print("=== 视频URL解析测试 ===")
|
||||
# Test video URL parsing
|
||||
print("=== Video URL Parsing Test ===")
|
||||
test_video_urls = [
|
||||
"https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
|
||||
"3xf8enb8dbj6uig",
|
||||
@@ -87,13 +87,13 @@ if __name__ == '__main__':
|
||||
try:
|
||||
result = parse_video_info_from_url(url)
|
||||
print(f"✓ URL: {url[:80]}...")
|
||||
print(f" 结果: {result}\n")
|
||||
print(f" Result: {result}\n")
|
||||
except Exception as e:
|
||||
print(f"✗ URL: {url}")
|
||||
print(f" 错误: {e}\n")
|
||||
print(f" Error: {e}\n")
|
||||
|
||||
# 测试创作者URL解析
|
||||
print("=== 创作者URL解析测试 ===")
|
||||
# Test creator URL parsing
|
||||
print("=== Creator URL Parsing Test ===")
|
||||
test_creator_urls = [
|
||||
"https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
|
||||
"3x4sm73aye7jq7i",
|
||||
@@ -102,7 +102,7 @@ if __name__ == '__main__':
|
||||
try:
|
||||
result = parse_creator_info_from_url(url)
|
||||
print(f"✓ URL: {url[:80]}...")
|
||||
print(f" 结果: {result}\n")
|
||||
print(f" Result: {result}\n")
|
||||
except Exception as e:
|
||||
print(f"✗ URL: {url}")
|
||||
print(f" 错误: {e}\n")
|
||||
print(f" Error: {e}\n")
|
||||
|
||||
@@ -48,7 +48,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
):
|
||||
self.ip_pool: Optional[ProxyIpPool] = ip_pool
|
||||
self.timeout = timeout
|
||||
# 使用传入的headers(包含真实浏览器UA)或默认headers
|
||||
# Use provided headers (including real browser UA) or default headers
|
||||
self.headers = headers or {
|
||||
"User-Agent": utils.get_user_agent(),
|
||||
"Cookie": "",
|
||||
@@ -56,21 +56,21 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
self._host = "https://tieba.baidu.com"
|
||||
self._page_extractor = TieBaExtractor()
|
||||
self.default_ip_proxy = default_ip_proxy
|
||||
self.playwright_page = playwright_page # Playwright页面对象
|
||||
self.playwright_page = playwright_page # Playwright page object
|
||||
|
||||
def _sync_request(self, method, url, proxy=None, **kwargs):
|
||||
"""
|
||||
同步的requests请求方法
|
||||
Synchronous requests method
|
||||
Args:
|
||||
method: 请求方法
|
||||
url: 请求的URL
|
||||
proxy: 代理IP
|
||||
**kwargs: 其他请求参数
|
||||
method: Request method
|
||||
url: Request URL
|
||||
proxy: Proxy IP
|
||||
**kwargs: Other request parameters
|
||||
|
||||
Returns:
|
||||
response对象
|
||||
Response object
|
||||
"""
|
||||
# 构造代理字典
|
||||
# Construct proxy dictionary
|
||||
proxies = None
|
||||
if proxy:
|
||||
proxies = {
|
||||
@@ -78,7 +78,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
"https": proxy,
|
||||
}
|
||||
|
||||
# 发送请求
|
||||
# Send request
|
||||
response = requests.request(
|
||||
method=method,
|
||||
url=url,
|
||||
@@ -91,7 +91,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
async def _refresh_proxy_if_expired(self) -> None:
|
||||
"""
|
||||
检测代理是否过期,如果过期则自动刷新
|
||||
Check if proxy is expired and automatically refresh if necessary
|
||||
"""
|
||||
if self.ip_pool is None:
|
||||
return
|
||||
@@ -101,7 +101,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
"[BaiduTieBaClient._refresh_proxy_if_expired] Proxy expired, refreshing..."
|
||||
)
|
||||
new_proxy = await self.ip_pool.get_or_refresh_proxy()
|
||||
# 更新代理URL
|
||||
# Update proxy URL
|
||||
_, self.default_ip_proxy = utils.format_proxy_info(new_proxy)
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaClient._refresh_proxy_if_expired] New proxy: {new_proxy.ip}:{new_proxy.port}"
|
||||
@@ -110,23 +110,23 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]:
|
||||
"""
|
||||
封装requests的公共请求方法,对请求响应做一些处理
|
||||
Common request method wrapper for requests, handles request responses
|
||||
Args:
|
||||
method: 请求方法
|
||||
url: 请求的URL
|
||||
return_ori_content: 是否返回原始内容
|
||||
proxy: 代理IP
|
||||
**kwargs: 其他请求参数,例如请求头、请求体等
|
||||
method: Request method
|
||||
url: Request URL
|
||||
return_ori_content: Whether to return original content
|
||||
proxy: Proxy IP
|
||||
**kwargs: Other request parameters, such as headers, request body, etc.
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check if proxy is expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
actual_proxy = proxy if proxy else self.default_ip_proxy
|
||||
|
||||
# 在线程池中执行同步的requests请求
|
||||
# Execute synchronous requests in thread pool
|
||||
response = await asyncio.to_thread(
|
||||
self._sync_request,
|
||||
method,
|
||||
@@ -151,11 +151,11 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any:
|
||||
"""
|
||||
GET请求,对请求头签名
|
||||
GET request with header signing
|
||||
Args:
|
||||
uri: 请求路由
|
||||
params: 请求参数
|
||||
return_ori_content: 是否返回原始内容
|
||||
uri: Request route
|
||||
params: Request parameters
|
||||
return_ori_content: Whether to return original content
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -175,15 +175,15 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
self.default_ip_proxy = proxy
|
||||
return res
|
||||
|
||||
utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
|
||||
raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
|
||||
utils.logger.error(f"[BaiduTieBaClient.get] Reached maximum retry attempts, IP is blocked, please try a new IP proxy: {e}")
|
||||
raise Exception(f"[BaiduTieBaClient.get] Reached maximum retry attempts, IP is blocked, please try a new IP proxy: {e}")
|
||||
|
||||
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
|
||||
"""
|
||||
POST请求,对请求头签名
|
||||
POST request with header signing
|
||||
Args:
|
||||
uri: 请求路由
|
||||
data: 请求体参数
|
||||
uri: Request route
|
||||
data: Request body parameters
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -193,13 +193,13 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
async def pong(self, browser_context: BrowserContext = None) -> bool:
|
||||
"""
|
||||
用于检查登录态是否失效了
|
||||
使用Cookie检测而非API调用,避免被检测
|
||||
Check if login state is still valid
|
||||
Uses Cookie detection instead of API calls to avoid detection
|
||||
Args:
|
||||
browser_context: 浏览器上下文对象
|
||||
browser_context: Browser context object
|
||||
|
||||
Returns:
|
||||
bool: True表示已登录,False表示未登录
|
||||
bool: True if logged in, False if not logged in
|
||||
"""
|
||||
utils.logger.info("[BaiduTieBaClient.pong] Begin to check tieba login state by cookies...")
|
||||
|
||||
@@ -208,13 +208,13 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
return False
|
||||
|
||||
try:
|
||||
# 从浏览器获取cookies并检查关键登录cookie
|
||||
# Get cookies from browser and check key login cookies
|
||||
_, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
|
||||
# 百度贴吧的登录标识: STOKEN 或 PTOKEN
|
||||
# Baidu Tieba login identifiers: STOKEN or PTOKEN
|
||||
stoken = cookie_dict.get("STOKEN")
|
||||
ptoken = cookie_dict.get("PTOKEN")
|
||||
bduss = cookie_dict.get("BDUSS") # 百度通用登录cookie
|
||||
bduss = cookie_dict.get("BDUSS") # Baidu universal login cookie
|
||||
|
||||
if stoken or ptoken or bduss:
|
||||
utils.logger.info(f"[BaiduTieBaClient.pong] Login state verified by cookies (STOKEN: {bool(stoken)}, PTOKEN: {bool(ptoken)}, BDUSS: {bool(bduss)})")
|
||||
@@ -229,9 +229,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
"""
|
||||
API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法
|
||||
Update cookies method provided by API client, usually called after successful login
|
||||
Args:
|
||||
browser_context: 浏览器上下文对象
|
||||
browser_context: Browser context object
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -249,13 +249,13 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
|
||||
) -> List[TiebaNote]:
|
||||
"""
|
||||
根据关键词搜索贴吧帖子 (使用Playwright访问页面,避免API检测)
|
||||
Search Tieba posts by keyword (uses Playwright to access page, avoiding API detection)
|
||||
Args:
|
||||
keyword: 关键词
|
||||
page: 分页第几页
|
||||
page_size: 每页大小
|
||||
sort: 结果排序方式
|
||||
note_type: 帖子类型(主题贴|主题+回复混合模式)
|
||||
keyword: Keyword
|
||||
page: Page number
|
||||
page_size: Page size
|
||||
sort: Result sort method
|
||||
note_type: Post type (main thread | main thread + reply mixed mode)
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -263,8 +263,8 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
utils.logger.error("[BaiduTieBaClient.get_notes_by_keyword] playwright_page is None, cannot use browser mode")
|
||||
raise Exception("playwright_page is required for browser-based search")
|
||||
|
||||
# 构造搜索URL
|
||||
# 示例: https://tieba.baidu.com/f/search/res?ie=utf-8&qw=编程
|
||||
# Construct search URL
|
||||
# Example: https://tieba.baidu.com/f/search/res?ie=utf-8&qw=keyword
|
||||
search_url = f"{self._host}/f/search/res"
|
||||
params = {
|
||||
"ie": "utf-8",
|
||||
@@ -275,64 +275,64 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
"only_thread": note_type.value,
|
||||
}
|
||||
|
||||
# 拼接完整URL
|
||||
# Concatenate full URL
|
||||
full_url = f"{search_url}?{urlencode(params)}"
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 访问搜索页面: {full_url}")
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Accessing search page: {full_url}")
|
||||
|
||||
try:
|
||||
# 使用Playwright访问搜索页面
|
||||
# Use Playwright to access search page
|
||||
await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
# Wait for page loading, using delay setting from config file
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
# 获取页面HTML内容
|
||||
# Get page HTML content
|
||||
page_content = await self.playwright_page.content()
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 成功获取搜索页面HTML,长度: {len(page_content)}")
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Successfully retrieved search page HTML, length: {len(page_content)}")
|
||||
|
||||
# 提取搜索结果
|
||||
# Extract search results
|
||||
notes = self._page_extractor.extract_search_note_list(page_content)
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 提取到 {len(notes)} 条帖子")
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Extracted {len(notes)} posts")
|
||||
return notes
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BaiduTieBaClient.get_notes_by_keyword] 搜索失败: {e}")
|
||||
utils.logger.error(f"[BaiduTieBaClient.get_notes_by_keyword] Search failed: {e}")
|
||||
raise
|
||||
|
||||
async def get_note_by_id(self, note_id: str) -> TiebaNote:
    """
    Fetch one post's detail by loading its page in the Playwright page.

    The page at "<host>/p/<note_id>" is opened, the coroutine sleeps for
    config.CRAWLER_MAX_SLEEP_SEC seconds, and the resulting HTML is handed
    to the page extractor.

    Args:
        note_id: Post ID

    Returns:
        TiebaNote: Post detail object produced by the page extractor

    Raises:
        Exception: If no Playwright page is attached; any error raised while
            loading or extracting is logged and re-raised unchanged
    """
    if not self.playwright_page:
        utils.logger.error("[BaiduTieBaClient.get_note_by_id] playwright_page is None, cannot use browser mode")
        raise Exception("playwright_page is required for browser-based note detail fetching")

    # Post detail pages live under /p/<note_id>
    detail_url = f"{self._host}/p/{note_id}"
    utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Accessing post detail page: {detail_url}")

    try:
        await self.playwright_page.goto(detail_url, wait_until="domcontentloaded")

        # Give the page time to finish rendering; the delay comes from the config file
        await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

        html = await self.playwright_page.content()
        utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Successfully retrieved post detail HTML, length: {len(html)}")

        return self._page_extractor.extract_note_detail(html)
    except Exception as exc:
        utils.logger.error(f"[BaiduTieBaClient.get_note_by_id] Failed to get post details: {exc}")
        raise
|
||||
|
||||
async def get_note_all_comments(
|
||||
@@ -343,14 +343,14 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
max_count: int = 10,
|
||||
) -> List[TiebaComment]:
|
||||
"""
|
||||
获取指定帖子下的所有一级评论 (使用Playwright访问页面,避免API检测)
|
||||
Get all first-level comments for specified post (uses Playwright to access page, avoiding API detection)
|
||||
Args:
|
||||
note_detail: 帖子详情对象
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后的回调函数
|
||||
max_count: 一次帖子爬取的最大评论数量
|
||||
note_detail: Post detail object
|
||||
crawl_interval: Crawl delay interval in seconds
|
||||
callback: Callback function after one post crawl completes
|
||||
max_count: Maximum number of comments to crawl per post
|
||||
Returns:
|
||||
List[TiebaComment]: 评论列表
|
||||
List[TiebaComment]: Comment list
|
||||
"""
|
||||
if not self.playwright_page:
|
||||
utils.logger.error("[BaiduTieBaClient.get_note_all_comments] playwright_page is None, cannot use browser mode")
|
||||
@@ -360,30 +360,30 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
current_page = 1
|
||||
|
||||
while note_detail.total_replay_page >= current_page and len(result) < max_count:
|
||||
# 构造评论页URL
|
||||
# Construct comment page URL
|
||||
comment_url = f"{self._host}/p/{note_detail.note_id}?pn={current_page}"
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 访问评论页面: {comment_url}")
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Accessing comment page: {comment_url}")
|
||||
|
||||
try:
|
||||
# 使用Playwright访问评论页面
|
||||
# Use Playwright to access comment page
|
||||
await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
# Wait for page loading, using delay setting from config file
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
# 获取页面HTML内容
|
||||
# Get page HTML content
|
||||
page_content = await self.playwright_page.content()
|
||||
|
||||
# 提取评论
|
||||
# Extract comments
|
||||
comments = self._page_extractor.extract_tieba_note_parment_comments(
|
||||
page_content, note_id=note_detail.note_id
|
||||
)
|
||||
|
||||
if not comments:
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有评论,停止爬取")
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Page {current_page} has no comments, stopping crawl")
|
||||
break
|
||||
|
||||
# 限制评论数量
|
||||
# Limit comment count
|
||||
if len(result) + len(comments) > max_count:
|
||||
comments = comments[:max_count - len(result)]
|
||||
|
||||
@@ -392,7 +392,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
result.extend(comments)
|
||||
|
||||
# 获取所有子评论
|
||||
# Get all sub-comments
|
||||
await self.get_comments_all_sub_comments(
|
||||
comments, crawl_interval=crawl_interval, callback=callback
|
||||
)
|
||||
@@ -401,10 +401,10 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
current_page += 1
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BaiduTieBaClient.get_note_all_comments] 获取第{current_page}页评论失败: {e}")
|
||||
utils.logger.error(f"[BaiduTieBaClient.get_note_all_comments] Failed to get page {current_page} comments: {e}")
|
||||
break
|
||||
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 共获取 {len(result)} 条一级评论")
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Total retrieved {len(result)} first-level comments")
|
||||
return result
|
||||
|
||||
async def get_comments_all_sub_comments(
|
||||
@@ -414,14 +414,14 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[TiebaComment]:
|
||||
"""
|
||||
获取指定评论下的所有子评论 (使用Playwright访问页面,避免API检测)
|
||||
Get all sub-comments for specified comments (uses Playwright to access page, avoiding API detection)
|
||||
Args:
|
||||
comments: 评论列表
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后的回调函数
|
||||
comments: Comment list
|
||||
crawl_interval: Crawl delay interval in seconds
|
||||
callback: Callback function after one post crawl completes
|
||||
|
||||
Returns:
|
||||
List[TiebaComment]: 子评论列表
|
||||
List[TiebaComment]: Sub-comment list
|
||||
"""
|
||||
if not config.ENABLE_GET_SUB_COMMENTS:
|
||||
return []
|
||||
@@ -440,7 +440,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
|
||||
|
||||
while max_sub_page_num >= current_page:
|
||||
# 构造子评论URL
|
||||
# Construct sub-comment URL
|
||||
sub_comment_url = (
|
||||
f"{self._host}/p/comment?"
|
||||
f"tid={parment_comment.note_id}&"
|
||||
@@ -448,19 +448,19 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
f"fid={parment_comment.tieba_id}&"
|
||||
f"pn={current_page}"
|
||||
)
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] 访问子评论页面: {sub_comment_url}")
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] Accessing sub-comment page: {sub_comment_url}")
|
||||
|
||||
try:
|
||||
# 使用Playwright访问子评论页面
|
||||
# Use Playwright to access sub-comment page
|
||||
await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
# Wait for page loading, using delay setting from config file
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
# 获取页面HTML内容
|
||||
# Get page HTML content
|
||||
page_content = await self.playwright_page.content()
|
||||
|
||||
# 提取子评论
|
||||
# Extract sub-comments
|
||||
sub_comments = self._page_extractor.extract_tieba_note_sub_comments(
|
||||
page_content, parent_comment=parment_comment
|
||||
)
|
||||
@@ -468,7 +468,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
if not sub_comments:
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaClient.get_comments_all_sub_comments] "
|
||||
f"评论{parment_comment.comment_id}第{current_page}页没有子评论,停止爬取"
|
||||
f"Comment {parment_comment.comment_id} page {current_page} has no sub-comments, stopping crawl"
|
||||
)
|
||||
break
|
||||
|
||||
@@ -482,125 +482,125 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
except Exception as e:
|
||||
utils.logger.error(
|
||||
f"[BaiduTieBaClient.get_comments_all_sub_comments] "
|
||||
f"获取评论{parment_comment.comment_id}第{current_page}页子评论失败: {e}"
|
||||
f"Failed to get comment {parment_comment.comment_id} page {current_page} sub-comments: {e}"
|
||||
)
|
||||
break
|
||||
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] 共获取 {len(all_sub_comments)} 条子评论")
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] Total retrieved {len(all_sub_comments)} sub-comments")
|
||||
return all_sub_comments
|
||||
|
||||
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
    """
    List the posts of a Tieba forum by loading its page in the Playwright page.

    The page at "<host>/f?kw=<quoted name>&pn=<page_num>" is opened, the
    coroutine sleeps for config.CRAWLER_MAX_SLEEP_SEC seconds, and the
    resulting HTML is handed to the page extractor.

    Args:
        tieba_name: Tieba name (URL-quoted before being placed in the query string)
        page_num: Value sent as the "pn" query parameter

    Returns:
        List[TiebaNote]: Posts extracted from the page

    Raises:
        Exception: If no Playwright page is attached; any error raised while
            loading or extracting is logged and re-raised unchanged
    """
    if not self.playwright_page:
        utils.logger.error("[BaiduTieBaClient.get_notes_by_tieba_name] playwright_page is None, cannot use browser mode")
        raise Exception("playwright_page is required for browser-based tieba note fetching")

    # Forum listing URL
    listing_url = f"{self._host}/f?kw={quote(tieba_name)}&pn={page_num}"
    utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Accessing Tieba page: {listing_url}")

    try:
        await self.playwright_page.goto(listing_url, wait_until="domcontentloaded")

        # Give the page time to finish rendering; the delay comes from the config file
        await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

        html = await self.playwright_page.content()
        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Successfully retrieved Tieba page HTML, length: {len(html)}")

        posts = self._page_extractor.extract_tieba_note_list(html)
        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Extracted {len(posts)} posts")
        return posts
    except Exception as exc:
        utils.logger.error(f"[BaiduTieBaClient.get_notes_by_tieba_name] Failed to get Tieba post list: {exc}")
        raise
|
||||
|
||||
async def get_creator_info_by_url(self, creator_url: str) -> str:
    """
    Load a creator's homepage in the Playwright page and return its HTML.

    The URL is opened as given, the coroutine sleeps for
    config.CRAWLER_MAX_SLEEP_SEC seconds, and the page content is returned
    without further parsing.

    Args:
        creator_url: Creator homepage URL

    Returns:
        str: Page HTML content

    Raises:
        Exception: If no Playwright page is attached; any error raised while
            loading the page is logged and re-raised unchanged
    """
    if not self.playwright_page:
        utils.logger.error("[BaiduTieBaClient.get_creator_info_by_url] playwright_page is None, cannot use browser mode")
        raise Exception("playwright_page is required for browser-based creator info fetching")

    utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Accessing creator homepage: {creator_url}")

    try:
        await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")

        # Give the page time to finish rendering; the delay comes from the config file
        await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

        html = await self.playwright_page.content()
        utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Successfully retrieved creator homepage HTML, length: {len(html)}")
        return html
    except Exception as exc:
        utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] Failed to get creator homepage: {exc}")
        raise
|
||||
|
||||
async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
    """
    Get one page of a creator's posts (uses Playwright to access the page, avoiding API detection)

    The endpoint "<host>/home/get/getthread" is opened in the Playwright page,
    the coroutine sleeps for config.CRAWLER_MAX_SLEEP_SEC seconds, and the
    text of the document body is parsed as JSON.

    Args:
        user_name: Creator username (URL-quoted before being placed in the query string)
        page_number: Value sent as the "pn" query parameter

    Returns:
        Dict: Parsed JSON payload containing post data

    Raises:
        Exception: If no Playwright page is attached, or if the body text is
            not valid JSON (chained from the underlying json.JSONDecodeError);
            any other error raised while loading is logged and re-raised unchanged
    """
    if not self.playwright_page:
        utils.logger.error("[BaiduTieBaClient.get_notes_by_creator] playwright_page is None, cannot use browser mode")
        raise Exception("playwright_page is required for browser-based creator notes fetching")

    # Construct creator post list URL; "_" carries the current timestamp
    creator_url = f"{self._host}/home/get/getthread?un={quote(user_name)}&pn={page_number}&id=utf-8&_={utils.get_current_timestamp()}"
    utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] Accessing creator post list: {creator_url}")

    try:
        # Use Playwright to access creator post list page
        await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")

        # Wait for page loading, using delay setting from config file
        await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

        # Raw page content is kept only so it can be logged if JSON parsing fails
        page_content = await self.playwright_page.content()

        # The JSON response is read back as the text of the document body
        try:
            json_text = await self.playwright_page.evaluate("() => document.body.innerText")
            result = json.loads(json_text)
            utils.logger.info("[BaiduTieBaClient.get_notes_by_creator] Successfully retrieved creator post data")
            return result
        except json.JSONDecodeError as e:
            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] JSON parsing failed: {e}")
            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] Page content: {page_content[:500]}")
            # Chain explicitly so the decode error is reported as the direct cause
            raise Exception(f"Failed to parse JSON from creator notes page: {e}") from e

    except Exception as e:
        utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] Failed to get creator post list: {e}")
        raise
|
||||
|
||||
async def get_all_notes_by_creator_user_name(
|
||||
@@ -612,18 +612,18 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
creator_page_html_content: str = None,
|
||||
) -> List[TiebaNote]:
|
||||
"""
|
||||
根据创作者用户名获取创作者所有帖子
|
||||
Get all creator posts by creator username
|
||||
Args:
|
||||
user_name: 创作者用户名
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后的回调函数,是一个awaitable类型的函数
|
||||
max_note_count: 帖子最大获取数量,如果为0则获取所有
|
||||
creator_page_html_content: 创作者主页HTML内容
|
||||
user_name: Creator username
|
||||
crawl_interval: Crawl delay interval in seconds
|
||||
callback: Callback function after one post crawl completes, an awaitable function
|
||||
max_note_count: Maximum number of posts to retrieve, if 0 then get all
|
||||
creator_page_html_content: Creator homepage HTML content
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 百度贴吧比较特殊一些,前10个帖子是直接展示在主页上的,要单独处理,通过API获取不到
|
||||
# Baidu Tieba is special, the first 10 posts are directly displayed on the homepage and need special handling, cannot be obtained through API
|
||||
result: List[TiebaNote] = []
|
||||
if creator_page_html_content:
|
||||
thread_id_list = (self._page_extractor.extract_tieba_thread_id_list_from_creator_page(creator_page_html_content))
|
||||
|
||||
@@ -79,9 +79,9 @@ class TieBaCrawler(AbstractCrawler):
|
||||
)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Choose startup mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[BaiduTieBaCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[BaiduTieBaCrawler] Launching browser in CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -89,7 +89,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[BaiduTieBaCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[BaiduTieBaCrawler] Launching browser in standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
@@ -99,12 +99,12 @@ class TieBaCrawler(AbstractCrawler):
|
||||
headless=config.HEADLESS,
|
||||
)
|
||||
|
||||
# 注入反检测脚本 - 针对百度的特殊检测
|
||||
# Inject anti-detection scripts - for Baidu's special detection
|
||||
await self._inject_anti_detection_scripts()
|
||||
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
|
||||
# 先访问百度首页,再点击贴吧链接,避免触发安全验证
|
||||
# First visit Baidu homepage, then click Tieba link to avoid triggering security verification
|
||||
await self._navigate_to_tieba_via_baidu()
|
||||
|
||||
# Create a client to interact with the baidutieba website.
|
||||
@@ -399,29 +399,29 @@ class TieBaCrawler(AbstractCrawler):
|
||||
|
||||
async def _navigate_to_tieba_via_baidu(self):
|
||||
"""
|
||||
模拟真实用户访问路径:
|
||||
1. 先访问百度首页 (https://www.baidu.com/)
|
||||
2. 等待页面加载
|
||||
3. 点击顶部导航栏的"贴吧"链接
|
||||
4. 跳转到贴吧首页
|
||||
Simulate real user access path:
|
||||
1. First visit Baidu homepage (https://www.baidu.com/)
|
||||
2. Wait for page to load
|
||||
3. Click "Tieba" link in top navigation bar
|
||||
4. Jump to Tieba homepage
|
||||
|
||||
这样做可以避免触发百度的安全验证
|
||||
This avoids triggering Baidu's security verification
|
||||
"""
|
||||
utils.logger.info("[TieBaCrawler] 模拟真实用户访问路径...")
|
||||
utils.logger.info("[TieBaCrawler] Simulating real user access path...")
|
||||
|
||||
try:
|
||||
# Step 1: 访问百度首页
|
||||
utils.logger.info("[TieBaCrawler] Step 1: 访问百度首页 https://www.baidu.com/")
|
||||
# Step 1: Visit Baidu homepage
|
||||
utils.logger.info("[TieBaCrawler] Step 1: Visiting Baidu homepage https://www.baidu.com/")
|
||||
await self.context_page.goto("https://www.baidu.com/", wait_until="domcontentloaded")
|
||||
|
||||
# Step 2: 等待页面加载,使用配置文件中的延时设置
|
||||
utils.logger.info(f"[TieBaCrawler] Step 2: 等待 {config.CRAWLER_MAX_SLEEP_SEC}秒 模拟用户浏览...")
|
||||
# Step 2: Wait for page loading, using delay setting from config file
|
||||
utils.logger.info(f"[TieBaCrawler] Step 2: Waiting {config.CRAWLER_MAX_SLEEP_SEC} seconds to simulate user browsing...")
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
# Step 3: 查找并点击"贴吧"链接
|
||||
utils.logger.info("[TieBaCrawler] Step 3: 查找并点击'贴吧'链接...")
|
||||
# Step 3: Find and click "Tieba" link
|
||||
utils.logger.info("[TieBaCrawler] Step 3: Finding and clicking 'Tieba' link...")
|
||||
|
||||
# 尝试多种选择器,确保能找到贴吧链接
|
||||
# Try multiple selectors to ensure finding the Tieba link
|
||||
tieba_selectors = [
|
||||
'a[href="http://tieba.baidu.com/"]',
|
||||
'a[href="https://tieba.baidu.com/"]',
|
||||
@@ -434,74 +434,74 @@ class TieBaCrawler(AbstractCrawler):
|
||||
try:
|
||||
tieba_link = await self.context_page.wait_for_selector(selector, timeout=5000)
|
||||
if tieba_link:
|
||||
utils.logger.info(f"[TieBaCrawler] 找到贴吧链接 (selector: {selector})")
|
||||
utils.logger.info(f"[TieBaCrawler] Found Tieba link (selector: {selector})")
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not tieba_link:
|
||||
utils.logger.warning("[TieBaCrawler] 未找到贴吧链接,直接访问贴吧首页")
|
||||
utils.logger.warning("[TieBaCrawler] Tieba link not found, directly accessing Tieba homepage")
|
||||
await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
|
||||
return
|
||||
|
||||
# Step 4: 点击贴吧链接 (检查是否会打开新标签页)
|
||||
utils.logger.info("[TieBaCrawler] Step 4: 点击贴吧链接...")
|
||||
# Step 4: Click Tieba link (check if it will open in a new tab)
|
||||
utils.logger.info("[TieBaCrawler] Step 4: Clicking Tieba link...")
|
||||
|
||||
# 检查链接的target属性
|
||||
# Check link's target attribute
|
||||
target_attr = await tieba_link.get_attribute("target")
|
||||
utils.logger.info(f"[TieBaCrawler] 链接target属性: {target_attr}")
|
||||
utils.logger.info(f"[TieBaCrawler] Link target attribute: {target_attr}")
|
||||
|
||||
if target_attr == "_blank":
|
||||
# 如果是新标签页,需要等待新页面并切换
|
||||
utils.logger.info("[TieBaCrawler] 链接会在新标签页打开,等待新页面...")
|
||||
# If it's a new tab, need to wait for new page and switch
|
||||
utils.logger.info("[TieBaCrawler] Link will open in new tab, waiting for new page...")
|
||||
|
||||
async with self.browser_context.expect_page() as new_page_info:
|
||||
await tieba_link.click()
|
||||
|
||||
# 获取新打开的页面
|
||||
# Get newly opened page
|
||||
new_page = await new_page_info.value
|
||||
await new_page.wait_for_load_state("domcontentloaded")
|
||||
|
||||
# 关闭旧的百度首页
|
||||
# Close old Baidu homepage
|
||||
await self.context_page.close()
|
||||
|
||||
# 切换到新的贴吧页面
|
||||
# Switch to new Tieba page
|
||||
self.context_page = new_page
|
||||
utils.logger.info("[TieBaCrawler] ✅ 已切换到新标签页 (贴吧页面)")
|
||||
utils.logger.info("[TieBaCrawler] Successfully switched to new tab (Tieba page)")
|
||||
else:
|
||||
# 如果是同一标签页跳转,正常等待导航
|
||||
utils.logger.info("[TieBaCrawler] 链接在当前标签页跳转...")
|
||||
# If it's same tab navigation, wait for navigation normally
|
||||
utils.logger.info("[TieBaCrawler] Link navigates in current tab...")
|
||||
async with self.context_page.expect_navigation(wait_until="domcontentloaded"):
|
||||
await tieba_link.click()
|
||||
|
||||
# Step 5: 等待页面稳定,使用配置文件中的延时设置
|
||||
utils.logger.info(f"[TieBaCrawler] Step 5: 页面加载完成,等待 {config.CRAWLER_MAX_SLEEP_SEC}秒...")
|
||||
# Step 5: Wait for page to stabilize, using delay setting from config file
|
||||
utils.logger.info(f"[TieBaCrawler] Step 5: Page loaded, waiting {config.CRAWLER_MAX_SLEEP_SEC} seconds...")
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
current_url = self.context_page.url
|
||||
utils.logger.info(f"[TieBaCrawler] ✅ 成功通过百度首页进入贴吧! 当前URL: {current_url}")
|
||||
utils.logger.info(f"[TieBaCrawler] Successfully entered Tieba via Baidu homepage! Current URL: {current_url}")
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[TieBaCrawler] 通过百度首页访问贴吧失败: {e}")
|
||||
utils.logger.info("[TieBaCrawler] 回退:直接访问贴吧首页")
|
||||
utils.logger.error(f"[TieBaCrawler] Failed to access Tieba via Baidu homepage: {e}")
|
||||
utils.logger.info("[TieBaCrawler] Fallback: directly accessing Tieba homepage")
|
||||
await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
|
||||
|
||||
async def _inject_anti_detection_scripts(self):
|
||||
"""
|
||||
注入反检测JavaScript脚本
|
||||
针对百度贴吧的特殊检测机制
|
||||
Inject anti-detection JavaScript scripts
|
||||
For Baidu Tieba's special detection mechanism
|
||||
"""
|
||||
utils.logger.info("[TieBaCrawler] Injecting anti-detection scripts...")
|
||||
|
||||
# 轻量级反检测脚本,只覆盖关键检测点
|
||||
# Lightweight anti-detection script, only covering key detection points
|
||||
anti_detection_js = """
|
||||
// 覆盖 navigator.webdriver
|
||||
// Override navigator.webdriver
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined,
|
||||
configurable: true
|
||||
});
|
||||
|
||||
// 覆盖 window.navigator.chrome
|
||||
// Override window.navigator.chrome
|
||||
if (!window.navigator.chrome) {
|
||||
window.navigator.chrome = {
|
||||
runtime: {},
|
||||
@@ -511,7 +511,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
};
|
||||
}
|
||||
|
||||
// 覆盖 Permissions API
|
||||
// Override Permissions API
|
||||
const originalQuery = window.navigator.permissions.query;
|
||||
window.navigator.permissions.query = (parameters) => (
|
||||
parameters.name === 'notifications' ?
|
||||
@@ -519,19 +519,19 @@ class TieBaCrawler(AbstractCrawler):
|
||||
originalQuery(parameters)
|
||||
);
|
||||
|
||||
// 覆盖 plugins 长度(让它看起来有插件)
|
||||
// Override plugins length (make it look like there are plugins)
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
configurable: true
|
||||
});
|
||||
|
||||
// 覆盖 languages
|
||||
// Override languages
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['zh-CN', 'zh', 'en'],
|
||||
configurable: true
|
||||
});
|
||||
|
||||
// 移除 window.cdc_ 等 ChromeDriver 残留
|
||||
// Remove window.cdc_ and other ChromeDriver remnants
|
||||
delete window.cdc_adoQpoasnfa76pfcZLmcfl_Array;
|
||||
delete window.cdc_adoQpoasnfa76pfcZLmcfl_Promise;
|
||||
delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
|
||||
@@ -548,21 +548,21 @@ class TieBaCrawler(AbstractCrawler):
|
||||
"""
|
||||
Create tieba client with real browser User-Agent and complete headers
|
||||
Args:
|
||||
httpx_proxy: HTTP代理
|
||||
ip_pool: IP代理池
|
||||
httpx_proxy: HTTP proxy
|
||||
ip_pool: IP proxy pool
|
||||
|
||||
Returns:
|
||||
BaiduTieBaClient实例
|
||||
BaiduTieBaClient instance
|
||||
"""
|
||||
utils.logger.info("[TieBaCrawler.create_tieba_client] Begin create tieba API client...")
|
||||
|
||||
# 从真实浏览器提取User-Agent,避免被检测
|
||||
# Extract User-Agent from real browser to avoid detection
|
||||
user_agent = await self.context_page.evaluate("() => navigator.userAgent")
|
||||
utils.logger.info(f"[TieBaCrawler.create_tieba_client] Extracted User-Agent from browser: {user_agent}")
|
||||
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
|
||||
# 构建完整的浏览器请求头,模拟真实浏览器行为
|
||||
# Build complete browser request headers, simulating real browser behavior
|
||||
tieba_client = BaiduTieBaClient(
|
||||
timeout=10,
|
||||
ip_pool=ip_pool,
|
||||
@@ -572,7 +572,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
"Accept-Language": "zh-CN,zh;q=0.9",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"Connection": "keep-alive",
|
||||
"User-Agent": user_agent, # 使用真实浏览器的UA
|
||||
"User-Agent": user_agent, # Use real browser UA
|
||||
"Cookie": cookie_str,
|
||||
"Host": "tieba.baidu.com",
|
||||
"Referer": "https://tieba.baidu.com/",
|
||||
@@ -585,7 +585,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
"sec-ch-ua-mobile": "?0",
|
||||
"sec-ch-ua-platform": '"macOS"',
|
||||
},
|
||||
playwright_page=self.context_page, # 传入playwright页面对象
|
||||
playwright_page=self.context_page, # Pass in playwright page object
|
||||
)
|
||||
return tieba_client
|
||||
|
||||
@@ -623,7 +623,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=user_agent,
|
||||
channel="chrome", # 使用系统的Chrome稳定版
|
||||
channel="chrome", # Use system's stable Chrome version
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
@@ -641,7 +641,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
Launch browser using CDP mode
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
@@ -652,15 +652,15 @@ class TieBaCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[TieBaCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[TieBaCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[TieBaCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
utils.logger.error(f"[TieBaCrawler] CDP mode launch failed, falling back to standard mode: {e}")
|
||||
# Fall back to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(
|
||||
chromium, playwright_proxy, user_agent, headless
|
||||
@@ -672,7 +672,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# If using CDP mode, need special handling
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
|
||||
@@ -23,16 +23,16 @@ from enum import Enum
|
||||
|
||||
class SearchSortType(Enum):
|
||||
"""search sort type"""
|
||||
# 按时间倒序
|
||||
# Sort by time in descending order
|
||||
TIME_DESC = "1"
|
||||
# 按时间顺序
|
||||
# Sort by time in ascending order
|
||||
TIME_ASC = "0"
|
||||
# 按相关性顺序
|
||||
# Sort by relevance
|
||||
RELEVANCE_ORDER = "2"
|
||||
|
||||
|
||||
class SearchNoteType(Enum):
|
||||
# 只看主题贴
|
||||
# Only view main posts
|
||||
MAIN_THREAD = "1"
|
||||
# 混合模式(帖子+回复)
|
||||
# Mixed mode (posts + replies)
|
||||
FIXED_THREAD = "0"
|
||||
|
||||
@@ -42,12 +42,12 @@ class TieBaExtractor:
|
||||
@staticmethod
|
||||
def extract_search_note_list(page_content: str) -> List[TiebaNote]:
|
||||
"""
|
||||
提取贴吧帖子列表,这里提取的关键词搜索结果页的数据,还缺少帖子的回复数和回复页等数据
|
||||
Extract Tieba post list from keyword search result pages, still missing reply count and reply page data
|
||||
Args:
|
||||
page_content: 页面内容的HTML字符串
|
||||
page_content: HTML string of page content
|
||||
|
||||
Returns:
|
||||
包含帖子信息的字典列表
|
||||
List of Tieba post objects
|
||||
"""
|
||||
xpath_selector = "//div[@class='s_post']"
|
||||
post_list = Selector(text=page_content).xpath(xpath_selector)
|
||||
@@ -71,12 +71,12 @@ class TieBaExtractor:
|
||||
|
||||
def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
|
||||
"""
|
||||
提取贴吧帖子列表
|
||||
Extract Tieba post list from Tieba page
|
||||
Args:
|
||||
page_content:
|
||||
page_content: HTML string of page content
|
||||
|
||||
Returns:
|
||||
|
||||
List of Tieba post objects
|
||||
"""
|
||||
page_content = page_content.replace('<!--', "")
|
||||
content_selector = Selector(text=page_content)
|
||||
@@ -106,21 +106,21 @@ class TieBaExtractor:
|
||||
|
||||
def extract_note_detail(self, page_content: str) -> TiebaNote:
|
||||
"""
|
||||
提取贴吧帖子详情
|
||||
Extract Tieba post details from post detail page
|
||||
Args:
|
||||
page_content:
|
||||
page_content: HTML string of page content
|
||||
|
||||
Returns:
|
||||
|
||||
Tieba post detail object
|
||||
"""
|
||||
content_selector = Selector(text=page_content)
|
||||
first_floor_selector = content_selector.xpath("//div[@class='p_postlist'][1]")
|
||||
only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip()
|
||||
note_id = only_view_author_link.split("?")[0].split("/")[-1]
|
||||
# 帖子回复数、回复页数
|
||||
# Post reply count and reply page count
|
||||
thread_num_infos = content_selector.xpath(
|
||||
"//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']")
|
||||
# IP地理位置、发表时间
|
||||
# IP location and publish time
|
||||
other_info_content = content_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
|
||||
ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
|
||||
note = TiebaNote(note_id=note_id, title=content_selector.xpath("//title/text()").get(default='').strip(),
|
||||
@@ -138,18 +138,18 @@ class TieBaExtractor:
|
||||
publish_time=publish_time,
|
||||
total_replay_num=thread_num_infos[0].xpath("./text()").get(default='').strip(),
|
||||
total_replay_page=thread_num_infos[1].xpath("./text()").get(default='').strip(), )
|
||||
note.title = note.title.replace(f"【{note.tieba_name}】_百度贴吧", "")
|
||||
note.title = note.title.replace(f"【{note.tieba_name}】_Baidu Tieba", "")
|
||||
return note
|
||||
|
||||
def extract_tieba_note_parment_comments(self, page_content: str, note_id: str) -> List[TiebaComment]:
|
||||
"""
|
||||
提取贴吧帖子一级评论
|
||||
Extract Tieba post first-level comments from comment page
|
||||
Args:
|
||||
page_content:
|
||||
note_id:
|
||||
page_content: HTML string of page content
|
||||
note_id: Post ID
|
||||
|
||||
Returns:
|
||||
|
||||
List of first-level comment objects
|
||||
"""
|
||||
xpath_selector = "//div[@class='l_post l_post_bright j_l_post clearfix ']"
|
||||
comment_list = Selector(text=page_content).xpath(xpath_selector)
|
||||
@@ -180,13 +180,13 @@ class TieBaExtractor:
|
||||
|
||||
def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
|
||||
"""
|
||||
提取贴吧帖子二级评论
|
||||
Extract Tieba post second-level comments from sub-comment page
|
||||
Args:
|
||||
page_content:
|
||||
parent_comment:
|
||||
page_content: HTML string of page content
|
||||
parent_comment: Parent comment object
|
||||
|
||||
Returns:
|
||||
|
||||
List of second-level comment objects
|
||||
"""
|
||||
selector = Selector(page_content)
|
||||
comments = []
|
||||
@@ -215,12 +215,12 @@ class TieBaExtractor:
|
||||
|
||||
def extract_creator_info(self, html_content: str) -> TiebaCreator:
|
||||
"""
|
||||
提取贴吧创作者信息
|
||||
Extract Tieba creator information from creator homepage
|
||||
Args:
|
||||
html_content:
|
||||
html_content: HTML string of creator homepage
|
||||
|
||||
Returns:
|
||||
|
||||
Tieba creator object
|
||||
"""
|
||||
selector = Selector(text=html_content)
|
||||
user_link_selector = selector.xpath("//p[@class='space']/a")
|
||||
@@ -251,12 +251,12 @@ class TieBaExtractor:
|
||||
html_content: str
|
||||
) -> List[str]:
|
||||
"""
|
||||
提取贴吧创作者主页的帖子列表
|
||||
Extract post ID list from Tieba creator's homepage
|
||||
Args:
|
||||
html_content:
|
||||
html_content: HTML string of creator homepage
|
||||
|
||||
Returns:
|
||||
|
||||
List of post IDs
|
||||
"""
|
||||
selector = Selector(text=html_content)
|
||||
thread_id_list = []
|
||||
@@ -271,12 +271,12 @@ class TieBaExtractor:
|
||||
|
||||
def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
|
||||
"""
|
||||
提取IP位置和发布时间
|
||||
Extract IP location and publish time from HTML content
|
||||
Args:
|
||||
html_content:
|
||||
html_content: HTML string
|
||||
|
||||
Returns:
|
||||
|
||||
Tuple of (IP location, publish time)
|
||||
"""
|
||||
pattern_pub_time = re.compile(r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>')
|
||||
time_match = pattern_pub_time.search(html_content)
|
||||
@@ -286,12 +286,12 @@ class TieBaExtractor:
|
||||
@staticmethod
|
||||
def extract_ip(html_content: str) -> str:
|
||||
"""
|
||||
提取IP
|
||||
Extract IP location from HTML content
|
||||
Args:
|
||||
html_content:
|
||||
html_content: HTML string
|
||||
|
||||
Returns:
|
||||
|
||||
IP location string
|
||||
"""
|
||||
pattern_ip = re.compile(r'IP属地:(\S+)</span>')
|
||||
ip_match = pattern_ip.search(html_content)
|
||||
@@ -301,28 +301,28 @@ class TieBaExtractor:
|
||||
@staticmethod
|
||||
def extract_gender(html_content: str) -> str:
|
||||
"""
|
||||
提取性别
|
||||
Extract gender from HTML content
|
||||
Args:
|
||||
html_content:
|
||||
html_content: HTML string
|
||||
|
||||
Returns:
|
||||
|
||||
Gender string ('Male', 'Female', or 'Unknown')
|
||||
"""
|
||||
if GENDER_MALE in html_content:
|
||||
return '男'
|
||||
return 'Male'
|
||||
elif GENDER_FEMALE in html_content:
|
||||
return '女'
|
||||
return '未知'
|
||||
return 'Female'
|
||||
return 'Unknown'
|
||||
|
||||
@staticmethod
|
||||
def extract_follow_and_fans(selectors: List[Selector]) -> Tuple[str, str]:
|
||||
"""
|
||||
提取关注数和粉丝数
|
||||
Extract follow count and fan count from selectors
|
||||
Args:
|
||||
selectors:
|
||||
selectors: List of selector objects
|
||||
|
||||
Returns:
|
||||
|
||||
Tuple of (follow count, fan count)
|
||||
"""
|
||||
pattern = re.compile(r'<span class="concern_num">\(<a[^>]*>(\d+)</a>\)</span>')
|
||||
follow_match = pattern.findall(selectors[0].get())
|
||||
@@ -334,9 +334,15 @@ class TieBaExtractor:
|
||||
@staticmethod
|
||||
def extract_registration_duration(html_content: str) -> str:
|
||||
"""
|
||||
"<span>吧龄:1.9年</span>"
|
||||
Returns: 1.9年
|
||||
Extract Tieba age from HTML content
|
||||
Example: "<span>吧龄:1.9年</span>"
|
||||
Returns: "1.9年"
|
||||
|
||||
Args:
|
||||
html_content: HTML string
|
||||
|
||||
Returns:
|
||||
Tieba age string
|
||||
"""
|
||||
pattern = re.compile(r'<span>吧龄:(\S+)</span>')
|
||||
match = pattern.search(html_content)
|
||||
@@ -345,22 +351,22 @@ class TieBaExtractor:
|
||||
@staticmethod
|
||||
def extract_data_field_value(selector: Selector) -> Dict:
|
||||
"""
|
||||
提取data-field的值
|
||||
Extract data-field value from selector
|
||||
Args:
|
||||
selector:
|
||||
selector: Selector object
|
||||
|
||||
Returns:
|
||||
|
||||
Dictionary containing data-field value
|
||||
"""
|
||||
data_field_value = selector.xpath("./@data-field").get(default='').strip()
|
||||
if not data_field_value or data_field_value == "{}":
|
||||
return {}
|
||||
try:
|
||||
# 先使用 html.unescape 处理转义字符 再json.loads 将 JSON 字符串转换为 Python 字典
|
||||
# First use html.unescape to handle escape characters, then json.loads to convert JSON string to Python dictionary
|
||||
unescaped_json_str = html.unescape(data_field_value)
|
||||
data_field_dict_value = json.loads(unescaped_json_str)
|
||||
except Exception as ex:
|
||||
print(f"extract_data_field_value,错误信息:{ex}, 尝试使用其他方式解析")
|
||||
print(f"extract_data_field_value, error: {ex}, trying alternative parsing method")
|
||||
data_field_dict_value = {}
|
||||
return data_field_dict_value
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ class BaiduTieBaLogin(AbstractLogin):
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self) -> bool:
|
||||
"""
|
||||
轮训检查登录状态是否成功,成功返回True否则返回False
|
||||
Poll to check if login status is successful, return True if successful, otherwise return False
|
||||
|
||||
Returns:
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:40
|
||||
# @Desc : 微博爬虫 API 请求 client
|
||||
# @Desc : Weibo crawler API request client
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
@@ -49,7 +49,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间
|
||||
timeout=60, # If media crawling is enabled, Weibo images need a longer timeout
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
@@ -64,12 +64,12 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
self._image_agent_host = "https://i1.wp.com/"
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
@retry(stop=stop_after_attempt(5), wait=wait_fixed(3))
|
||||
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check if proxy is expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
enable_return_response = kwargs.pop("return_response", False)
|
||||
@@ -82,7 +82,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
try:
|
||||
data: Dict = response.json()
|
||||
except json.decoder.JSONDecodeError:
|
||||
# issue: #771 搜索接口会报错432, 多次重试 + 更新 h5 cookies
|
||||
# issue: #771 Search API returns error 432, retry multiple times + update h5 cookies
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err code: {response.status_code} res:{response.text}")
|
||||
await self.playwright_page.goto(self._host)
|
||||
await asyncio.sleep(2)
|
||||
@@ -156,9 +156,9 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
) -> Dict:
|
||||
"""
|
||||
search note by keyword
|
||||
:param keyword: 微博搜搜的关键词
|
||||
:param page: 分页参数 -当前页码
|
||||
:param search_type: 搜索的类型,见 weibo/filed.py 中的枚举SearchType
|
||||
:param keyword: Search keyword for Weibo
|
||||
:param page: Pagination parameter - current page number
|
||||
:param search_type: Search type, see SearchType enum in weibo/field.py
|
||||
:return:
|
||||
"""
|
||||
uri = "/api/container/getIndex"
|
||||
@@ -172,9 +172,9 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
|
||||
async def get_note_comments(self, mid_id: str, max_id: int, max_id_type: int = 0) -> Dict:
|
||||
"""get notes comments
|
||||
:param mid_id: 微博ID
|
||||
:param max_id: 分页参数ID
|
||||
:param max_id_type: 分页参数ID类型
|
||||
:param mid_id: Weibo ID
|
||||
:param max_id: Pagination parameter ID
|
||||
:param max_id_type: Pagination parameter ID type
|
||||
:return:
|
||||
"""
|
||||
uri = "/comments/hotflow"
|
||||
@@ -218,7 +218,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
is_end = max_id == 0
|
||||
if len(result) + len(comment_list) > max_count:
|
||||
comment_list = comment_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
if callback: # If callback function exists, execute it
|
||||
await callback(note_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(comment_list)
|
||||
@@ -233,7 +233,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取评论的所有子评论
|
||||
Get all sub-comments of comments
|
||||
Args:
|
||||
note_id:
|
||||
comment_list:
|
||||
@@ -256,7 +256,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
|
||||
async def get_note_info_by_id(self, note_id: str) -> Dict:
|
||||
"""
|
||||
根据帖子ID获取详情
|
||||
Get note details by note ID
|
||||
:param note_id:
|
||||
:return:
|
||||
"""
|
||||
@@ -273,22 +273,22 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
note_item = {"mblog": note_detail}
|
||||
return note_item
|
||||
else:
|
||||
utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
|
||||
utils.logger.info(f"[WeiboClient.get_note_info_by_id] $render_data value not found")
|
||||
return dict()
|
||||
|
||||
async def get_note_image(self, image_url: str) -> bytes:
|
||||
image_url = image_url[8:] # 去掉 https://
|
||||
image_url = image_url[8:] # Remove https://
|
||||
sub_url = image_url.split("/")
|
||||
image_url = ""
|
||||
for i in range(len(sub_url)):
|
||||
if i == 1:
|
||||
image_url += "large/" # 都获取高清大图
|
||||
image_url += "large/" # Get high-resolution images
|
||||
elif i == len(sub_url) - 1:
|
||||
image_url += sub_url[i]
|
||||
else:
|
||||
image_url += sub_url[i] + "/"
|
||||
# 微博图床对外存在防盗链,所以需要代理访问
|
||||
# 由于微博图片是通过 i1.wp.com 来访问的,所以需要拼接一下
|
||||
# Weibo image hosting has anti-hotlinking, so proxy access is needed
|
||||
# Since Weibo images are accessed through i1.wp.com, we need to concatenate the URL
|
||||
final_uri = (f"{self._image_agent_host}"
|
||||
f"{image_url}")
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
@@ -301,18 +301,18 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
else:
|
||||
return response.content
|
||||
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # Keep original exception type name for developer debugging
|
||||
return None
|
||||
|
||||
async def get_creator_container_info(self, creator_id: str) -> Dict:
|
||||
"""
|
||||
获取用户的容器ID, 容器信息代表着真实请求的API路径
|
||||
fid_container_id:用户的微博详情API的容器ID
|
||||
lfid_container_id:用户的微博列表API的容器ID
|
||||
Get user's container ID, container information represents the real API request path
|
||||
fid_container_id: Container ID for user's Weibo detail API
|
||||
lfid_container_id: Container ID for user's Weibo list API
|
||||
Args:
|
||||
creator_id:
|
||||
creator_id: User ID
|
||||
|
||||
Returns: {
|
||||
Returns: Dictionary with container IDs
|
||||
|
||||
"""
|
||||
response = await self.get(f"/u/{creator_id}", return_response=True)
|
||||
@@ -324,7 +324,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_info_by_id(self, creator_id: str) -> Dict:
|
||||
"""
|
||||
根据用户ID获取用户详情
|
||||
Get user details by user ID
|
||||
Args:
|
||||
creator_id:
|
||||
|
||||
@@ -349,11 +349,11 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
since_id: str = "0",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取博主的笔记
|
||||
Get creator's notes
|
||||
Args:
|
||||
creator: 博主ID
|
||||
container_id: 容器ID
|
||||
since_id: 上一页最后一条笔记的ID
|
||||
creator: Creator ID
|
||||
container_id: Container ID
|
||||
since_id: ID of the last note from previous page
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -376,14 +376,14 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
||||
Get all posts published by a specified user, this method will continuously fetch all posts from a user
|
||||
Args:
|
||||
creator_id:
|
||||
container_id:
|
||||
crawl_interval:
|
||||
callback:
|
||||
creator_id: Creator user ID
|
||||
container_id: Container ID for the user
|
||||
crawl_interval: Interval between requests in seconds
|
||||
callback: Optional callback function to process notes
|
||||
|
||||
Returns:
|
||||
Returns: List of all notes
|
||||
|
||||
"""
|
||||
result = []
|
||||
@@ -393,7 +393,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
while notes_has_more:
|
||||
notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
|
||||
if not notes_res:
|
||||
utils.logger.error(f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
|
||||
utils.logger.error(f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by Weibo, so they cannot access the data.")
|
||||
break
|
||||
since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
|
||||
if "cards" not in notes_res:
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:41
|
||||
# @Desc : 微博爬虫主流程代码
|
||||
# @Desc : Weibo crawler main workflow code
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
@@ -63,7 +63,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.mobile_user_agent = utils.get_mobile_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
@@ -73,9 +73,9 @@ class WeiboCrawler(AbstractCrawler):
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Select launch mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[WeiboCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[WeiboCrawler] Launching browser with CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -83,7 +83,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[WeiboCrawler] Launching browser with standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
|
||||
@@ -109,11 +109,11 @@ class WeiboCrawler(AbstractCrawler):
|
||||
)
|
||||
await login_obj.begin()
|
||||
|
||||
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
|
||||
# After successful login, redirect to mobile website and update mobile cookies
|
||||
utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
|
||||
await self.context_page.goto(self.mobile_index_url)
|
||||
await asyncio.sleep(3)
|
||||
# 只获取移动端的 cookies,避免 PC 端和移动端 cookies 混淆
|
||||
# Only get mobile cookies to avoid confusion between PC and mobile cookies
|
||||
await self.wb_client.update_cookies(
|
||||
browser_context=self.browser_context,
|
||||
urls=[self.mobile_index_url]
|
||||
@@ -170,7 +170,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
|
||||
note_id_list: List[str] = []
|
||||
note_list = filter_search_result_card(search_res.get("cards"))
|
||||
# 如果开启了全文获取功能,则批量获取帖子全文
|
||||
# If full text fetching is enabled, batch get full text of posts
|
||||
note_list = await self.batch_get_notes_full_text(note_list)
|
||||
for note_item in note_list:
|
||||
if note_item:
|
||||
@@ -315,9 +315,9 @@ class WeiboCrawler(AbstractCrawler):
|
||||
raise DataFetchError("Get creator info error")
|
||||
await weibo_store.save_creator(user_id, user_info=createor_info)
|
||||
|
||||
# 创建一个包装 callback,在保存数据前获取全文
|
||||
# Create a wrapper callback to get full text before saving data
|
||||
async def save_notes_with_full_text(note_list: List[Dict]):
|
||||
# 如果开启了全文获取功能,先批量获取全文
|
||||
# If full text fetching is enabled, batch get full text first
|
||||
updated_note_list = await self.batch_get_notes_full_text(note_list)
|
||||
await weibo_store.batch_update_weibo_notes(updated_note_list)
|
||||
|
||||
@@ -350,7 +350,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
|
||||
proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh
|
||||
)
|
||||
return weibo_client_obj
|
||||
|
||||
@@ -375,7 +375,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
channel="chrome", # 使用系统的Chrome稳定版
|
||||
channel="chrome", # Use system's Chrome stable version
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
@@ -391,7 +391,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
Launch browser with CDP mode
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
@@ -402,24 +402,24 @@ class WeiboCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[WeiboCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[WeiboCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
utils.logger.error(f"[WeiboCrawler] CDP mode startup failed, falling back to standard mode: {e}")
|
||||
# Fallback to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def get_note_full_text(self, note_item: Dict) -> Dict:
|
||||
"""
|
||||
获取帖子全文内容
|
||||
如果帖子内容被截断(isLongText=True),则请求详情接口获取完整内容
|
||||
:param note_item: 帖子数据,包含 mblog 字段
|
||||
:return: 更新后的帖子数据
|
||||
Get full text content of a post
|
||||
If the post content is truncated (isLongText=True), request the detail API to get complete content
|
||||
:param note_item: Post data, contains mblog field
|
||||
:return: Updated post data
|
||||
"""
|
||||
if not config.ENABLE_WEIBO_FULL_TEXT:
|
||||
return note_item
|
||||
@@ -428,7 +428,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
if not mblog:
|
||||
return note_item
|
||||
|
||||
# 检查是否是长文本
|
||||
# Check if it's a long text
|
||||
is_long_text = mblog.get("isLongText", False)
|
||||
if not is_long_text:
|
||||
return note_item
|
||||
@@ -441,11 +441,11 @@ class WeiboCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[WeiboCrawler.get_note_full_text] Fetching full text for note: {note_id}")
|
||||
full_note = await self.wb_client.get_note_info_by_id(note_id)
|
||||
if full_note and full_note.get("mblog"):
|
||||
# 用完整内容替换原始内容
|
||||
# Replace original content with complete content
|
||||
note_item["mblog"] = full_note["mblog"]
|
||||
utils.logger.info(f"[WeiboCrawler.get_note_full_text] Successfully fetched full text for note: {note_id}")
|
||||
|
||||
# 请求后休眠,避免风控
|
||||
# Sleep after request to avoid rate limiting
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[WeiboCrawler.get_note_full_text] Failed to fetch full text for note {note_id}: {ex}")
|
||||
@@ -456,9 +456,9 @@ class WeiboCrawler(AbstractCrawler):
|
||||
|
||||
async def batch_get_notes_full_text(self, note_list: List[Dict]) -> List[Dict]:
|
||||
"""
|
||||
批量获取帖子全文内容
|
||||
:param note_list: 帖子列表
|
||||
:return: 更新后的帖子列表
|
||||
Batch get full text content of posts
|
||||
:param note_list: List of posts
|
||||
:return: Updated list of posts
|
||||
"""
|
||||
if not config.ENABLE_WEIBO_FULL_TEXT:
|
||||
return note_list
|
||||
@@ -471,7 +471,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# Special handling if using CDP mode
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
|
||||
@@ -26,14 +26,14 @@ from enum import Enum
|
||||
|
||||
|
||||
class SearchType(Enum):
|
||||
# 综合
|
||||
# Comprehensive
|
||||
DEFAULT = "1"
|
||||
|
||||
# 实时
|
||||
# Real-time
|
||||
REAL_TIME = "61"
|
||||
|
||||
# 热门
|
||||
# Popular
|
||||
POPULAR = "60"
|
||||
|
||||
# 视频
|
||||
# Video
|
||||
VIDEO = "64"
|
||||
|
||||
@@ -28,9 +28,9 @@ from typing import Dict, List
|
||||
|
||||
def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
|
||||
"""
|
||||
过滤微博搜索的结果,只保留card_type为9类型的数据
|
||||
:param card_list:
|
||||
:return:
|
||||
Filter Weibo search results, only keep data with card_type of 9
|
||||
:param card_list: List of card items from search results
|
||||
:return: Filtered list of note items
|
||||
"""
|
||||
note_list: List[Dict] = []
|
||||
for card_item in card_list:
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:42
|
||||
# @Desc : 微博登录实现
|
||||
# @Desc : Weibo login implementation
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
|
||||
@@ -45,7 +45,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,xhs 的长视频需要更久的超时时间
|
||||
timeout=60, # If media crawling is enabled, Xiaohongshu long videos need longer timeout
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
@@ -58,30 +58,30 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.headers = headers
|
||||
self._host = "https://edith.xiaohongshu.com"
|
||||
self._domain = "https://www.xiaohongshu.com"
|
||||
self.IP_ERROR_STR = "网络连接异常,请检查网络设置或重启试试"
|
||||
self.IP_ERROR_STR = "Network connection error, please check network settings or restart"
|
||||
self.IP_ERROR_CODE = 300012
|
||||
self.NOTE_ABNORMAL_STR = "笔记状态异常,请稍后查看"
|
||||
self.NOTE_ABNORMAL_STR = "Note status abnormal, please check later"
|
||||
self.NOTE_ABNORMAL_CODE = -510001
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
self._extractor = XiaoHongShuExtractor()
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: Optional[Dict] = None) -> Dict:
|
||||
"""请求头参数签名(使用 playwright 注入方式)
|
||||
"""Request header parameter signing (using playwright injection method)
|
||||
|
||||
Args:
|
||||
url: 请求的URL
|
||||
params: GET请求的参数
|
||||
payload: POST请求的参数
|
||||
url: Request URL
|
||||
params: GET request parameters
|
||||
payload: POST request parameters
|
||||
|
||||
Returns:
|
||||
Dict: 请求头参数签名
|
||||
Dict: Signed request header parameters
|
||||
"""
|
||||
a1_value = self.cookie_dict.get("a1", "")
|
||||
|
||||
# 确定请求数据、方法和 URI
|
||||
# Determine request data, method and URI
|
||||
if params is not None:
|
||||
data = params
|
||||
method = "GET"
|
||||
@@ -91,7 +91,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
else:
|
||||
raise ValueError("params or payload is required")
|
||||
|
||||
# 使用 playwright 注入方式生成签名
|
||||
# Generate signature using playwright injection method
|
||||
signs = await sign_with_playwright(
|
||||
page=self.playwright_page,
|
||||
uri=url,
|
||||
@@ -112,16 +112,16 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
async def request(self, method, url, **kwargs) -> Union[str, Any]:
|
||||
"""
|
||||
封装httpx的公共请求方法,对请求响应做一些处理
|
||||
Wrapper for httpx common request method, processes request response
|
||||
Args:
|
||||
method: 请求方法
|
||||
url: 请求的URL
|
||||
**kwargs: 其他请求参数,例如请求头、请求体等
|
||||
method: Request method
|
||||
url: Request URL
|
||||
**kwargs: Other request parameters, such as headers, body, etc.
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check if proxy is expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
# return response.text
|
||||
@@ -133,7 +133,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
# Maybe someday someone will bypass the captcha
|
||||
verify_type = response.headers["Verifytype"]
|
||||
verify_uuid = response.headers["Verifyuuid"]
|
||||
msg = f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}"
|
||||
msg = f"CAPTCHA appeared, request failed, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}, Response: {response}"
|
||||
utils.logger.error(msg)
|
||||
raise Exception(msg)
|
||||
|
||||
@@ -150,10 +150,10 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get(self, uri: str, params: Optional[Dict] = None) -> Dict:
|
||||
"""
|
||||
GET请求,对请求头签名
|
||||
GET request, signs request headers
|
||||
Args:
|
||||
uri: 请求路由
|
||||
params: 请求参数
|
||||
uri: Request route
|
||||
params: Request parameters
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -167,10 +167,10 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
|
||||
"""
|
||||
POST请求,对请求头签名
|
||||
POST request, signs request headers
|
||||
Args:
|
||||
uri: 请求路由
|
||||
data: 请求体参数
|
||||
uri: Request route
|
||||
data: Request body parameters
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -186,7 +186,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
)
|
||||
|
||||
async def get_note_media(self, url: str) -> Union[bytes, None]:
|
||||
# 请求前检测代理是否过期
|
||||
# Check if proxy is expired before request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
@@ -205,12 +205,12 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
) as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(
|
||||
f"[XiaoHongShuClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}"
|
||||
) # 保留原始异常类型名称,以便开发者调试
|
||||
) # Keep original exception type name for developer debugging
|
||||
return None
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""
|
||||
用于检查登录态是否失效了
|
||||
Check if login state is still valid
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -218,7 +218,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
utils.logger.info("[XiaoHongShuClient.pong] Begin to pong xhs...")
|
||||
ping_flag = False
|
||||
try:
|
||||
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
|
||||
note_card: Dict = await self.get_note_by_keyword(keyword="Xiaohongshu")
|
||||
if note_card.get("items"):
|
||||
ping_flag = True
|
||||
except Exception as e:
|
||||
@@ -230,9 +230,9 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
"""
|
||||
API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法
|
||||
Update cookies method provided by API client, usually called after successful login
|
||||
Args:
|
||||
browser_context: 浏览器上下文对象
|
||||
browser_context: Browser context object
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -251,13 +251,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
note_type: SearchNoteType = SearchNoteType.ALL,
|
||||
) -> Dict:
|
||||
"""
|
||||
根据关键词搜索笔记
|
||||
Search notes by keyword
|
||||
Args:
|
||||
keyword: 关键词参数
|
||||
page: 分页第几页
|
||||
page_size: 分页数据长度
|
||||
sort: 搜索结果排序指定
|
||||
note_type: 搜索的笔记类型
|
||||
keyword: Keyword parameter
|
||||
page: Page number
|
||||
page_size: Page data length
|
||||
sort: Search result sorting specification
|
||||
note_type: Type of note to search
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -280,11 +280,11 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
xsec_token: str,
|
||||
) -> Dict:
|
||||
"""
|
||||
获取笔记详情API
|
||||
API to get note details
|
||||
Args:
|
||||
note_id:笔记ID
|
||||
xsec_source: 渠道来源
|
||||
xsec_token: 搜索关键字之后返回的比较列表中返回的token
|
||||
note_id: Note ID
|
||||
xsec_source: Channel source
|
||||
xsec_token: Token returned from search keyword result list
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -304,7 +304,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
if res and res.get("items"):
|
||||
res_dict: Dict = res["items"][0]["note_card"]
|
||||
return res_dict
|
||||
# 爬取频繁了可能会出现有的笔记能有结果有的没有
|
||||
# When crawling frequently, some notes may have results while others don't
|
||||
utils.logger.error(
|
||||
f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}"
|
||||
)
|
||||
@@ -317,11 +317,11 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
cursor: str = "",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取一级评论的API
|
||||
API to get first-level comments
|
||||
Args:
|
||||
note_id: 笔记ID
|
||||
xsec_token: 验证token
|
||||
cursor: 分页游标
|
||||
note_id: Note ID
|
||||
xsec_token: Verification token
|
||||
cursor: Pagination cursor
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -345,13 +345,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
cursor: str = "",
|
||||
):
|
||||
"""
|
||||
获取指定父评论下的子评论的API
|
||||
API to get sub-comments under a specified parent comment
|
||||
Args:
|
||||
note_id: 子评论的帖子ID
|
||||
root_comment_id: 根评论ID
|
||||
xsec_token: 验证token
|
||||
num: 分页数量
|
||||
cursor: 分页游标
|
||||
note_id: Post ID of sub-comments
|
||||
root_comment_id: Root comment ID
|
||||
xsec_token: Verification token
|
||||
num: Pagination quantity
|
||||
cursor: Pagination cursor
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -377,13 +377,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
max_count: int = 10,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定笔记下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
||||
Get all first-level comments under a specified note. This method continuously fetches all comment information under the post.
|
||||
Args:
|
||||
note_id: 笔记ID
|
||||
xsec_token: 验证token
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
max_count: 一次笔记爬取的最大评论数量
|
||||
note_id: Note ID
|
||||
xsec_token: Verification token
|
||||
crawl_interval: Crawl delay per note (seconds)
|
||||
callback: Callback after one note crawl ends
|
||||
max_count: Maximum number of comments to crawl per note
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -425,12 +425,12 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
|
||||
Get all second-level comments under the specified first-level comments. This method continuously fetches all second-level comment information under those first-level comments.
|
||||
Args:
|
||||
comments: 评论列表
|
||||
xsec_token: 验证token
|
||||
crawl_interval: 爬取一次评论的延迟单位(秒)
|
||||
callback: 一次评论爬取结束后
|
||||
comments: Comment list
|
||||
xsec_token: Verification token
|
||||
crawl_interval: Crawl delay per comment (seconds)
|
||||
callback: Callback after one comment crawl ends
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -487,18 +487,18 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self, user_id: str, xsec_token: str = "", xsec_source: str = ""
|
||||
) -> Dict:
|
||||
"""
|
||||
通过解析网页版的用户主页HTML,获取用户个人简要信息
|
||||
PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可
|
||||
Get user profile brief information by parsing user homepage HTML
|
||||
The PC user homepage has window.__INITIAL_STATE__ variable, just parse it
|
||||
|
||||
Args:
|
||||
user_id: 用户ID
|
||||
xsec_token: 验证token (可选,如果URL中包含此参数则传入)
|
||||
xsec_source: 渠道来源 (可选,如果URL中包含此参数则传入)
|
||||
user_id: User ID
|
||||
xsec_token: Verification token (optional, pass if included in URL)
|
||||
xsec_source: Channel source (optional, pass if included in URL)
|
||||
|
||||
Returns:
|
||||
Dict: 创作者信息
|
||||
Dict: Creator information
|
||||
"""
|
||||
# 构建URI,如果有xsec参数则添加到URL中
|
||||
# Build URI, add xsec parameters to URL if available
|
||||
uri = f"/user/profile/{user_id}"
|
||||
if xsec_token and xsec_source:
|
||||
uri = f"{uri}?xsec_token={xsec_token}&xsec_source={xsec_source}"
|
||||
@@ -517,13 +517,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
xsec_source: str = "pc_feed",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取博主的笔记
|
||||
Get creator's notes
|
||||
Args:
|
||||
creator: 博主ID
|
||||
cursor: 上一页最后一条笔记的ID
|
||||
page_size: 分页数据长度
|
||||
xsec_token: 验证token
|
||||
xsec_source: 渠道来源
|
||||
creator: Creator ID
|
||||
cursor: Last note ID from previous page
|
||||
page_size: Page data length
|
||||
xsec_token: Verification token
|
||||
xsec_source: Channel source
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -547,13 +547,13 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
xsec_source: str = "pc_feed",
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
||||
Get all posts published by a specified user. This method continuously fetches all post information for the user.
|
||||
Args:
|
||||
user_id: 用户ID
|
||||
crawl_interval: 爬取一次的延迟单位(秒)
|
||||
callback: 一次分页爬取结束后的更新回调函数
|
||||
xsec_token: 验证token
|
||||
xsec_source: 渠道来源
|
||||
user_id: User ID
|
||||
crawl_interval: Crawl delay (seconds)
|
||||
callback: Update callback function after one pagination crawl ends
|
||||
xsec_token: Verification token
|
||||
xsec_source: Channel source
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -602,9 +602,9 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_note_short_url(self, note_id: str) -> Dict:
|
||||
"""
|
||||
获取笔记的短链接
|
||||
Get note short URL
|
||||
Args:
|
||||
note_id: 笔记ID
|
||||
note_id: Note ID
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -622,7 +622,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
enable_cookie: bool = False,
|
||||
) -> Optional[Dict]:
|
||||
"""
|
||||
通过解析网页版的笔记详情页HTML,获取笔记详情, 该接口可能会出现失败的情况,这里尝试重试3次
|
||||
Get note details by parsing the note detail page HTML. This interface may fail, so we try retrying 3 times here.
|
||||
Copied from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
|
||||
Thanks to ReaJason
|
||||
Args:
|
||||
|
||||
@@ -60,7 +60,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
# self.user_agent = utils.get_user_agent()
|
||||
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
async def start(self) -> None:
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
@@ -70,9 +70,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Choose launch mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[XiaoHongShuCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[XiaoHongShuCrawler] Launching browser using CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -80,7 +80,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[XiaoHongShuCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[XiaoHongShuCrawler] Launching browser using standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
@@ -95,7 +95,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url)
|
||||
|
||||
# Create a client to interact with the xiaohongshu website.
|
||||
# Create a client to interact with the Xiaohongshu website.
|
||||
self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
|
||||
if not await self.xhs_client.pong():
|
||||
login_obj = XiaoHongShuLogin(
|
||||
@@ -125,8 +125,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
|
||||
async def search(self) -> None:
|
||||
"""Search for notes and retrieve their comment information."""
|
||||
utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
|
||||
xhs_limit_count = 20 # xhs limit page fixed value
|
||||
utils.logger.info("[XiaoHongShuCrawler.search] Begin search Xiaohongshu keywords")
|
||||
xhs_limit_count = 20 # Xiaohongshu limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
|
||||
start_page = config.START_PAGE
|
||||
@@ -142,7 +142,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
continue
|
||||
|
||||
try:
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}")
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] search Xiaohongshu keyword: {keyword}, page: {page}")
|
||||
note_ids: List[str] = []
|
||||
xsec_tokens: List[str] = []
|
||||
notes_res = await self.xhs_client.get_note_by_keyword(
|
||||
@@ -151,9 +151,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
page=page,
|
||||
sort=(SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != "" else SearchSortType.GENERAL),
|
||||
)
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes response: {notes_res}")
|
||||
if not notes_res or not notes_res.get("has_more", False):
|
||||
utils.logger.info("No more content!")
|
||||
utils.logger.info("[XiaoHongShuCrawler.search] No more content!")
|
||||
break
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
@@ -184,7 +184,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
|
||||
"""Get creator's notes and retrieve their comment information."""
|
||||
utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
|
||||
utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get Xiaohongshu creators")
|
||||
for creator_url in config.XHS_CREATOR_ID_LIST:
|
||||
try:
|
||||
# Parse creator URL to get user_id and security tokens
|
||||
@@ -223,9 +223,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
await self.batch_get_note_comments(note_ids, xsec_tokens)
|
||||
|
||||
async def fetch_creator_notes_detail(self, note_list: List[Dict]):
|
||||
"""
|
||||
Concurrently obtain the specified post list and save the data
|
||||
"""
|
||||
"""Concurrently obtain the specified post list and save the data"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_note_detail_async_task(
|
||||
@@ -243,11 +241,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
await self.get_notice_media(note_detail)
|
||||
|
||||
async def get_specified_notes(self):
|
||||
"""
|
||||
Get the information and comments of the specified post
|
||||
must be specified note_id, xsec_source, xsec_token⚠️⚠️⚠️
|
||||
Returns:
|
||||
"""Get the information and comments of the specified post
|
||||
|
||||
Note: Must specify note_id, xsec_source, xsec_token
|
||||
"""
|
||||
get_note_detail_task_list = []
|
||||
for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
|
||||
@@ -356,8 +352,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for note {note_id}")
|
||||
|
||||
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
|
||||
"""Create xhs client"""
|
||||
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
|
||||
"""Create Xiaohongshu client"""
|
||||
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create Xiaohongshu API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
xhs_client_obj = XiaoHongShuClient(
|
||||
proxy=httpx_proxy,
|
||||
@@ -381,7 +377,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
|
||||
proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh
|
||||
)
|
||||
return xhs_client_obj
|
||||
|
||||
@@ -422,9 +418,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
"""
|
||||
"""Launch browser using CDP mode"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
browser_context = await self.cdp_manager.launch_and_connect(
|
||||
@@ -434,21 +428,21 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[XiaoHongShuCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[XiaoHongShuCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[XiaoHongShuCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
utils.logger.error(f"[XiaoHongShuCrawler] CDP mode launch failed, falling back to standard mode: {e}")
|
||||
# Fall back to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# Special handling if using CDP mode
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
@@ -464,10 +458,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
await self.get_notice_video(note_detail)
|
||||
|
||||
async def get_note_images(self, note_item: Dict):
|
||||
"""
|
||||
get note images. please use get_notice_media
|
||||
:param note_item:
|
||||
:return:
|
||||
"""Get note images. Please use get_notice_media
|
||||
|
||||
Args:
|
||||
note_item: Note item dictionary
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
return
|
||||
@@ -494,10 +488,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
|
||||
|
||||
async def get_notice_video(self, note_item: Dict):
|
||||
"""
|
||||
get note videos. please use get_notice_media
|
||||
:param note_item:
|
||||
:return:
|
||||
"""Get note videos. Please use get_notice_media
|
||||
|
||||
Args:
|
||||
note_item: Note item dictionary
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
return
|
||||
|
||||
@@ -29,16 +29,16 @@ class XiaoHongShuExtractor:
|
||||
pass
|
||||
|
||||
def extract_note_detail_from_html(self, note_id: str, html: str) -> Optional[Dict]:
|
||||
"""从html中提取笔记详情
|
||||
"""Extract note details from HTML
|
||||
|
||||
Args:
|
||||
html (str): html字符串
|
||||
html (str): HTML string
|
||||
|
||||
Returns:
|
||||
Dict: 笔记详情字典
|
||||
Dict: Note details dictionary
|
||||
"""
|
||||
if "noteDetailMap" not in html:
|
||||
# 这种情况要么是出了验证码了,要么是笔记不存在
|
||||
# Either a CAPTCHA appeared or the note doesn't exist
|
||||
return None
|
||||
|
||||
state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[
|
||||
@@ -50,13 +50,13 @@ class XiaoHongShuExtractor:
|
||||
return None
|
||||
|
||||
def extract_creator_info_from_html(self, html: str) -> Optional[Dict]:
|
||||
"""从html中提取用户信息
|
||||
"""Extract user information from HTML
|
||||
|
||||
Args:
|
||||
html (str): html字符串
|
||||
html (str): HTML string
|
||||
|
||||
Returns:
|
||||
Dict: 用户信息字典
|
||||
Dict: User information dictionary
|
||||
"""
|
||||
match = re.search(
|
||||
r"<script>window.__INITIAL_STATE__=(.+)<\/script>", html, re.M
|
||||
|
||||
@@ -23,27 +23,27 @@ from typing import NamedTuple
|
||||
|
||||
|
||||
class FeedType(Enum):
|
||||
# 推荐
|
||||
# Recommend
|
||||
RECOMMEND = "homefeed_recommend"
|
||||
# 穿搭
|
||||
# Fashion
|
||||
FASION = "homefeed.fashion_v3"
|
||||
# 美食
|
||||
# Food
|
||||
FOOD = "homefeed.food_v3"
|
||||
# 彩妆
|
||||
# Cosmetics
|
||||
COSMETICS = "homefeed.cosmetics_v3"
|
||||
# 影视
|
||||
# Movie and TV
|
||||
MOVIE = "homefeed.movie_and_tv_v3"
|
||||
# 职场
|
||||
# Career
|
||||
CAREER = "homefeed.career_v3"
|
||||
# 情感
|
||||
# Emotion
|
||||
EMOTION = "homefeed.love_v3"
|
||||
# 家居
|
||||
# Home
|
||||
HOURSE = "homefeed.household_product_v3"
|
||||
# 游戏
|
||||
# Gaming
|
||||
GAME = "homefeed.gaming_v3"
|
||||
# 旅行
|
||||
# Travel
|
||||
TRAVEL = "homefeed.travel_v3"
|
||||
# 健身
|
||||
# Fitness
|
||||
FITNESS = "homefeed.fitness_v3"
|
||||
|
||||
|
||||
@@ -53,28 +53,27 @@ class NoteType(Enum):
|
||||
|
||||
|
||||
class SearchSortType(Enum):
    """Search sort type.

    Each member's value is the sort identifier string for that ordering.
    """

    # Default
    GENERAL = "general"
    # Most popular
    MOST_POPULAR = "popularity_descending"
    # Latest
    LATEST = "time_descending"
|
||||
|
||||
|
||||
class SearchNoteType(Enum):
    """Search note type.

    Each member's value is the integer code for that note-type filter.
    """

    # Default
    ALL = 0
    # Only video
    VIDEO = 1
    # Only image
    IMAGE = 2
|
||||
|
||||
|
||||
class Note(NamedTuple):
|
||||
"""note tuple"""
|
||||
"""Note tuple"""
|
||||
note_id: str
|
||||
title: str
|
||||
desc: str
|
||||
|
||||
@@ -297,13 +297,13 @@ def get_img_urls_by_trace_id(trace_id: str, format_type: str = "png"):
|
||||
|
||||
|
||||
def get_trace_id(img_url: str) -> str:
    """Derive the image trace id from an image URL.

    The trace id is the last ``/``-separated segment of ``img_url``. When the
    URL contains the substring ``spectrum`` anywhere, the result is prefixed
    with ``spectrum/``.

    Args:
        img_url: Image URL (or path) to extract the trace id from.

    Returns:
        The trace id, optionally prefixed with ``spectrum/``.
    """
    last_segment = img_url.split("/")[-1]
    # Browser-uploaded images have an additional /spectrum/ path
    if "spectrum" in img_url:
        return f"spectrum/{last_segment}"
    return last_segment
|
||||
|
||||
|
||||
def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
|
||||
"""
|
||||
从小红书笔记url中解析出笔记信息
|
||||
Parse note information from Xiaohongshu note URL
|
||||
Args:
|
||||
url: "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
|
||||
Returns:
|
||||
@@ -318,44 +318,44 @@ def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Parse creator information from Xiaohongshu creator homepage URL
    Supports the following formats:
    1. Full URL: "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
    2. Pure ID: "5eb8e1d400000000010075ae"

    Args:
        url: Creator homepage URL or user_id
    Returns:
        CreatorUrlInfo: Object containing user_id, xsec_token, xsec_source
    Raises:
        ValueError: If ``url`` is neither a 24-character lowercase hex ID nor
            contains a ``/user/profile/<id>`` path segment.
    """
    # Imported locally, as in the original, so the function does not depend
    # on a module-level ``re`` import being present.
    import re

    # If it's a pure ID format (24 lowercase hexadecimal characters), return directly
    if len(url) == 24 and all(c in "0123456789abcdef" for c in url):
        return CreatorUrlInfo(user_id=url, xsec_token="", xsec_source="")

    # Extract user_id from URL: /user/profile/xxx
    match = re.search(r'/user/profile/([^/?]+)', url)
    if match is None:
        raise ValueError(f"Unable to parse creator info from URL: {url}")

    # Extract xsec_token and xsec_source parameters (empty string when absent)
    params = extract_url_params_to_dict(url)
    return CreatorUrlInfo(
        user_id=match.group(1),
        xsec_token=params.get("xsec_token", ""),
        xsec_source=params.get("xsec_source", ""),
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
_img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
|
||||
# 获取一个图片地址在多个cdn下的url地址
|
||||
# Get image URL addresses under multiple CDNs for a single image
|
||||
# final_img_urls = get_img_urls_by_trace_id(get_trace_id(_img_url))
|
||||
final_img_url = get_img_url_by_trace_id(get_trace_id(_img_url))
|
||||
print(final_img_url)
|
||||
|
||||
# 测试创作者URL解析
|
||||
print("\n=== 创作者URL解析测试 ===")
|
||||
# Test creator URL parsing
|
||||
print("\n=== Creator URL Parsing Test ===")
|
||||
test_creator_urls = [
|
||||
"https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
|
||||
"5eb8e1d400000000010075ae",
|
||||
@@ -364,7 +364,7 @@ if __name__ == '__main__':
|
||||
try:
|
||||
result = parse_creator_info_from_url(url)
|
||||
print(f"✓ URL: {url[:80]}...")
|
||||
print(f" 结果: {result}\n")
|
||||
print(f" Result: {result}\n")
|
||||
except Exception as e:
|
||||
print(f"✗ URL: {url}")
|
||||
print(f" 错误: {e}\n")
|
||||
print(f" Error: {e}\n")
|
||||
|
||||
@@ -57,7 +57,7 @@ class XiaoHongShuLogin(AbstractLogin):
|
||||
"""
|
||||
|
||||
if "请通过验证" in await self.context_page.content():
|
||||
utils.logger.info("[XiaoHongShuLogin.check_login_state] 登录过程中出现验证码,请手动验证")
|
||||
utils.logger.info("[XiaoHongShuLogin.check_login_state] CAPTCHA appeared during login, please verify manually")
|
||||
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
@@ -83,14 +83,14 @@ class XiaoHongShuLogin(AbstractLogin):
|
||||
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
|
||||
await asyncio.sleep(1)
|
||||
try:
|
||||
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
|
||||
# After entering Xiaohongshu homepage, the login dialog may not pop up automatically, need to manually click login button
|
||||
login_button_ele = await self.context_page.wait_for_selector(
|
||||
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
|
||||
timeout=5000
|
||||
)
|
||||
await login_button_ele.click()
|
||||
# 弹窗的登录对话框也有两种形态,一种是直接可以看到手机号和验证码的
|
||||
# 另一种是需要点击切换到手机登录的
|
||||
# The login dialog has two forms: one shows phone number and verification code directly
|
||||
# The other requires clicking to switch to phone login
|
||||
element = await self.context_page.wait_for_selector(
|
||||
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
|
||||
timeout=5000
|
||||
@@ -106,11 +106,11 @@ class XiaoHongShuLogin(AbstractLogin):
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
|
||||
await send_btn_ele.click() # 点击发送验证码
|
||||
await send_btn_ele.click() # Click to send verification code
|
||||
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
|
||||
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
|
||||
cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
max_get_sms_code_time = 60 * 2 # Maximum time to get verification code is 2 minutes
|
||||
no_logged_in_session = ""
|
||||
while max_get_sms_code_time > 0:
|
||||
utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
@@ -125,15 +125,15 @@ class XiaoHongShuLogin(AbstractLogin):
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
no_logged_in_session = cookie_dict.get("web_session")
|
||||
|
||||
await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码
|
||||
await sms_code_input_ele.fill(value=sms_code_value.decode()) # Enter SMS verification code
|
||||
await asyncio.sleep(0.5)
|
||||
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
|
||||
await agree_privacy_ele.click() # 点击同意隐私协议
|
||||
await agree_privacy_ele.click() # Click to agree to privacy policy
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
await submit_btn_ele.click() # 点击登录
|
||||
await submit_btn_ele.click() # Click login
|
||||
|
||||
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
|
||||
# TODO: Should also check if the verification code is correct, as it may be incorrect
|
||||
break
|
||||
|
||||
try:
|
||||
@@ -196,7 +196,7 @@ class XiaoHongShuLogin(AbstractLogin):
|
||||
"""login xiaohongshu website by cookies"""
|
||||
utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
if key != "web_session": # only set web_session cookie attr
|
||||
if key != "web_session": # Only set web_session cookie attribute
|
||||
continue
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# 通过 Playwright 注入调用 window.mnsv2 生成小红书签名
|
||||
# Generate Xiaohongshu signature by calling window.mnsv2 via Playwright injection
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
@@ -30,18 +30,18 @@ from .xhs_sign import b64_encode, encode_utf8, get_trace_id, mrc
|
||||
|
||||
|
||||
def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None, method: str = "POST") -> str:
|
||||
"""构建待签名字符串
|
||||
|
||||
"""Build string to be signed
|
||||
|
||||
Args:
|
||||
uri: API路径
|
||||
data: 请求数据
|
||||
method: 请求方法 (GET 或 POST)
|
||||
|
||||
uri: API path
|
||||
data: Request data
|
||||
method: Request method (GET or POST)
|
||||
|
||||
Returns:
|
||||
待签名字符串
|
||||
String to be signed
|
||||
"""
|
||||
if method.upper() == "POST":
|
||||
# POST 请求使用 JSON 格式
|
||||
# POST request uses JSON format
|
||||
c = uri
|
||||
if data is not None:
|
||||
if isinstance(data, dict):
|
||||
@@ -50,10 +50,10 @@ def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None, method
|
||||
c += data
|
||||
return c
|
||||
else:
|
||||
# GET 请求使用查询字符串格式
|
||||
# GET request uses query string format
|
||||
if not data or (isinstance(data, dict) and len(data) == 0):
|
||||
return uri
|
||||
|
||||
|
||||
if isinstance(data, dict):
|
||||
params = []
|
||||
for key in data.keys():
|
||||
@@ -64,8 +64,8 @@ def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None, method
|
||||
value_str = str(value)
|
||||
else:
|
||||
value_str = ""
|
||||
# 使用URL编码(safe参数保留某些字符不编码)
|
||||
# 注意:httpx会对逗号、等号等字符进行编码,我们也需要同样处理
|
||||
# Use URL encoding (safe parameter preserves certain characters from encoding)
|
||||
# Note: httpx will encode commas, equals signs, etc., we need to handle the same way
|
||||
value_str = quote(value_str, safe='')
|
||||
params.append(f"{key}={value_str}")
|
||||
return f"{uri}?{'&'.join(params)}"
|
||||
@@ -75,12 +75,12 @@ def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None, method
|
||||
|
||||
|
||||
def _md5_hex(s: str) -> str:
|
||||
"""计算 MD5 哈希值"""
|
||||
"""Calculate MD5 hash value"""
|
||||
return hashlib.md5(s.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _build_xs_payload(x3_value: str, data_type: str = "object") -> str:
|
||||
"""构建 x-s 签名"""
|
||||
"""Build x-s signature"""
|
||||
s = {
|
||||
"x0": "4.2.1",
|
||||
"x1": "xhs-pc-web",
|
||||
@@ -92,7 +92,7 @@ def _build_xs_payload(x3_value: str, data_type: str = "object") -> str:
|
||||
|
||||
|
||||
def _build_xs_common(a1: str, b1: str, x_s: str, x_t: str) -> str:
|
||||
"""构建 x-s-common 请求头"""
|
||||
"""Build x-s-common request header"""
|
||||
payload = {
|
||||
"s0": 3,
|
||||
"s1": "",
|
||||
@@ -113,7 +113,7 @@ def _build_xs_common(a1: str, b1: str, x_s: str, x_t: str) -> str:
|
||||
|
||||
|
||||
async def get_b1_from_localstorage(page: Page) -> str:
|
||||
"""从 localStorage 获取 b1 值"""
|
||||
"""Get b1 value from localStorage"""
|
||||
try:
|
||||
local_storage = await page.evaluate("() => window.localStorage")
|
||||
return local_storage.get("b1", "")
|
||||
@@ -123,15 +123,15 @@ async def get_b1_from_localstorage(page: Page) -> str:
|
||||
|
||||
async def call_mnsv2(page: Page, sign_str: str, md5_str: str) -> str:
|
||||
"""
|
||||
通过 playwright 调用 window.mnsv2 函数
|
||||
Call window.mnsv2 function via playwright
|
||||
|
||||
Args:
|
||||
page: playwright Page 对象
|
||||
sign_str: 待签名字符串 (uri + JSON.stringify(data))
|
||||
md5_str: sign_str 的 MD5 哈希值
|
||||
page: playwright Page object
|
||||
sign_str: String to be signed (uri + JSON.stringify(data))
|
||||
md5_str: MD5 hash value of sign_str
|
||||
|
||||
Returns:
|
||||
mnsv2 返回的签名字符串
|
||||
Signature string returned by mnsv2
|
||||
"""
|
||||
sign_str_escaped = sign_str.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n")
|
||||
md5_str_escaped = md5_str.replace("\\", "\\\\").replace("'", "\\'")
|
||||
@@ -150,16 +150,16 @@ async def sign_xs_with_playwright(
|
||||
method: str = "POST",
|
||||
) -> str:
|
||||
"""
|
||||
通过 playwright 注入生成 x-s 签名
|
||||
Generate x-s signature via playwright injection
|
||||
|
||||
Args:
|
||||
page: playwright Page 对象(必须已打开小红书页面)
|
||||
uri: API 路径,如 "/api/sns/web/v1/search/notes"
|
||||
data: 请求数据(GET 的 params 或 POST 的 payload)
|
||||
method: 请求方法 (GET 或 POST)
|
||||
page: playwright Page object (must have Xiaohongshu page open)
|
||||
uri: API path, e.g., "/api/sns/web/v1/search/notes"
|
||||
data: Request data (GET params or POST payload)
|
||||
method: Request method (GET or POST)
|
||||
|
||||
Returns:
|
||||
x-s 签名字符串
|
||||
x-s signature string
|
||||
"""
|
||||
sign_str = _build_sign_string(uri, data, method)
|
||||
md5_str = _md5_hex(sign_str)
|
||||
@@ -176,17 +176,17 @@ async def sign_with_playwright(
|
||||
method: str = "POST",
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
通过 playwright 生成完整的签名请求头
|
||||
Generate complete signature request headers via playwright
|
||||
|
||||
Args:
|
||||
page: playwright Page 对象(必须已打开小红书页面)
|
||||
uri: API 路径
|
||||
data: 请求数据
|
||||
a1: cookie 中的 a1 值
|
||||
method: 请求方法 (GET 或 POST)
|
||||
page: playwright Page object (must have Xiaohongshu page open)
|
||||
uri: API path
|
||||
data: Request data
|
||||
a1: a1 value from cookie
|
||||
method: Request method (GET or POST)
|
||||
|
||||
Returns:
|
||||
包含 x-s, x-t, x-s-common, x-b3-traceid 的字典
|
||||
Dictionary containing x-s, x-t, x-s-common, x-b3-traceid
|
||||
"""
|
||||
b1 = await get_b1_from_localstorage(page)
|
||||
x_s = await sign_xs_with_playwright(page, uri, data, method)
|
||||
@@ -208,23 +208,23 @@ async def pre_headers_with_playwright(
|
||||
payload: Optional[Dict] = None,
|
||||
) -> Dict[str, str]:
|
||||
"""
|
||||
使用 playwright 注入方式生成请求头签名
|
||||
可直接替换 client.py 中的 _pre_headers 方法
|
||||
Generate request header signature using playwright injection method
|
||||
Can directly replace _pre_headers method in client.py
|
||||
|
||||
Args:
|
||||
page: playwright Page 对象
|
||||
url: 请求 URL
|
||||
cookie_dict: cookie 字典
|
||||
params: GET 请求参数
|
||||
payload: POST 请求参数
|
||||
page: playwright Page object
|
||||
url: Request URL
|
||||
cookie_dict: Cookie dictionary
|
||||
params: GET request parameters
|
||||
payload: POST request parameters
|
||||
|
||||
Returns:
|
||||
签名后的请求头字典
|
||||
Signed request header dictionary
|
||||
"""
|
||||
a1_value = cookie_dict.get("a1", "")
|
||||
uri = urlparse(url).path
|
||||
|
||||
# 确定请求数据和方法
|
||||
# Determine request data and method
|
||||
if params is not None:
|
||||
data = params
|
||||
method = "GET"
|
||||
|
||||
@@ -16,19 +16,19 @@
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# 小红书签名算法核心函数
|
||||
# 用于 playwright 注入方式生成签名
|
||||
# Xiaohongshu signature algorithm core functions
|
||||
# Used for generating signatures via playwright injection
|
||||
|
||||
import ctypes
|
||||
import random
|
||||
from urllib.parse import quote
|
||||
|
||||
# 自定义 Base64 字符表
|
||||
# 标准 Base64: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/
|
||||
# 小红书打乱顺序用于混淆
|
||||
# Custom Base64 character table
|
||||
# Standard Base64: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/
|
||||
# Xiaohongshu shuffled order for obfuscation
|
||||
BASE64_CHARS = list("ZmserbBoHQtNP+wOcza/LpngG8yJq42KWYj0DSfdikx3VT16IlUAFM97hECvuRX5")
|
||||
|
||||
# CRC32 查表
|
||||
# CRC32 lookup table
|
||||
CRC32_TABLE = [
|
||||
0, 1996959894, 3993919788, 2567524794, 124634137, 1886057615, 3915621685,
|
||||
2657392035, 249268274, 2044508324, 3772115230, 2547177864, 162941995,
|
||||
@@ -77,14 +77,14 @@ CRC32_TABLE = [
|
||||
|
||||
|
||||
def _right_shift_unsigned(num: int, bit: int = 0) -> int:
|
||||
"""JavaScript 无符号右移 (>>>) 的 Python 实现"""
|
||||
"""Python implementation of JavaScript unsigned right shift (>>>)"""
|
||||
val = ctypes.c_uint32(num).value >> bit
|
||||
MAX32INT = 4294967295
|
||||
return (val + (MAX32INT + 1)) % (2 * (MAX32INT + 1)) - MAX32INT - 1
|
||||
|
||||
|
||||
def mrc(e: str) -> int:
|
||||
"""CRC32 变体,用于 x-s-common 的 x9 字段"""
|
||||
"""CRC32 variant, used for x9 field in x-s-common"""
|
||||
o = -1
|
||||
for n in range(min(57, len(e))):
|
||||
o = CRC32_TABLE[(o & 255) ^ ord(e[n])] ^ _right_shift_unsigned(o, 8)
|
||||
@@ -92,7 +92,7 @@ def mrc(e: str) -> int:
|
||||
|
||||
|
||||
def _triplet_to_base64(e: int) -> str:
|
||||
"""将 24 位整数转换为 4 个 Base64 字符"""
|
||||
"""Convert 24-bit integer to 4 Base64 characters"""
|
||||
return (
|
||||
BASE64_CHARS[(e >> 18) & 63]
|
||||
+ BASE64_CHARS[(e >> 12) & 63]
|
||||
@@ -102,7 +102,7 @@ def _triplet_to_base64(e: int) -> str:
|
||||
|
||||
|
||||
def _encode_chunk(data: list, start: int, end: int) -> str:
|
||||
"""编码数据块"""
|
||||
"""Encode data chunk"""
|
||||
result = []
|
||||
for i in range(start, end, 3):
|
||||
c = ((data[i] << 16) & 0xFF0000) + ((data[i + 1] << 8) & 0xFF00) + (data[i + 2] & 0xFF)
|
||||
@@ -111,7 +111,7 @@ def _encode_chunk(data: list, start: int, end: int) -> str:
|
||||
|
||||
|
||||
def encode_utf8(s: str) -> list:
|
||||
"""将字符串编码为 UTF-8 字节列表"""
|
||||
"""Encode string to UTF-8 byte list"""
|
||||
encoded = quote(s, safe="~()*!.'")
|
||||
result = []
|
||||
i = 0
|
||||
@@ -126,7 +126,7 @@ def encode_utf8(s: str) -> list:
|
||||
|
||||
|
||||
def b64_encode(data: list) -> str:
|
||||
"""自定义 Base64 编码"""
|
||||
"""Custom Base64 encoding"""
|
||||
length = len(data)
|
||||
remainder = length % 3
|
||||
chunks = []
|
||||
@@ -148,5 +148,5 @@ def b64_encode(data: list) -> str:
|
||||
|
||||
|
||||
def get_trace_id() -> str:
    """Generate trace id for link tracing.

    Returns:
        A random 16-character string drawn from the lowercase hexadecimal
        alphabet. It is produced with the ``random`` module, so it is not
        suitable as a security token.
    """
    return "".join(random.choice("abcdef0123456789") for _ in range(16))
|
||||
|
||||
@@ -60,14 +60,14 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.default_headers = headers
|
||||
self.cookie_dict = cookie_dict
|
||||
self._extractor = ZhihuExtractor()
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
async def _pre_headers(self, url: str) -> Dict:
|
||||
"""
|
||||
请求头参数签名
|
||||
Sign request headers
|
||||
Args:
|
||||
url: 请求的URL需要包含请求的参数
|
||||
url: Request URL with query parameters
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -83,16 +83,16 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
async def request(self, method, url, **kwargs) -> Union[str, Any]:
|
||||
"""
|
||||
封装httpx的公共请求方法,对请求响应做一些处理
|
||||
Wrapper for httpx common request method with response handling
|
||||
Args:
|
||||
method: 请求方法
|
||||
url: 请求的URL
|
||||
**kwargs: 其他请求参数,例如请求头、请求体等
|
||||
method: Request method
|
||||
url: Request URL
|
||||
**kwargs: Other request parameters such as headers, body, etc.
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 每次请求前检测代理是否过期
|
||||
# Check if proxy is expired before each request
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
# return response.text
|
||||
@@ -105,7 +105,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
utils.logger.error(f"[ZhiHuClient.request] Requset Url: {url}, Request error: {response.text}")
|
||||
if response.status_code == 403:
|
||||
raise ForbiddenError(response.text)
|
||||
elif response.status_code == 404: # 如果一个content没有评论也是404
|
||||
elif response.status_code == 404: # Content without comments also returns 404
|
||||
return {}
|
||||
|
||||
raise DataFetchError(response.text)
|
||||
@@ -124,10 +124,10 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, str]:
|
||||
"""
|
||||
GET请求,对请求头签名
|
||||
GET request with header signing
|
||||
Args:
|
||||
uri: 请求路由
|
||||
params: 请求参数
|
||||
uri: Request URI
|
||||
params: Request parameters
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -141,7 +141,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""
|
||||
用于检查登录态是否失效了
|
||||
Check if login status is still valid
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -161,9 +161,9 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
"""
|
||||
API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法
|
||||
Update cookies method provided by API client, typically called after successful login
|
||||
Args:
|
||||
browser_context: 浏览器上下文对象
|
||||
browser_context: Browser context object
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -174,7 +174,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_current_user_info(self) -> Dict:
|
||||
"""
|
||||
获取当前登录用户信息
|
||||
Get current logged-in user information
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -191,14 +191,14 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
search_time: SearchTime = SearchTime.DEFAULT,
|
||||
) -> List[ZhihuContent]:
|
||||
"""
|
||||
根据关键词搜索
|
||||
Search by keyword
|
||||
Args:
|
||||
keyword: 关键词
|
||||
page: 第几页
|
||||
page_size: 分页size
|
||||
sort: 排序
|
||||
note_type: 搜索结果类型
|
||||
search_time: 搜索多久时间的结果
|
||||
keyword: Search keyword
|
||||
page: Page number
|
||||
page_size: Page size
|
||||
sort: Sorting method
|
||||
note_type: Search result type
|
||||
search_time: Time range for search results
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -232,10 +232,10 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
order_by: str = "score",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取内容的一级评论
|
||||
Get root-level comments for content
|
||||
Args:
|
||||
content_id: 内容ID
|
||||
content_type: 内容类型(answer, article, zvideo)
|
||||
content_id: Content ID
|
||||
content_type: Content type (answer, article, zvideo)
|
||||
offset:
|
||||
limit:
|
||||
order_by:
|
||||
@@ -262,7 +262,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
order_by: str = "sort",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取一级评论下的子评论
|
||||
Get child comments under a root comment
|
||||
Args:
|
||||
root_comment_id:
|
||||
offset:
|
||||
@@ -287,11 +287,11 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuComment]:
|
||||
"""
|
||||
获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
||||
Get all root-level comments for a specified post, this method will retrieve all comment information under a post
|
||||
Args:
|
||||
content: 内容详情对象(问题|文章|视频)
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
content: Content detail object (question|article|video)
|
||||
crawl_interval: Crawl delay interval in seconds
|
||||
callback: Callback after completing one crawl
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -328,12 +328,12 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuComment]:
|
||||
"""
|
||||
获取指定评论下的所有子评论
|
||||
Get all sub-comments under specified comments
|
||||
Args:
|
||||
content: 内容详情对象(问题|文章|视频)
|
||||
comments: 评论列表
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
content: Content detail object (question|article|video)
|
||||
comments: Comment list
|
||||
crawl_interval: Crawl delay interval in seconds
|
||||
callback: Callback after completing one crawl
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -370,7 +370,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_info(self, url_token: str) -> Optional[ZhihuCreator]:
|
||||
"""
|
||||
获取创作者信息
|
||||
Get creator information
|
||||
Args:
|
||||
url_token:
|
||||
|
||||
@@ -383,7 +383,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_answers(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
|
||||
"""
|
||||
获取创作者的回答
|
||||
Get creator's answers
|
||||
Args:
|
||||
url_token:
|
||||
offset:
|
||||
@@ -405,7 +405,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_articles(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
|
||||
"""
|
||||
获取创作者的文章
|
||||
Get creator's articles
|
||||
Args:
|
||||
url_token:
|
||||
offset:
|
||||
@@ -426,7 +426,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_creator_videos(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
|
||||
"""
|
||||
获取创作者的视频
|
||||
Get creator's videos
|
||||
Args:
|
||||
url_token:
|
||||
offset:
|
||||
@@ -446,11 +446,11 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_all_anwser_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[ZhihuContent]:
|
||||
"""
|
||||
获取创作者的所有回答
|
||||
Get all answers by creator
|
||||
Args:
|
||||
creator: 创作者信息
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
creator: Creator information
|
||||
crawl_interval: Crawl delay interval in seconds
|
||||
callback: Callback after completing one crawl
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -481,7 +481,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuContent]:
|
||||
"""
|
||||
获取创作者的所有文章
|
||||
Get all articles by creator
|
||||
Args:
|
||||
creator:
|
||||
crawl_interval:
|
||||
@@ -515,7 +515,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuContent]:
|
||||
"""
|
||||
获取创作者的所有视频
|
||||
Get all videos by creator
|
||||
Args:
|
||||
creator:
|
||||
crawl_interval:
|
||||
@@ -548,7 +548,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
answer_id: str,
|
||||
) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
获取回答信息
|
||||
Get answer information
|
||||
Args:
|
||||
question_id:
|
||||
answer_id:
|
||||
@@ -562,7 +562,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
获取文章信息
|
||||
Get article information
|
||||
Args:
|
||||
article_id:
|
||||
|
||||
@@ -575,7 +575,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
获取视频信息
|
||||
Get video information
|
||||
Args:
|
||||
video_id:
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
|
||||
self._extractor = ZhihuExtractor()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
async def start(self) -> None:
|
||||
"""
|
||||
@@ -80,9 +80,9 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
# Choose launch mode based on configuration
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[ZhihuCrawler] 使用CDP模式启动浏览器")
|
||||
utils.logger.info("[ZhihuCrawler] Launching browser in CDP mode")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
@@ -90,7 +90,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[ZhihuCrawler] 使用标准模式启动浏览器")
|
||||
utils.logger.info("[ZhihuCrawler] Launching browser in standard mode")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
@@ -117,9 +117,9 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
browser_context=self.browser_context
|
||||
)
|
||||
|
||||
# 知乎的搜索接口需要打开搜索页面之后cookies才能访问API,单独的首页不行
|
||||
# Zhihu's search API requires opening the search page first to access cookies, homepage alone won't work
|
||||
utils.logger.info(
|
||||
"[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies,该过程需要5秒左右"
|
||||
"[ZhihuCrawler.start] Zhihu navigating to search page to get search page cookies, this process takes about 5 seconds"
|
||||
)
|
||||
await self.context_page.goto(
|
||||
f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
|
||||
@@ -273,7 +273,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
)
|
||||
await zhihu_store.save_creator(creator=createor_info)
|
||||
|
||||
# 默认只提取回答信息,如果需要文章和视频,把下面的注释打开即可
|
||||
# By default, only answer information is extracted, uncomment below if articles and videos are needed
|
||||
|
||||
# Get all anwser information of the creator
|
||||
all_content_list = await self.zhihu_client.get_all_anwser_by_creator(
|
||||
@@ -315,7 +315,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
|
||||
)
|
||||
# judge note type
|
||||
# Judge note type
|
||||
note_type: str = judge_zhihu_url(full_note_url)
|
||||
if note_type == constant.ANSWER_NAME:
|
||||
question_id = full_note_url.split("/")[-3]
|
||||
@@ -412,7 +412,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
|
||||
proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh
|
||||
)
|
||||
return zhihu_client_obj
|
||||
|
||||
@@ -440,7 +440,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=user_agent,
|
||||
channel="chrome", # 使用系统的Chrome稳定版
|
||||
channel="chrome", # Use system Chrome stable version
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
@@ -458,7 +458,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
Launch browser using CDP mode
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
@@ -469,15 +469,15 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
# Display browser information
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[ZhihuCrawler] CDP浏览器信息: {browser_info}")
|
||||
utils.logger.info(f"[ZhihuCrawler] CDP browser info: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhihuCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
utils.logger.error(f"[ZhihuCrawler] CDP mode launch failed, falling back to standard mode: {e}")
|
||||
# Fall back to standard mode
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(
|
||||
chromium, playwright_proxy, user_agent, headless
|
||||
@@ -485,7 +485,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
# Special handling if using CDP mode
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
|
||||
@@ -26,31 +26,31 @@ from constant import zhihu as zhihu_constant
|
||||
|
||||
class SearchTime(Enum):
|
||||
"""
|
||||
搜索时间范围
|
||||
Search time range
|
||||
"""
|
||||
DEFAULT = "" # 不限时间
|
||||
ONE_DAY = "a_day" # 一天内
|
||||
ONE_WEEK = "a_week" # 一周内
|
||||
ONE_MONTH = "a_month" # 一个月内
|
||||
THREE_MONTH = "three_months" # 三个月内
|
||||
HALF_YEAR = "half_a_year" # 半年内
|
||||
ONE_YEAR = "a_year" # 一年内
|
||||
DEFAULT = "" # No time limit
|
||||
ONE_DAY = "a_day" # Within one day
|
||||
ONE_WEEK = "a_week" # Within one week
|
||||
ONE_MONTH = "a_month" # Within one month
|
||||
THREE_MONTH = "three_months" # Within three months
|
||||
HALF_YEAR = "half_a_year" # Within half a year
|
||||
ONE_YEAR = "a_year" # Within one year
|
||||
|
||||
|
||||
class SearchType(Enum):
|
||||
"""
|
||||
搜索结果类型
|
||||
Search result type
|
||||
"""
|
||||
DEFAULT = "" # 不限类型
|
||||
ANSWER = zhihu_constant.ANSWER_NAME # 只看回答
|
||||
ARTICLE = zhihu_constant.ARTICLE_NAME # 只看文章
|
||||
VIDEO = zhihu_constant.VIDEO_NAME # 只看视频
|
||||
DEFAULT = "" # No type limit
|
||||
ANSWER = zhihu_constant.ANSWER_NAME # Answers only
|
||||
ARTICLE = zhihu_constant.ARTICLE_NAME # Articles only
|
||||
VIDEO = zhihu_constant.VIDEO_NAME # Videos only
|
||||
|
||||
|
||||
class SearchSort(Enum):
|
||||
"""
|
||||
搜索结果排序
|
||||
Search result sorting
|
||||
"""
|
||||
DEFAULT = "" # 综合排序
|
||||
UPVOTED_COUNT = "upvoted_count" # 最多赞同
|
||||
CREATE_TIME = "created_time" # 最新发布
|
||||
DEFAULT = "" # Default sorting
|
||||
UPVOTED_COUNT = "upvoted_count" # Most upvoted
|
||||
CREATE_TIME = "created_time" # Latest published
|
||||
|
||||
@@ -168,7 +168,7 @@ class ZhihuExtractor:
|
||||
"""
|
||||
res = ZhihuContent()
|
||||
|
||||
if "video" in zvideo and isinstance(zvideo.get("video"), dict): # 说明是从创作者主页的视频列表接口来的
|
||||
if "video" in zvideo and isinstance(zvideo.get("video"), dict): # This indicates data from the creator's homepage video list API
|
||||
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
|
||||
res.created_time = zvideo.get("published_at")
|
||||
res.updated_time = zvideo.get("updated_at")
|
||||
@@ -318,11 +318,11 @@ class ZhihuExtractor:
|
||||
|
||||
"""
|
||||
if gender == 1:
|
||||
return "男"
|
||||
return "Male"
|
||||
elif gender == 0:
|
||||
return "女"
|
||||
return "Female"
|
||||
else:
|
||||
return "未知"
|
||||
return "Unknown"
|
||||
|
||||
|
||||
def extract_creator(self, user_url_token: str, html_content: str) -> Optional[ZhihuCreator]:
|
||||
|
||||
@@ -26,55 +26,55 @@ from pydantic import BaseModel, Field
|
||||
|
||||
class TiebaNote(BaseModel):
|
||||
"""
|
||||
百度贴吧帖子
|
||||
Baidu Tieba post
|
||||
"""
|
||||
note_id: str = Field(..., description="帖子ID")
|
||||
title: str = Field(..., description="帖子标题")
|
||||
desc: str = Field(default="", description="帖子描述")
|
||||
note_url: str = Field(..., description="帖子链接")
|
||||
publish_time: str = Field(default="", description="发布时间")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
tieba_name: str = Field(..., description="贴吧名称")
|
||||
tieba_link: str = Field(..., description="贴吧链接")
|
||||
total_replay_num: int = Field(default=0, description="回复总数")
|
||||
total_replay_page: int = Field(default=0, description="回复总页数")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
source_keyword: str = Field(default="", description="来源关键词")
|
||||
note_id: str = Field(..., description="Post ID")
|
||||
title: str = Field(..., description="Post title")
|
||||
desc: str = Field(default="", description="Post description")
|
||||
note_url: str = Field(..., description="Post link")
|
||||
publish_time: str = Field(default="", description="Publish time")
|
||||
user_link: str = Field(default="", description="User homepage link")
|
||||
user_nickname: str = Field(default="", description="User nickname")
|
||||
user_avatar: str = Field(default="", description="User avatar URL")
|
||||
tieba_name: str = Field(..., description="Tieba name")
|
||||
tieba_link: str = Field(..., description="Tieba link")
|
||||
total_replay_num: int = Field(default=0, description="Total reply count")
|
||||
total_replay_page: int = Field(default=0, description="Total reply pages")
|
||||
ip_location: Optional[str] = Field(default="", description="IP location")
|
||||
source_keyword: str = Field(default="", description="Source keyword")
|
||||
|
||||
|
||||
class TiebaComment(BaseModel):
|
||||
"""
|
||||
百度贴吧评论
|
||||
Baidu Tieba comment
|
||||
"""
|
||||
|
||||
comment_id: str = Field(..., description="评论ID")
|
||||
parent_comment_id: str = Field(default="", description="父评论ID")
|
||||
content: str = Field(..., description="评论内容")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
publish_time: str = Field(default="", description="发布时间")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
sub_comment_count: int = Field(default=0, description="子评论数")
|
||||
note_id: str = Field(..., description="帖子ID")
|
||||
note_url: str = Field(..., description="帖子链接")
|
||||
tieba_id: str = Field(..., description="所属的贴吧ID")
|
||||
tieba_name: str = Field(..., description="所属的贴吧名称")
|
||||
tieba_link: str = Field(..., description="贴吧链接")
|
||||
comment_id: str = Field(..., description="Comment ID")
|
||||
parent_comment_id: str = Field(default="", description="Parent comment ID")
|
||||
content: str = Field(..., description="Comment content")
|
||||
user_link: str = Field(default="", description="User homepage link")
|
||||
user_nickname: str = Field(default="", description="User nickname")
|
||||
user_avatar: str = Field(default="", description="User avatar URL")
|
||||
publish_time: str = Field(default="", description="Publish time")
|
||||
ip_location: Optional[str] = Field(default="", description="IP location")
|
||||
sub_comment_count: int = Field(default=0, description="Sub-comment count")
|
||||
note_id: str = Field(..., description="Post ID")
|
||||
note_url: str = Field(..., description="Post link")
|
||||
tieba_id: str = Field(..., description="Tieba ID")
|
||||
tieba_name: str = Field(..., description="Tieba name")
|
||||
tieba_link: str = Field(..., description="Tieba link")
|
||||
|
||||
|
||||
class TiebaCreator(BaseModel):
|
||||
"""
|
||||
百度贴吧创作者
|
||||
Baidu Tieba creator
|
||||
"""
|
||||
user_id: str = Field(..., description="用户ID")
|
||||
user_name: str = Field(..., description="用户名")
|
||||
nickname: str = Field(..., description="用户昵称")
|
||||
gender: str = Field(default="", description="用户性别")
|
||||
avatar: str = Field(..., description="用户头像地址")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
follows: int = Field(default=0, description="关注数")
|
||||
fans: int = Field(default=0, description="粉丝数")
|
||||
registration_duration: str = Field(default="", description="注册时长")
|
||||
user_id: str = Field(..., description="User ID")
|
||||
user_name: str = Field(..., description="Username")
|
||||
nickname: str = Field(..., description="User nickname")
|
||||
gender: str = Field(default="", description="User gender")
|
||||
avatar: str = Field(..., description="User avatar URL")
|
||||
ip_location: Optional[str] = Field(default="", description="IP location")
|
||||
follows: int = Field(default=0, description="Follows count")
|
||||
fans: int = Field(default=0, description="Fans count")
|
||||
registration_duration: str = Field(default="", description="Registration duration")
|
||||
|
||||
@@ -33,11 +33,11 @@ from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class VideoUrlInfo(BaseModel):
|
||||
"""B站视频URL信息"""
|
||||
"""Bilibili video URL information"""
|
||||
video_id: str = Field(title="video id (BV id)")
|
||||
video_type: str = Field(default="video", title="video type")
|
||||
|
||||
|
||||
class CreatorUrlInfo(BaseModel):
|
||||
"""B站创作者URL信息"""
|
||||
"""Bilibili creator URL information"""
|
||||
creator_id: str = Field(title="creator id (UID)")
|
||||
|
||||
@@ -24,11 +24,11 @@ from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class VideoUrlInfo(BaseModel):
|
||||
"""抖音视频URL信息"""
|
||||
"""Douyin video URL information"""
|
||||
aweme_id: str = Field(title="aweme id (video id)")
|
||||
url_type: str = Field(default="normal", title="url type: normal, short, modal")
|
||||
|
||||
|
||||
class CreatorUrlInfo(BaseModel):
|
||||
"""抖音创作者URL信息"""
|
||||
"""Douyin creator URL information"""
|
||||
sec_user_id: str = Field(title="sec_user_id (creator id)")
|
||||
|
||||
@@ -24,11 +24,11 @@ from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class VideoUrlInfo(BaseModel):
|
||||
"""快手视频URL信息"""
|
||||
"""Kuaishou video URL information"""
|
||||
video_id: str = Field(title="video id (photo id)")
|
||||
url_type: str = Field(default="normal", title="url type: normal")
|
||||
|
||||
|
||||
class CreatorUrlInfo(BaseModel):
|
||||
"""快手创作者URL信息"""
|
||||
"""Kuaishou creator URL information"""
|
||||
user_id: str = Field(title="user id (creator id)")
|
||||
|
||||
@@ -31,7 +31,7 @@ class NoteUrlInfo(BaseModel):
|
||||
|
||||
|
||||
class CreatorUrlInfo(BaseModel):
|
||||
"""小红书创作者URL信息"""
|
||||
"""Xiaohongshu creator URL information"""
|
||||
user_id: str = Field(title="user id (creator id)")
|
||||
xsec_token: str = Field(default="", title="xsec token")
|
||||
xsec_source: str = Field(default="", title="xsec source")
|
||||
|
||||
@@ -26,66 +26,66 @@ from pydantic import BaseModel, Field
|
||||
|
||||
class ZhihuContent(BaseModel):
|
||||
"""
|
||||
知乎内容(回答、文章、视频)
|
||||
Zhihu content (answer, article, video)
|
||||
"""
|
||||
content_id: str = Field(default="", description="内容ID")
|
||||
content_type: str = Field(default="", description="内容类型(article | answer | zvideo)")
|
||||
content_text: str = Field(default="", description="内容文本, 如果是视频类型这里为空")
|
||||
content_url: str = Field(default="", description="内容落地链接")
|
||||
question_id: str = Field(default="", description="问题ID, type为answer时有值")
|
||||
title: str = Field(default="", description="内容标题")
|
||||
desc: str = Field(default="", description="内容描述")
|
||||
created_time: int = Field(default=0, description="创建时间")
|
||||
updated_time: int = Field(default=0, description="更新时间")
|
||||
voteup_count: int = Field(default=0, description="赞同人数")
|
||||
comment_count: int = Field(default=0, description="评论数量")
|
||||
source_keyword: str = Field(default="", description="来源关键词")
|
||||
content_id: str = Field(default="", description="Content ID")
|
||||
content_type: str = Field(default="", description="Content type (article | answer | zvideo)")
|
||||
content_text: str = Field(default="", description="Content text, empty for video type")
|
||||
content_url: str = Field(default="", description="Content landing page URL")
|
||||
question_id: str = Field(default="", description="Question ID, has value when type is answer")
|
||||
title: str = Field(default="", description="Content title")
|
||||
desc: str = Field(default="", description="Content description")
|
||||
created_time: int = Field(default=0, description="Create time")
|
||||
updated_time: int = Field(default=0, description="Update time")
|
||||
voteup_count: int = Field(default=0, description="Upvote count")
|
||||
comment_count: int = Field(default=0, description="Comment count")
|
||||
source_keyword: str = Field(default="", description="Source keyword")
|
||||
|
||||
user_id: str = Field(default="", description="用户ID")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
user_url_token: str = Field(default="", description="用户url_token")
|
||||
user_id: str = Field(default="", description="User ID")
|
||||
user_link: str = Field(default="", description="User homepage link")
|
||||
user_nickname: str = Field(default="", description="User nickname")
|
||||
user_avatar: str = Field(default="", description="User avatar URL")
|
||||
user_url_token: str = Field(default="", description="User url_token")
|
||||
|
||||
|
||||
class ZhihuComment(BaseModel):
|
||||
"""
|
||||
知乎评论
|
||||
Zhihu comment
|
||||
"""
|
||||
|
||||
comment_id: str = Field(default="", description="评论ID")
|
||||
parent_comment_id: str = Field(default="", description="父评论ID")
|
||||
content: str = Field(default="", description="评论内容")
|
||||
publish_time: int = Field(default=0, description="发布时间")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
sub_comment_count: int = Field(default=0, description="子评论数")
|
||||
like_count: int = Field(default=0, description="点赞数")
|
||||
dislike_count: int = Field(default=0, description="踩数")
|
||||
content_id: str = Field(default="", description="内容ID")
|
||||
content_type: str = Field(default="", description="内容类型(article | answer | zvideo)")
|
||||
comment_id: str = Field(default="", description="Comment ID")
|
||||
parent_comment_id: str = Field(default="", description="Parent comment ID")
|
||||
content: str = Field(default="", description="Comment content")
|
||||
publish_time: int = Field(default=0, description="Publish time")
|
||||
ip_location: Optional[str] = Field(default="", description="IP location")
|
||||
sub_comment_count: int = Field(default=0, description="Sub-comment count")
|
||||
like_count: int = Field(default=0, description="Like count")
|
||||
dislike_count: int = Field(default=0, description="Dislike count")
|
||||
content_id: str = Field(default="", description="Content ID")
|
||||
content_type: str = Field(default="", description="Content type (article | answer | zvideo)")
|
||||
|
||||
user_id: str = Field(default="", description="用户ID")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
user_id: str = Field(default="", description="User ID")
|
||||
user_link: str = Field(default="", description="User homepage link")
|
||||
user_nickname: str = Field(default="", description="User nickname")
|
||||
user_avatar: str = Field(default="", description="User avatar URL")
|
||||
|
||||
|
||||
class ZhihuCreator(BaseModel):
|
||||
"""
|
||||
知乎创作者
|
||||
Zhihu creator
|
||||
"""
|
||||
user_id: str = Field(default="", description="用户ID")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
url_token: str = Field(default="", description="用户url_token")
|
||||
gender: str = Field(default="", description="用户性别")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
follows: int = Field(default=0, description="关注数")
|
||||
fans: int = Field(default=0, description="粉丝数")
|
||||
anwser_count: int = Field(default=0, description="回答数")
|
||||
video_count: int = Field(default=0, description="视频数")
|
||||
question_count: int = Field(default=0, description="提问数")
|
||||
article_count: int = Field(default=0, description="文章数")
|
||||
column_count: int = Field(default=0, description="专栏数")
|
||||
get_voteup_count: int = Field(default=0, description="获得的赞同数")
|
||||
user_id: str = Field(default="", description="User ID")
|
||||
user_link: str = Field(default="", description="User homepage link")
|
||||
user_nickname: str = Field(default="", description="User nickname")
|
||||
user_avatar: str = Field(default="", description="User avatar URL")
|
||||
url_token: str = Field(default="", description="User url_token")
|
||||
gender: str = Field(default="", description="User gender")
|
||||
ip_location: Optional[str] = Field(default="", description="IP location")
|
||||
follows: int = Field(default=0, description="Follows count")
|
||||
fans: int = Field(default=0, description="Fans count")
|
||||
anwser_count: int = Field(default=0, description="Answer count")
|
||||
video_count: int = Field(default=0, description="Video count")
|
||||
question_count: int = Field(default=0, description="Question count")
|
||||
article_count: int = Field(default=0, description="Article count")
|
||||
column_count: int = Field(default=0, description="Column count")
|
||||
get_voteup_count: int = Field(default=0, description="Total upvotes received")
|
||||
|
||||
@@ -21,5 +21,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 14:37
|
||||
# @Desc : IP代理池入口
|
||||
# @Desc : IP proxy pool entry point
|
||||
from .base_proxy import *
|
||||
|
||||
@@ -21,8 +21,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 11:18
|
||||
# @Desc : 爬虫 IP 获取实现
|
||||
# @Url : 快代理HTTP实现,官方文档:https://www.kuaidaili.com/?ref=ldwkjqipvz6c
|
||||
# @Desc : Crawler IP acquisition implementation
|
||||
# @Url : KuaiDaili HTTP implementation, official documentation: https://www.kuaidaili.com/?ref=ldwkjqipvz6c
|
||||
import json
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
@@ -43,8 +43,8 @@ class ProxyProvider(ABC):
|
||||
@abstractmethod
|
||||
async def get_proxy(self, num: int) -> List[IpInfoModel]:
|
||||
"""
|
||||
获取 IP 的抽象方法,不同的 HTTP 代理商需要实现该方法
|
||||
:param num: 提取的 IP 数量
|
||||
Abstract method to get IP, different HTTP proxy providers need to implement this method
|
||||
:param num: Number of IPs to extract
|
||||
:return:
|
||||
"""
|
||||
raise NotImplementedError
|
||||
@@ -57,7 +57,7 @@ class IpCache:
|
||||
|
||||
def set_ip(self, ip_key: str, ip_value_info: str, ex: int):
|
||||
"""
|
||||
设置IP并带有过期时间,到期之后由 redis 负责删除
|
||||
Set IP with expiration time, Redis is responsible for deletion after expiration
|
||||
:param ip_key:
|
||||
:param ip_value_info:
|
||||
:param ex:
|
||||
@@ -67,8 +67,8 @@ class IpCache:
|
||||
|
||||
def load_all_ip(self, proxy_brand_name: str) -> List[IpInfoModel]:
|
||||
"""
|
||||
从 redis 中加载所有还未过期的 IP 信息
|
||||
:param proxy_brand_name: 代理商名称
|
||||
Load all unexpired IP information from Redis
|
||||
:param proxy_brand_name: Proxy provider name
|
||||
:return:
|
||||
"""
|
||||
all_ip_list: List[IpInfoModel] = []
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2024/4/5 09:32
|
||||
# @Desc : 已废弃!!!!!倒闭了!!!极速HTTP 代理IP实现. 请使用快代理实现(proxy/providers/kuaidl_proxy.py)
|
||||
# @Desc : Deprecated!!!!! Shut down!!! JiSu HTTP proxy IP implementation. Please use KuaiDaili implementation (proxy/providers/kuaidl_proxy.py)
|
||||
import os
|
||||
from typing import Dict, List
|
||||
from urllib.parse import urlencode
|
||||
@@ -36,20 +36,20 @@ class JiSuHttpProxy(ProxyProvider):
|
||||
|
||||
def __init__(self, key: str, crypto: str, time_validity_period: int):
|
||||
"""
|
||||
极速HTTP 代理IP实现
|
||||
:param key: 提取key值 (去官网注册后获取)
|
||||
:param crypto: 加密签名 (去官网注册后获取)
|
||||
JiSu HTTP proxy IP implementation
|
||||
:param key: Extraction key value (obtain after registering on the official website)
|
||||
:param crypto: Encryption signature (obtain after registering on the official website)
|
||||
"""
|
||||
self.proxy_brand_name = "JISUHTTP"
|
||||
self.api_path = "https://api.jisuhttp.com"
|
||||
self.params = {
|
||||
"key": key,
|
||||
"crypto": crypto,
|
||||
"time": time_validity_period, # IP使用时长,支持3、5、10、15、30分钟时效
|
||||
"type": "json", # 数据结果为json
|
||||
"port": "2", # IP协议:1:HTTP、2:HTTPS、3:SOCKS5
|
||||
"pw": "1", # 是否使用账密验证, 1:是,0:否,否表示白名单验证;默认为0
|
||||
"se": "1", # 返回JSON格式时是否显示IP过期时间, 1:显示,0:不显示;默认为0
|
||||
"time": time_validity_period, # IP usage duration, supports 3, 5, 10, 15, 30 minute validity
|
||||
"type": "json", # Data result is json
|
||||
"port": "2", # IP protocol: 1:HTTP, 2:HTTPS, 3:SOCKS5
|
||||
"pw": "1", # Whether to use account password authentication, 1: yes, 0: no, no means whitelist authentication; default is 0
|
||||
"se": "1", # Whether to show IP expiration time when returning JSON format, 1: show, 0: don't show; default is 0
|
||||
}
|
||||
self.ip_cache = IpCache()
|
||||
|
||||
@@ -59,12 +59,12 @@ class JiSuHttpProxy(ProxyProvider):
|
||||
:return:
|
||||
"""
|
||||
|
||||
# 优先从缓存中拿 IP
|
||||
# Prioritize getting IP from cache
|
||||
ip_cache_list = self.ip_cache.load_all_ip(proxy_brand_name=self.proxy_brand_name)
|
||||
if len(ip_cache_list) >= num:
|
||||
return ip_cache_list[:num]
|
||||
|
||||
# 如果缓存中的数量不够,从IP代理商获取补上,再存入缓存中
|
||||
# If the quantity in cache is insufficient, get from IP provider to supplement, then store in cache
|
||||
need_get_count = num - len(ip_cache_list)
|
||||
self.params.update({"num": need_get_count})
|
||||
ip_infos = []
|
||||
@@ -97,12 +97,12 @@ class JiSuHttpProxy(ProxyProvider):
|
||||
|
||||
def new_jisu_http_proxy() -> JiSuHttpProxy:
|
||||
"""
|
||||
构造极速HTTP实例
|
||||
Construct JiSu HTTP instance
|
||||
Returns:
|
||||
|
||||
"""
|
||||
return JiSuHttpProxy(
|
||||
key=os.getenv("jisu_key", ""), # 通过环境变量的方式获取极速HTTPIP提取key值
|
||||
crypto=os.getenv("jisu_crypto", ""), # 通过环境变量的方式获取极速HTTPIP提取加密签名
|
||||
time_validity_period=30 # 30分钟(最长时效)
|
||||
key=os.getenv("jisu_key", ""), # Get JiSu HTTP IP extraction key value through environment variable
|
||||
crypto=os.getenv("jisu_crypto", ""), # Get JiSu HTTP IP extraction encryption signature through environment variable
|
||||
time_validity_period=30 # 30 minutes (maximum validity)
|
||||
)
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2024/4/5 09:43
|
||||
# @Desc : 快代理HTTP实现,官方文档:https://www.kuaidaili.com/?ref=ldwkjqipvz6c
|
||||
# @Desc : KuaiDaili HTTP implementation, official documentation: https://www.kuaidaili.com/?ref=ldwkjqipvz6c
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, List
|
||||
@@ -33,19 +33,19 @@ from proxy import IpCache, IpInfoModel, ProxyProvider
|
||||
from proxy.types import ProviderNameEnum
|
||||
from tools import utils
|
||||
|
||||
# 快代理的IP代理过期时间向前推移5秒,避免临界时间使用失败
|
||||
# KuaiDaili IP proxy expiration time is moved forward by 5 seconds to avoid critical time usage failure
|
||||
DELTA_EXPIRED_SECOND = 5
|
||||
|
||||
|
||||
class KuaidailiProxyModel(BaseModel):
|
||||
ip: str = Field("ip")
|
||||
port: int = Field("端口")
|
||||
expire_ts: int = Field("过期时间,单位秒,多少秒后过期")
|
||||
port: int = Field("port")
|
||||
expire_ts: int = Field("Expiration time, in seconds, how many seconds until expiration")
|
||||
|
||||
|
||||
def parse_kuaidaili_proxy(proxy_info: str) -> KuaidailiProxyModel:
|
||||
"""
|
||||
解析快代理的IP信息
|
||||
Parse KuaiDaili IP information
|
||||
Args:
|
||||
proxy_info:
|
||||
|
||||
@@ -94,7 +94,7 @@ class KuaiDaiLiProxy(ProxyProvider):
|
||||
|
||||
async def get_proxy(self, num: int) -> List[IpInfoModel]:
|
||||
"""
|
||||
快代理实现
|
||||
KuaiDaili implementation
|
||||
Args:
|
||||
num:
|
||||
|
||||
@@ -103,12 +103,12 @@ class KuaiDaiLiProxy(ProxyProvider):
|
||||
"""
|
||||
uri = "/api/getdps/"
|
||||
|
||||
# 优先从缓存中拿 IP
|
||||
# Prioritize getting IP from cache
|
||||
ip_cache_list = self.ip_cache.load_all_ip(proxy_brand_name=self.proxy_brand_name)
|
||||
if len(ip_cache_list) >= num:
|
||||
return ip_cache_list[:num]
|
||||
|
||||
# 如果缓存中的数量不够,从IP代理商获取补上,再存入缓存中
|
||||
# If the quantity in cache is insufficient, get from IP provider to supplement, then store in cache
|
||||
need_get_count = num - len(ip_cache_list)
|
||||
self.params.update({"num": need_get_count})
|
||||
|
||||
@@ -128,8 +128,8 @@ class KuaiDaiLiProxy(ProxyProvider):
|
||||
proxy_list: List[str] = ip_response.get("data", {}).get("proxy_list")
|
||||
for proxy in proxy_list:
|
||||
proxy_model = parse_kuaidaili_proxy(proxy)
|
||||
# expire_ts是相对时间(秒数),需要转换为绝对时间戳
|
||||
# 提前DELTA_EXPIRED_SECOND秒认为过期,避免临界时间使用失败
|
||||
# expire_ts is relative time (seconds), needs to be converted to absolute timestamp
|
||||
# Consider expired DELTA_EXPIRED_SECOND seconds in advance to avoid critical time usage failure
|
||||
ip_info_model = IpInfoModel(
|
||||
ip=proxy_model.ip,
|
||||
port=proxy_model.port,
|
||||
@@ -139,7 +139,7 @@ class KuaiDaiLiProxy(ProxyProvider):
|
||||
|
||||
)
|
||||
ip_key = f"{self.proxy_brand_name}_{ip_info_model.ip}_{ip_info_model.port}"
|
||||
# 缓存过期时间使用相对时间(秒数),也需要减去缓冲时间
|
||||
# Cache expiration time uses relative time (seconds), also needs to subtract buffer time
|
||||
self.ip_cache.set_ip(ip_key, ip_info_model.model_dump_json(), ex=proxy_model.expire_ts - DELTA_EXPIRED_SECOND)
|
||||
ip_infos.append(ip_info_model)
|
||||
|
||||
@@ -148,19 +148,19 @@ class KuaiDaiLiProxy(ProxyProvider):
|
||||
|
||||
def new_kuai_daili_proxy() -> KuaiDaiLiProxy:
|
||||
"""
|
||||
构造快代理HTTP实例
|
||||
支持两种环境变量命名格式:
|
||||
1. 大写格式:KDL_SECERT_ID, KDL_SIGNATURE, KDL_USER_NAME, KDL_USER_PWD
|
||||
2. 小写格式:kdl_secret_id, kdl_signature, kdl_user_name, kdl_user_pwd
|
||||
优先使用大写格式,如果不存在则使用小写格式
|
||||
Construct KuaiDaili HTTP instance
|
||||
Supports two environment variable naming formats:
|
||||
1. Uppercase format: KDL_SECERT_ID, KDL_SIGNATURE, KDL_USER_NAME, KDL_USER_PWD
|
||||
2. Lowercase format: kdl_secret_id, kdl_signature, kdl_user_name, kdl_user_pwd
|
||||
Prioritize uppercase format, use lowercase format if not exists
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 支持大小写两种环境变量格式,优先使用大写
|
||||
kdl_secret_id = os.getenv("KDL_SECERT_ID") or os.getenv("kdl_secret_id", "你的快代理secert_id")
|
||||
kdl_signature = os.getenv("KDL_SIGNATURE") or os.getenv("kdl_signature", "你的快代理签名")
|
||||
kdl_user_name = os.getenv("KDL_USER_NAME") or os.getenv("kdl_user_name", "你的快代理用户名")
|
||||
kdl_user_pwd = os.getenv("KDL_USER_PWD") or os.getenv("kdl_user_pwd", "你的快代理密码")
|
||||
# Support both uppercase and lowercase environment variable formats, prioritize uppercase
|
||||
kdl_secret_id = os.getenv("KDL_SECERT_ID") or os.getenv("kdl_secret_id", "your_kuaidaili_secret_id")
|
||||
kdl_signature = os.getenv("KDL_SIGNATURE") or os.getenv("kdl_signature", "your_kuaidaili_signature")
|
||||
kdl_user_name = os.getenv("KDL_USER_NAME") or os.getenv("kdl_user_name", "your_kuaidaili_username")
|
||||
kdl_user_pwd = os.getenv("KDL_USER_PWD") or os.getenv("kdl_user_pwd", "your_kuaidaili_password")
|
||||
|
||||
return KuaiDaiLiProxy(
|
||||
kdl_secret_id=kdl_secret_id,
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2025/7/31
|
||||
# @Desc : 豌豆HTTP 代理IP实现
|
||||
# @Desc : WanDou HTTP proxy IP implementation
|
||||
import os
|
||||
from typing import Dict, List
|
||||
from urllib.parse import urlencode
|
||||
@@ -36,9 +36,9 @@ class WanDouHttpProxy(ProxyProvider):
|
||||
|
||||
def __init__(self, app_key: str, num: int = 100):
|
||||
"""
|
||||
豌豆HTTP 代理IP实现
|
||||
:param app_key: 开放的app_key,可以通过用户中心获取
|
||||
:param num: 单次提取IP数量,最大100
|
||||
WanDou HTTP proxy IP implementation
|
||||
:param app_key: Open app_key, can be obtained through user center
|
||||
:param num: Number of IPs extracted at once, maximum 100
|
||||
"""
|
||||
self.proxy_brand_name = "WANDOUHTTP"
|
||||
self.api_path = "https://api.wandouapp.com/"
|
||||
@@ -54,16 +54,16 @@ class WanDouHttpProxy(ProxyProvider):
|
||||
:return:
|
||||
"""
|
||||
|
||||
# 优先从缓存中拿 IP
|
||||
# Prioritize getting IP from cache
|
||||
ip_cache_list = self.ip_cache.load_all_ip(
|
||||
proxy_brand_name=self.proxy_brand_name
|
||||
)
|
||||
if len(ip_cache_list) >= num:
|
||||
return ip_cache_list[:num]
|
||||
|
||||
# 如果缓存中的数量不够,从IP代理商获取补上,再存入缓存中
|
||||
# If the quantity in cache is insufficient, get from IP provider to supplement, then store in cache
|
||||
need_get_count = num - len(ip_cache_list)
|
||||
self.params.update({"num": min(need_get_count, 100)}) # 最大100
|
||||
self.params.update({"num": min(need_get_count, 100)}) # Maximum 100
|
||||
ip_infos = []
|
||||
async with httpx.AsyncClient() as client:
|
||||
url = self.api_path + "?" + urlencode(self.params)
|
||||
@@ -82,7 +82,7 @@ class WanDouHttpProxy(ProxyProvider):
|
||||
ip_info_model = IpInfoModel(
|
||||
ip=ip_item.get("ip"),
|
||||
port=ip_item.get("port"),
|
||||
user="", # 豌豆HTTP不需要用户名密码认证
|
||||
user="", # WanDou HTTP does not require username password authentication
|
||||
password="",
|
||||
expired_time_ts=utils.get_unix_time_from_time_str(
|
||||
ip_item.get("expire_time")
|
||||
@@ -96,27 +96,27 @@ class WanDouHttpProxy(ProxyProvider):
|
||||
)
|
||||
else:
|
||||
error_msg = res_dict.get("msg", "unknown error")
|
||||
# 处理具体错误码
|
||||
# Handle specific error codes
|
||||
error_code = res_dict.get("code")
|
||||
if error_code == 10001:
|
||||
error_msg = "通用错误,具体错误信息查看msg内容"
|
||||
error_msg = "General error, check msg content for specific error information"
|
||||
elif error_code == 10048:
|
||||
error_msg = "没有可用套餐"
|
||||
error_msg = "No available package"
|
||||
raise IpGetError(f"{error_msg} (code: {error_code})")
|
||||
return ip_cache_list + ip_infos
|
||||
|
||||
|
||||
def new_wandou_http_proxy() -> WanDouHttpProxy:
|
||||
"""
|
||||
构造豌豆HTTP实例
|
||||
支持两种环境变量命名格式:
|
||||
1. 大写格式:WANDOU_APP_KEY
|
||||
2. 小写格式:wandou_app_key
|
||||
优先使用大写格式,如果不存在则使用小写格式
|
||||
Construct WanDou HTTP instance
|
||||
Supports two environment variable naming formats:
|
||||
1. Uppercase format: WANDOU_APP_KEY
|
||||
2. Lowercase format: wandou_app_key
|
||||
Prioritize uppercase format, use lowercase format if not exists
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 支持大小写两种环境变量格式,优先使用大写
|
||||
app_key = os.getenv("WANDOU_APP_KEY") or os.getenv("wandou_app_key", "你的豌豆HTTP app_key")
|
||||
# Support both uppercase and lowercase environment variable formats, prioritize uppercase
|
||||
app_key = os.getenv("WANDOU_APP_KEY") or os.getenv("wandou_app_key", "your_wandou_http_app_key")
|
||||
|
||||
return WanDouHttpProxy(app_key=app_key)
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 13:45
|
||||
# @Desc : ip代理池实现
|
||||
# @Desc : IP proxy pool implementation
|
||||
import random
|
||||
from typing import Dict, List
|
||||
|
||||
@@ -50,16 +50,16 @@ class ProxyIpPool:
|
||||
enable_validate_ip:
|
||||
ip_provider:
|
||||
"""
|
||||
self.valid_ip_url = "https://echo.apifox.cn/" # 验证 IP 是否有效的地址
|
||||
self.valid_ip_url = "https://echo.apifox.cn/" # URL to validate if IP is valid
|
||||
self.ip_pool_count = ip_pool_count
|
||||
self.enable_validate_ip = enable_validate_ip
|
||||
self.proxy_list: List[IpInfoModel] = []
|
||||
self.ip_provider: ProxyProvider = ip_provider
|
||||
self.current_proxy: IpInfoModel | None = None # 当前正在使用的代理
|
||||
self.current_proxy: IpInfoModel | None = None # Currently used proxy
|
||||
|
||||
async def load_proxies(self) -> None:
|
||||
"""
|
||||
加载IP代理
|
||||
Load IP proxies
|
||||
Returns:
|
||||
|
||||
"""
|
||||
@@ -67,7 +67,7 @@ class ProxyIpPool:
|
||||
|
||||
async def _is_valid_proxy(self, proxy: IpInfoModel) -> bool:
|
||||
"""
|
||||
验证代理IP是否有效
|
||||
Validate if proxy IP is valid
|
||||
:param proxy:
|
||||
:return:
|
||||
"""
|
||||
@@ -75,7 +75,7 @@ class ProxyIpPool:
|
||||
f"[ProxyIpPool._is_valid_proxy] testing {proxy.ip} is it valid "
|
||||
)
|
||||
try:
|
||||
# httpx 0.28.1 需要直接传入代理URL字符串,而不是字典
|
||||
# httpx 0.28.1 requires passing proxy URL string directly, not a dictionary
|
||||
if proxy.user and proxy.password:
|
||||
proxy_url = f"http://{proxy.user}:{proxy.password}@{proxy.ip}:{proxy.port}"
|
||||
else:
|
||||
@@ -96,29 +96,29 @@ class ProxyIpPool:
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
async def get_proxy(self) -> IpInfoModel:
|
||||
"""
|
||||
从代理池中随机提取一个代理IP
|
||||
Randomly extract a proxy IP from the proxy pool
|
||||
:return:
|
||||
"""
|
||||
if len(self.proxy_list) == 0:
|
||||
await self._reload_proxies()
|
||||
|
||||
proxy = random.choice(self.proxy_list)
|
||||
self.proxy_list.remove(proxy) # 取出来一个IP就应该移出掉
|
||||
self.proxy_list.remove(proxy) # Remove an IP once extracted
|
||||
if self.enable_validate_ip:
|
||||
if not await self._is_valid_proxy(proxy):
|
||||
raise Exception(
|
||||
"[ProxyIpPool.get_proxy] current ip invalid and again get it"
|
||||
)
|
||||
self.current_proxy = proxy # 保存当前使用的代理
|
||||
self.current_proxy = proxy # Save currently used proxy
|
||||
return proxy
|
||||
|
||||
def is_current_proxy_expired(self, buffer_seconds: int = 30) -> bool:
|
||||
"""
|
||||
检测当前代理是否已过期
|
||||
Check if current proxy has expired
|
||||
Args:
|
||||
buffer_seconds: 缓冲时间(秒),提前多少秒认为已过期
|
||||
buffer_seconds: Buffer time in seconds; the proxy is treated as expired this many seconds before its actual expiry
|
||||
Returns:
|
||||
bool: True表示已过期或没有当前代理,False表示仍然有效
|
||||
bool: True means expired or no current proxy, False means still valid
|
||||
"""
|
||||
if self.current_proxy is None:
|
||||
return True
|
||||
@@ -126,12 +126,12 @@ class ProxyIpPool:
|
||||
|
||||
async def get_or_refresh_proxy(self, buffer_seconds: int = 30) -> IpInfoModel:
|
||||
"""
|
||||
获取当前代理,如果已过期则自动刷新
|
||||
每次发起请求前调用此方法来确保代理有效
|
||||
Get current proxy, automatically refresh if expired
|
||||
Call this method before each request to ensure proxy is valid
|
||||
Args:
|
||||
buffer_seconds: 缓冲时间(秒),提前多少秒认为已过期
|
||||
buffer_seconds: Buffer time in seconds; the proxy is treated as expired this many seconds before its actual expiry
|
||||
Returns:
|
||||
IpInfoModel: 有效的代理IP信息
|
||||
IpInfoModel: Valid proxy IP information
|
||||
"""
|
||||
if self.is_current_proxy_expired(buffer_seconds):
|
||||
utils.logger.info(
|
||||
@@ -142,7 +142,7 @@ class ProxyIpPool:
|
||||
|
||||
async def _reload_proxies(self):
|
||||
"""
|
||||
# 重新加载代理池
|
||||
Reload proxy pool
|
||||
:return:
|
||||
"""
|
||||
self.proxy_list = []
|
||||
@@ -157,9 +157,9 @@ IpProxyProvider: Dict[str, ProxyProvider] = {
|
||||
|
||||
async def create_ip_pool(ip_pool_count: int, enable_validate_ip: bool) -> ProxyIpPool:
|
||||
"""
|
||||
创建 IP 代理池
|
||||
:param ip_pool_count: ip池子的数量
|
||||
:param enable_validate_ip: 是否开启验证IP代理
|
||||
Create IP proxy pool
|
||||
:param ip_pool_count: Number of IPs in the pool
|
||||
:param enable_validate_ip: Whether to enable IP proxy validation
|
||||
:return:
|
||||
"""
|
||||
pool = ProxyIpPool(
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2025/11/25
|
||||
# @Desc : 代理自动刷新 Mixin 类,供各平台 client 使用
|
||||
# @Desc : Auto-refresh proxy Mixin class for use by various platform clients
|
||||
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
@@ -33,31 +33,31 @@ if TYPE_CHECKING:
|
||||
|
||||
class ProxyRefreshMixin:
|
||||
"""
|
||||
代理自动刷新 Mixin 类
|
||||
Auto-refresh proxy Mixin class
|
||||
|
||||
使用方法:
|
||||
1. 让 client 类继承此 Mixin
|
||||
2. 在 client 的 __init__ 中调用 init_proxy_pool(proxy_ip_pool)
|
||||
3. 在每次 request 方法调用前调用 await _refresh_proxy_if_expired()
|
||||
Usage:
|
||||
1. Let client class inherit this Mixin
|
||||
2. Call init_proxy_pool(proxy_ip_pool) in client's __init__
|
||||
3. Call await _refresh_proxy_if_expired() before each request method call
|
||||
|
||||
要求:
|
||||
- client 类必须有 self.proxy 属性来存储当前代理URL
|
||||
Requirements:
|
||||
- client class must have self.proxy attribute to store current proxy URL
|
||||
"""
|
||||
|
||||
_proxy_ip_pool: Optional["ProxyIpPool"] = None
|
||||
|
||||
def init_proxy_pool(self, proxy_ip_pool: Optional["ProxyIpPool"]) -> None:
|
||||
"""
|
||||
初始化代理池引用
|
||||
Initialize proxy pool reference
|
||||
Args:
|
||||
proxy_ip_pool: 代理IP池实例
|
||||
proxy_ip_pool: Proxy IP pool instance
|
||||
"""
|
||||
self._proxy_ip_pool = proxy_ip_pool
|
||||
|
||||
async def _refresh_proxy_if_expired(self) -> None:
|
||||
"""
|
||||
检测代理是否过期,如果过期则自动刷新
|
||||
每次发起请求前调用此方法来确保代理有效
|
||||
Check if proxy has expired, automatically refresh if so
|
||||
Call this method before each request to ensure proxy is valid
|
||||
"""
|
||||
if self._proxy_ip_pool is None:
|
||||
return
|
||||
@@ -67,7 +67,7 @@ class ProxyRefreshMixin:
|
||||
f"[{self.__class__.__name__}._refresh_proxy_if_expired] Proxy expired, refreshing..."
|
||||
)
|
||||
new_proxy = await self._proxy_ip_pool.get_or_refresh_proxy()
|
||||
# 更新 httpx 代理URL
|
||||
# Update httpx proxy URL
|
||||
if new_proxy.user and new_proxy.password:
|
||||
self.proxy = f"http://{new_proxy.user}:{new_proxy.password}@{new_proxy.ip}:{new_proxy.port}"
|
||||
else:
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2024/4/5 10:18
|
||||
# @Desc : 基础类型
|
||||
# @Desc : Basic types
|
||||
import time
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
@@ -38,19 +38,19 @@ class IpInfoModel(BaseModel):
|
||||
"""Unified IP model"""
|
||||
|
||||
ip: str = Field(title="ip")
|
||||
port: int = Field(title="端口")
|
||||
user: str = Field(title="IP代理认证的用户名")
|
||||
protocol: str = Field(default="https://", title="代理IP的协议")
|
||||
password: str = Field(title="IP代理认证用户的密码")
|
||||
expired_time_ts: Optional[int] = Field(default=None, title="IP 过期时间")
|
||||
port: int = Field(title="port")
|
||||
user: str = Field(title="Username for IP proxy authentication")
|
||||
protocol: str = Field(default="https://", title="Protocol for proxy IP")
|
||||
password: str = Field(title="Password for IP proxy authentication user")
|
||||
expired_time_ts: Optional[int] = Field(default=None, title="IP expiration time")
|
||||
|
||||
def is_expired(self, buffer_seconds: int = 30) -> bool:
|
||||
"""
|
||||
检测代理IP是否已过期
|
||||
Check if proxy IP has expired
|
||||
Args:
|
||||
buffer_seconds: 缓冲时间(秒),提前多少秒认为已过期,避免临界时间请求失败
|
||||
buffer_seconds: Buffer time in seconds; the proxy is treated as expired this many seconds before its actual expiry, so that requests issued right at the expiry boundary do not fail
|
||||
Returns:
|
||||
bool: True表示已过期或即将过期,False表示仍然有效
|
||||
bool: True means expired or about to expire, False means still valid
|
||||
"""
|
||||
if self.expired_time_ts is None:
|
||||
return False
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : persist1@126.com
|
||||
# @Time : 2025/9/5 19:34
|
||||
# @Desc : B站存储实现类
|
||||
# @Desc : Bilibili storage implementation class
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
@@ -310,16 +310,16 @@ class BiliSqliteStoreImplement(BiliDbStoreImplement):
|
||||
|
||||
|
||||
class BiliMongoStoreImplement(AbstractStore):
|
||||
"""B站MongoDB存储实现"""
|
||||
"""Bilibili MongoDB storage implementation"""
|
||||
|
||||
def __init__(self):
|
||||
self.mongo_store = MongoDBStoreBase(collection_prefix="bilibili")
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
存储视频内容到MongoDB
|
||||
Store video content to MongoDB
|
||||
Args:
|
||||
content_item: 视频内容数据
|
||||
content_item: Video content data
|
||||
"""
|
||||
video_id = content_item.get("video_id")
|
||||
if not video_id:
|
||||
@@ -334,9 +334,9 @@ class BiliMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
存储评论到MongoDB
|
||||
Store comment to MongoDB
|
||||
Args:
|
||||
comment_item: 评论数据
|
||||
comment_item: Comment data
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
if not comment_id:
|
||||
@@ -351,9 +351,9 @@ class BiliMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator_item: Dict):
|
||||
"""
|
||||
存储UP主信息到MongoDB
|
||||
Store uploader (Bilibili "UP" creator) information to MongoDB
|
||||
Args:
|
||||
creator_item: UP主数据
|
||||
creator_item: Uploader (Bilibili "UP" creator) data
|
||||
"""
|
||||
user_id = creator_item.get("user_id")
|
||||
if not user_id:
|
||||
@@ -368,7 +368,7 @@ class BiliMongoStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class BiliExcelStoreImplement:
|
||||
"""B站Excel存储实现 - 全局单例"""
|
||||
"""Bilibili Excel storage implementation - Global singleton"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
from store.excel_store_base import ExcelStoreBase
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : helloteemo
|
||||
# @Time : 2024/7/12 20:01
|
||||
# @Desc : bilibili 媒体保存
|
||||
# @Desc : Bilibili media storage
|
||||
import pathlib
|
||||
from typing import Dict
|
||||
|
||||
|
||||
@@ -50,13 +50,13 @@ class DouyinStoreFactory:
|
||||
|
||||
def _extract_note_image_list(aweme_detail: Dict) -> List[str]:
|
||||
"""
|
||||
提取笔记图片列表
|
||||
Extract note image list
|
||||
|
||||
Args:
|
||||
aweme_detail (Dict): 抖音内容详情
|
||||
aweme_detail (Dict): Douyin content details
|
||||
|
||||
Returns:
|
||||
List[str]: 笔记图片列表
|
||||
List[str]: Note image list
|
||||
"""
|
||||
images_res: List[str] = []
|
||||
images: List[Dict] = aweme_detail.get("images", [])
|
||||
@@ -65,7 +65,7 @@ def _extract_note_image_list(aweme_detail: Dict) -> List[str]:
|
||||
return []
|
||||
|
||||
for image in images:
|
||||
image_url_list = image.get("url_list", []) # download_url_list 为带水印的图片,url_list 为无水印的图片
|
||||
image_url_list = image.get("url_list", []) # download_url_list has watermarked images, url_list has non-watermarked images
|
||||
if image_url_list:
|
||||
images_res.append(image_url_list[-1])
|
||||
|
||||
@@ -74,13 +74,13 @@ def _extract_note_image_list(aweme_detail: Dict) -> List[str]:
|
||||
|
||||
def _extract_comment_image_list(comment_item: Dict) -> List[str]:
|
||||
"""
|
||||
提取评论图片列表
|
||||
Extract comment image list
|
||||
|
||||
Args:
|
||||
comment_item (Dict): 抖音评论
|
||||
comment_item (Dict): Douyin comment
|
||||
|
||||
Returns:
|
||||
List[str]: 评论图片列表
|
||||
List[str]: Comment image list
|
||||
"""
|
||||
images_res: List[str] = []
|
||||
image_list: List[Dict] = comment_item.get("image_list", [])
|
||||
@@ -98,13 +98,13 @@ def _extract_comment_image_list(comment_item: Dict) -> List[str]:
|
||||
|
||||
def _extract_content_cover_url(aweme_detail: Dict) -> str:
|
||||
"""
|
||||
提取视频封面地址
|
||||
Extract video cover URL
|
||||
|
||||
Args:
|
||||
aweme_detail (Dict): 抖音内容详情
|
||||
aweme_detail (Dict): Douyin content details
|
||||
|
||||
Returns:
|
||||
str: 视频封面地址
|
||||
str: Video cover URL
|
||||
"""
|
||||
res_cover_url = ""
|
||||
|
||||
@@ -118,13 +118,13 @@ def _extract_content_cover_url(aweme_detail: Dict) -> str:
|
||||
|
||||
def _extract_video_download_url(aweme_detail: Dict) -> str:
|
||||
"""
|
||||
提取视频下载地址
|
||||
Extract video download URL
|
||||
|
||||
Args:
|
||||
aweme_detail (Dict): 抖音视频
|
||||
aweme_detail (Dict): Douyin video
|
||||
|
||||
Returns:
|
||||
str: 视频下载地址
|
||||
str: Video download URL
|
||||
"""
|
||||
video_item = aweme_detail.get("video", {})
|
||||
url_h264_list = video_item.get("play_addr_h264", {}).get("url_list", [])
|
||||
@@ -138,13 +138,13 @@ def _extract_video_download_url(aweme_detail: Dict) -> str:
|
||||
|
||||
def _extract_music_download_url(aweme_detail: Dict) -> str:
|
||||
"""
|
||||
提取音乐下载地址
|
||||
Extract music download URL
|
||||
|
||||
Args:
|
||||
aweme_detail (Dict): 抖音视频
|
||||
aweme_detail (Dict): Douyin video
|
||||
|
||||
Returns:
|
||||
str: 音乐下载地址
|
||||
str: Music download URL
|
||||
"""
|
||||
music_item = aweme_detail.get("music", {})
|
||||
play_url = music_item.get("play_url", {})
|
||||
@@ -228,12 +228,12 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
|
||||
|
||||
async def save_creator(user_id: str, creator: Dict):
|
||||
user_info = creator.get("user", {})
|
||||
gender_map = {0: "未知", 1: "男", 2: "女"}
|
||||
gender_map = {0: "Unknown", 1: "Male", 2: "Female"}
|
||||
avatar_uri = user_info.get("avatar_300x300", {}).get("uri")
|
||||
local_db_item = {
|
||||
"user_id": user_id,
|
||||
"nickname": user_info.get("nickname"),
|
||||
"gender": gender_map.get(user_info.get("gender"), "未知"),
|
||||
"gender": gender_map.get(user_info.get("gender"), "Unknown"),
|
||||
"avatar": f"https://p3-pc.douyinpic.com/img/{avatar_uri}" + r"~c5_300x300.jpeg?from=2956013662",
|
||||
"desc": user_info.get("signature"),
|
||||
"ip_location": user_info.get("ip_location"),
|
||||
@@ -249,7 +249,7 @@ async def save_creator(user_id: str, creator: Dict):
|
||||
|
||||
async def update_dy_aweme_image(aweme_id, pic_content, extension_file_name):
|
||||
"""
|
||||
更新抖音笔记图片
|
||||
Update Douyin note image
|
||||
Args:
|
||||
aweme_id:
|
||||
pic_content:
|
||||
@@ -264,7 +264,7 @@ async def update_dy_aweme_image(aweme_id, pic_content, extension_file_name):
|
||||
|
||||
async def update_dy_aweme_video(aweme_id, video_content, extension_file_name):
|
||||
"""
|
||||
更新抖音短视频
|
||||
Update Douyin short video
|
||||
Args:
|
||||
aweme_id:
|
||||
video_content:
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : persist1@126.com
|
||||
# @Time : 2025/9/5 19:34
|
||||
# @Desc : 抖音存储实现类
|
||||
# @Desc : Douyin storage implementation class
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
@@ -209,16 +209,16 @@ class DouyinSqliteStoreImplement(DouyinDbStoreImplement):
|
||||
|
||||
|
||||
class DouyinMongoStoreImplement(AbstractStore):
|
||||
"""抖音MongoDB存储实现"""
|
||||
"""Douyin MongoDB storage implementation"""
|
||||
|
||||
def __init__(self):
|
||||
self.mongo_store = MongoDBStoreBase(collection_prefix="douyin")
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
存储视频内容到MongoDB
|
||||
Store video content to MongoDB
|
||||
Args:
|
||||
content_item: 视频内容数据
|
||||
content_item: Video content data
|
||||
"""
|
||||
aweme_id = content_item.get("aweme_id")
|
||||
if not aweme_id:
|
||||
@@ -233,9 +233,9 @@ class DouyinMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
存储评论到MongoDB
|
||||
Store comment to MongoDB
|
||||
Args:
|
||||
comment_item: 评论数据
|
||||
comment_item: Comment data
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
if not comment_id:
|
||||
@@ -250,9 +250,9 @@ class DouyinMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator_item: Dict):
|
||||
"""
|
||||
存储创作者信息到MongoDB
|
||||
Store creator information to MongoDB
|
||||
Args:
|
||||
creator_item: 创作者数据
|
||||
creator_item: Creator data
|
||||
"""
|
||||
user_id = creator_item.get("user_id")
|
||||
if not user_id:
|
||||
@@ -267,7 +267,7 @@ class DouyinMongoStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class DouyinExcelStoreImplement:
|
||||
"""抖音Excel存储实现 - 全局单例"""
|
||||
"""Douyin Excel storage implementation - Global singleton"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
from store.excel_store_base import ExcelStoreBase
|
||||
|
||||
@@ -109,7 +109,7 @@ async def save_creator(user_id: str, creator: Dict):
|
||||
local_db_item = {
|
||||
'user_id': user_id,
|
||||
'nickname': profile.get('user_name'),
|
||||
'gender': '女' if profile.get('gender') == "F" else '男',
|
||||
'gender': 'Female' if profile.get('gender') == "F" else 'Male',
|
||||
'avatar': profile.get('headurl'),
|
||||
'desc': profile.get('user_text'),
|
||||
'ip_location': "",
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : persist1@126.com
|
||||
# @Time : 2025/9/5 19:34
|
||||
# @Desc : 快手存储实现类
|
||||
# @Desc : Kuaishou storage implementation class
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
@@ -43,7 +43,7 @@ from database.mongodb_store_base import MongoDBStoreBase
|
||||
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
|
||||
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
||||
"""Calculate the prefix sorting number for data save files, supporting writing to different files for each run
|
||||
Args:
|
||||
file_store_path:
|
||||
Returns:
|
||||
@@ -171,16 +171,16 @@ class KuaishouSqliteStoreImplement(KuaishouDbStoreImplement):
|
||||
|
||||
|
||||
class KuaishouMongoStoreImplement(AbstractStore):
|
||||
"""快手MongoDB存储实现"""
|
||||
"""Kuaishou MongoDB storage implementation"""
|
||||
|
||||
def __init__(self):
|
||||
self.mongo_store = MongoDBStoreBase(collection_prefix="kuaishou")
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
存储视频内容到MongoDB
|
||||
Store video content to MongoDB
|
||||
Args:
|
||||
content_item: 视频内容数据
|
||||
content_item: Video content data
|
||||
"""
|
||||
video_id = content_item.get("video_id")
|
||||
if not video_id:
|
||||
@@ -195,9 +195,9 @@ class KuaishouMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
存储评论到MongoDB
|
||||
Store comment to MongoDB
|
||||
Args:
|
||||
comment_item: 评论数据
|
||||
comment_item: Comment data
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
if not comment_id:
|
||||
@@ -212,9 +212,9 @@ class KuaishouMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator_item: Dict):
|
||||
"""
|
||||
存储创作者信息到MongoDB
|
||||
Store creator information to MongoDB
|
||||
Args:
|
||||
creator_item: 创作者数据
|
||||
creator_item: Creator data
|
||||
"""
|
||||
user_id = creator_item.get("user_id")
|
||||
if not user_id:
|
||||
@@ -229,7 +229,7 @@ class KuaishouMongoStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class KuaishouExcelStoreImplement:
|
||||
"""快手Excel存储实现 - 全局单例"""
|
||||
"""Kuaishou Excel storage implementation - Global singleton"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
from store.excel_store_base import ExcelStoreBase
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : persist1@126.com
|
||||
# @Time : 2025/9/5 19:34
|
||||
# @Desc : 贴吧存储实现类
|
||||
# @Desc : Tieba storage implementation class
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
@@ -44,7 +44,7 @@ from database.mongodb_store_base import MongoDBStoreBase
|
||||
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
|
||||
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
||||
"""Calculate the prefix sorting number for data save files, supporting writing to different files for each run
|
||||
Args:
|
||||
file_store_path:
|
||||
Returns:
|
||||
@@ -203,16 +203,16 @@ class TieBaSqliteStoreImplement(TieBaDbStoreImplement):
|
||||
|
||||
|
||||
class TieBaMongoStoreImplement(AbstractStore):
|
||||
"""贴吧MongoDB存储实现"""
|
||||
"""Tieba MongoDB storage implementation"""
|
||||
|
||||
def __init__(self):
|
||||
self.mongo_store = MongoDBStoreBase(collection_prefix="tieba")
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
存储帖子内容到MongoDB
|
||||
Store post content to MongoDB
|
||||
Args:
|
||||
content_item: 帖子内容数据
|
||||
content_item: Post content data
|
||||
"""
|
||||
note_id = content_item.get("note_id")
|
||||
if not note_id:
|
||||
@@ -227,9 +227,9 @@ class TieBaMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
存储评论到MongoDB
|
||||
Store comment to MongoDB
|
||||
Args:
|
||||
comment_item: 评论数据
|
||||
comment_item: Comment data
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
if not comment_id:
|
||||
@@ -244,9 +244,9 @@ class TieBaMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator_item: Dict):
|
||||
"""
|
||||
存储创作者信息到MongoDB
|
||||
Store creator information to MongoDB
|
||||
Args:
|
||||
creator_item: 创作者数据
|
||||
creator_item: Creator data
|
||||
"""
|
||||
user_id = creator_item.get("user_id")
|
||||
if not user_id:
|
||||
@@ -261,7 +261,7 @@ class TieBaMongoStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class TieBaExcelStoreImplement:
|
||||
"""贴吧Excel存储实现 - 全局单例"""
|
||||
"""Tieba Excel storage implementation - Global singleton"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
from store.excel_store_base import ExcelStoreBase
|
||||
|
||||
@@ -188,7 +188,7 @@ async def save_creator(user_id: str, user_info: Dict):
|
||||
local_db_item = {
|
||||
'user_id': user_id,
|
||||
'nickname': user_info.get('screen_name'),
|
||||
'gender': '女' if user_info.get('gender') == "f" else '男',
|
||||
'gender': 'Female' if user_info.get('gender') == "f" else 'Male',
|
||||
'avatar': user_info.get('avatar_hd'),
|
||||
'desc': user_info.get('description'),
|
||||
'ip_location': user_info.get("source", "").replace("来自", ""),
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : persist1@126.com
|
||||
# @Time : 2025/9/5 19:34
|
||||
# @Desc : 微博存储实现类
|
||||
# @Desc : Weibo storage implementation class
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
@@ -44,7 +44,7 @@ from database.mongodb_store_base import MongoDBStoreBase
|
||||
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
|
||||
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
||||
"""Calculate the prefix sorting number for data save files, supporting writing to different files for each run
|
||||
Args:
|
||||
file_store_path:
|
||||
Returns:
|
||||
@@ -225,16 +225,16 @@ class WeiboSqliteStoreImplement(WeiboDbStoreImplement):
|
||||
|
||||
|
||||
class WeiboMongoStoreImplement(AbstractStore):
|
||||
"""微博MongoDB存储实现"""
|
||||
"""Weibo MongoDB storage implementation"""
|
||||
|
||||
def __init__(self):
|
||||
self.mongo_store = MongoDBStoreBase(collection_prefix="weibo")
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
存储微博内容到MongoDB
|
||||
Store Weibo content to MongoDB
|
||||
Args:
|
||||
content_item: 微博内容数据
|
||||
content_item: Weibo content data
|
||||
"""
|
||||
note_id = content_item.get("note_id")
|
||||
if not note_id:
|
||||
@@ -249,9 +249,9 @@ class WeiboMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
存储评论到MongoDB
|
||||
Store comment to MongoDB
|
||||
Args:
|
||||
comment_item: 评论数据
|
||||
comment_item: Comment data
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
if not comment_id:
|
||||
@@ -266,9 +266,9 @@ class WeiboMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator_item: Dict):
|
||||
"""
|
||||
存储创作者信息到MongoDB
|
||||
Store creator information to MongoDB
|
||||
Args:
|
||||
creator_item: 创作者数据
|
||||
creator_item: Creator data
|
||||
"""
|
||||
user_id = creator_item.get("user_id")
|
||||
if not user_id:
|
||||
@@ -283,7 +283,7 @@ class WeiboMongoStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class WeiboExcelStoreImplement:
|
||||
"""微博Excel存储实现 - 全局单例"""
|
||||
"""Weibo Excel storage implementation - Global singleton"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
from store.excel_store_base import ExcelStoreBase
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : Erm
|
||||
# @Time : 2024/4/9 17:35
|
||||
# @Desc : 微博媒体保存
|
||||
# @Desc : Weibo media storage
|
||||
import pathlib
|
||||
from typing import Dict
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ class XhsStoreFactory:
|
||||
|
||||
def get_video_url_arr(note_item: Dict) -> List:
|
||||
"""
|
||||
获取视频url数组
|
||||
Get video url array
|
||||
Args:
|
||||
note_item:
|
||||
|
||||
@@ -64,7 +64,7 @@ def get_video_url_arr(note_item: Dict) -> List:
|
||||
originVideoKey = note_item.get('video').get('consumer').get('origin_video_key')
|
||||
if originVideoKey == '':
|
||||
originVideoKey = note_item.get('video').get('consumer').get('originVideoKey')
|
||||
# 降级有水印
|
||||
# Fall back to the watermarked video stream
|
||||
if originVideoKey == '':
|
||||
videos = note_item.get('video').get('media').get('stream').get('h264')
|
||||
if type(videos).__name__ == 'list':
|
||||
@@ -77,7 +77,7 @@ def get_video_url_arr(note_item: Dict) -> List:
|
||||
|
||||
async def update_xhs_note(note_item: Dict):
|
||||
"""
|
||||
更新小红书笔记
|
||||
Update Xiaohongshu note
|
||||
Args:
|
||||
note_item:
|
||||
|
||||
@@ -97,26 +97,26 @@ async def update_xhs_note(note_item: Dict):
|
||||
video_url = ','.join(get_video_url_arr(note_item))
|
||||
|
||||
local_db_item = {
|
||||
"note_id": note_item.get("note_id"), # 帖子id
|
||||
"type": note_item.get("type"), # 帖子类型
|
||||
"title": note_item.get("title") or note_item.get("desc", "")[:255], # 帖子标题
|
||||
"desc": note_item.get("desc", ""), # 帖子描述
|
||||
"video_url": video_url, # 帖子视频url
|
||||
"time": note_item.get("time"), # 帖子发布时间
|
||||
"last_update_time": note_item.get("last_update_time", 0), # 帖子最后更新时间
|
||||
"user_id": user_info.get("user_id"), # 用户id
|
||||
"nickname": user_info.get("nickname"), # 用户昵称
|
||||
"avatar": user_info.get("avatar"), # 用户头像
|
||||
"liked_count": interact_info.get("liked_count"), # 点赞数
|
||||
"collected_count": interact_info.get("collected_count"), # 收藏数
|
||||
"comment_count": interact_info.get("comment_count"), # 评论数
|
||||
"share_count": interact_info.get("share_count"), # 分享数
|
||||
"ip_location": note_item.get("ip_location", ""), # ip地址
|
||||
"image_list": ','.join([img.get('url', '') for img in image_list]), # 图片url
|
||||
"tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']), # 标签
|
||||
"last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳(MediaCrawler程序生成的,主要用途在db存储的时候记录一条记录最新更新时间)
|
||||
"note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search", # 帖子url
|
||||
"source_keyword": source_keyword_var.get(), # 搜索关键词
|
||||
"note_id": note_item.get("note_id"), # Note ID
|
||||
"type": note_item.get("type"), # Note type
|
||||
"title": note_item.get("title") or note_item.get("desc", "")[:255], # Note title
|
||||
"desc": note_item.get("desc", ""), # Note description
|
||||
"video_url": video_url, # Note video url
|
||||
"time": note_item.get("time"), # Note publish time
|
||||
"last_update_time": note_item.get("last_update_time", 0), # Note last update time
|
||||
"user_id": user_info.get("user_id"), # User ID
|
||||
"nickname": user_info.get("nickname"), # User nickname
|
||||
"avatar": user_info.get("avatar"), # User avatar
|
||||
"liked_count": interact_info.get("liked_count"), # Like count
|
||||
"collected_count": interact_info.get("collected_count"), # Collection count
|
||||
"comment_count": interact_info.get("comment_count"), # Comment count
|
||||
"share_count": interact_info.get("share_count"), # Share count
|
||||
"ip_location": note_item.get("ip_location", ""), # IP location
|
||||
"image_list": ','.join([img.get('url', '') for img in image_list]), # Image URLs
|
||||
"tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']), # Tags
|
||||
"last_modify_ts": utils.get_current_timestamp(), # Last modification timestamp (Generated by MediaCrawler, mainly used to record the latest update time of a record in DB storage)
|
||||
"note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search", # Note URL
|
||||
"source_keyword": source_keyword_var.get(), # Search keyword
|
||||
"xsec_token": note_item.get("xsec_token"), # xsec_token
|
||||
}
|
||||
utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
|
||||
@@ -125,7 +125,7 @@ async def update_xhs_note(note_item: Dict):
|
||||
|
||||
async def batch_update_xhs_note_comments(note_id: str, comments: List[Dict]):
|
||||
"""
|
||||
批量更新小红书笔记评论
|
||||
Batch update Xiaohongshu note comments
|
||||
Args:
|
||||
note_id:
|
||||
comments:
|
||||
@@ -141,7 +141,7 @@ async def batch_update_xhs_note_comments(note_id: str, comments: List[Dict]):
|
||||
|
||||
async def update_xhs_note_comment(note_id: str, comment_item: Dict):
|
||||
"""
|
||||
更新小红书笔记评论
|
||||
Update Xiaohongshu note comment
|
||||
Args:
|
||||
note_id:
|
||||
comment_item:
|
||||
@@ -154,18 +154,18 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
|
||||
comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])]
|
||||
target_comment = comment_item.get("target_comment", {})
|
||||
local_db_item = {
|
||||
"comment_id": comment_id, # 评论id
|
||||
"create_time": comment_item.get("create_time"), # 评论时间
|
||||
"ip_location": comment_item.get("ip_location"), # ip地址
|
||||
"note_id": note_id, # 帖子id
|
||||
"content": comment_item.get("content"), # 评论内容
|
||||
"user_id": user_info.get("user_id"), # 用户id
|
||||
"nickname": user_info.get("nickname"), # 用户昵称
|
||||
"avatar": user_info.get("image"), # 用户头像
|
||||
"sub_comment_count": comment_item.get("sub_comment_count", 0), # 子评论数
|
||||
"pictures": ",".join(comment_pictures), # 评论图片
|
||||
"parent_comment_id": target_comment.get("id", 0), # 父评论id
|
||||
"last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳(MediaCrawler程序生成的,主要用途在db存储的时候记录一条记录最新更新时间)
|
||||
"comment_id": comment_id, # Comment ID
|
||||
"create_time": comment_item.get("create_time"), # Comment time
|
||||
"ip_location": comment_item.get("ip_location"), # IP location
|
||||
"note_id": note_id, # Note ID
|
||||
"content": comment_item.get("content"), # Comment content
|
||||
"user_id": user_info.get("user_id"), # User ID
|
||||
"nickname": user_info.get("nickname"), # User nickname
|
||||
"avatar": user_info.get("image"), # User avatar
|
||||
"sub_comment_count": comment_item.get("sub_comment_count", 0), # Sub-comment count
|
||||
"pictures": ",".join(comment_pictures), # Comment pictures
|
||||
"parent_comment_id": target_comment.get("id", 0), # Parent comment ID
|
||||
"last_modify_ts": utils.get_current_timestamp(), # Last modification timestamp (Generated by MediaCrawler, mainly used to record the latest update time of a record in DB storage)
|
||||
"like_count": comment_item.get("like_count", 0),
|
||||
}
|
||||
utils.logger.info(f"[store.xhs.update_xhs_note_comment] xhs note comment:{local_db_item}")
|
||||
@@ -174,7 +174,7 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
|
||||
|
||||
async def save_creator(user_id: str, creator: Dict):
|
||||
"""
|
||||
保存小红书创作者
|
||||
Save Xiaohongshu creator
|
||||
Args:
|
||||
user_id:
|
||||
creator:
|
||||
@@ -197,25 +197,25 @@ async def save_creator(user_id: str, creator: Dict):
|
||||
|
||||
def get_gender(gender):
|
||||
if gender == 1:
|
||||
return '女'
|
||||
return 'Female'
|
||||
elif gender == 0:
|
||||
return '男'
|
||||
return 'Male'
|
||||
else:
|
||||
return None
|
||||
|
||||
local_db_item = {
|
||||
'user_id': user_id, # 用户id
|
||||
'nickname': user_info.get('nickname'), # 昵称
|
||||
'gender': get_gender(user_info.get('gender')), # 性别
|
||||
'avatar': user_info.get('images'), # 头像
|
||||
'desc': user_info.get('desc'), # 个人描述
|
||||
'ip_location': user_info.get('ipLocation'), # ip地址
|
||||
'follows': follows, # 关注数
|
||||
'fans': fans, # 粉丝数
|
||||
'interaction': interaction, # 互动数
|
||||
'user_id': user_id, # User ID
|
||||
'nickname': user_info.get('nickname'), # Nickname
|
||||
'gender': get_gender(user_info.get('gender')), # Gender
|
||||
'avatar': user_info.get('images'), # Avatar
|
||||
'desc': user_info.get('desc'), # Personal description
|
||||
'ip_location': user_info.get('ipLocation'), # IP location
|
||||
'follows': follows, # Following count
|
||||
'fans': fans, # Fans count
|
||||
'interaction': interaction, # Interaction count
|
||||
'tag_list': json.dumps({tag.get('tagType'): tag.get('name')
|
||||
for tag in creator.get('tags')}, ensure_ascii=False), # 标签
|
||||
"last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳(MediaCrawler程序生成的,主要用途在db存储的时候记录一条记录最新更新时间)
|
||||
for tag in creator.get('tags')}, ensure_ascii=False), # Tags
|
||||
"last_modify_ts": utils.get_current_timestamp(), # Last modification timestamp (Generated by MediaCrawler, mainly used to record the latest update time of a record in DB storage)
|
||||
}
|
||||
utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
|
||||
await XhsStoreFactory.create_store().store_creator(local_db_item)
|
||||
@@ -223,7 +223,7 @@ async def save_creator(user_id: str, creator: Dict):
|
||||
|
||||
async def update_xhs_note_image(note_id, pic_content, extension_file_name):
|
||||
"""
|
||||
更新小红书笔记图片
|
||||
Update Xiaohongshu note image
|
||||
Args:
|
||||
note_id:
|
||||
pic_content:
|
||||
@@ -238,7 +238,7 @@ async def update_xhs_note_image(note_id, pic_content, extension_file_name):
|
||||
|
||||
async def update_xhs_note_video(note_id, video_content, extension_file_name):
|
||||
"""
|
||||
更新小红书笔记视频
|
||||
Update Xiaohongshu note video
|
||||
Args:
|
||||
note_id:
|
||||
video_content:
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
# @Author : persist1@126.com
|
||||
# @Time : 2025/9/5 19:34
|
||||
# @Desc : 小红书存储实现类
|
||||
# @Desc : Xiaohongshu storage implementation class
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
@@ -281,7 +281,7 @@ class XhsSqliteStoreImplement(XhsDbStoreImplement):
|
||||
|
||||
|
||||
class XhsMongoStoreImplement(AbstractStore):
|
||||
"""小红书MongoDB存储实现"""
|
||||
"""Xiaohongshu MongoDB storage implementation"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
@@ -289,9 +289,9 @@ class XhsMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
存储笔记内容到MongoDB
|
||||
Store note content to MongoDB
|
||||
Args:
|
||||
content_item: 笔记内容数据
|
||||
content_item: Note content data
|
||||
"""
|
||||
note_id = content_item.get("note_id")
|
||||
if not note_id:
|
||||
@@ -306,9 +306,9 @@ class XhsMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
存储评论到MongoDB
|
||||
Store comment to MongoDB
|
||||
Args:
|
||||
comment_item: 评论数据
|
||||
comment_item: Comment data
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
if not comment_id:
|
||||
@@ -323,9 +323,9 @@ class XhsMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator_item: Dict):
|
||||
"""
|
||||
存储创作者信息到MongoDB
|
||||
Store creator information to MongoDB
|
||||
Args:
|
||||
creator_item: 创作者数据
|
||||
creator_item: Creator data
|
||||
"""
|
||||
user_id = creator_item.get("user_id")
|
||||
if not user_id:
|
||||
@@ -340,7 +340,7 @@ class XhsMongoStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class XhsExcelStoreImplement:
|
||||
"""小红书Excel存储实现 - 全局单例"""
|
||||
"""Xiaohongshu Excel storage implementation - Global singleton"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
from store.excel_store_base import ExcelStoreBase
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : helloteemo
|
||||
# @Time : 2024/7/11 22:35
|
||||
# @Desc : 小红书媒体保存
|
||||
# @Desc : Xiaohongshu media storage
|
||||
import pathlib
|
||||
from typing import Dict
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ class ZhihuStoreFactory:
|
||||
|
||||
async def batch_update_zhihu_contents(contents: List[ZhihuContent]):
|
||||
"""
|
||||
批量更新知乎内容
|
||||
Batch update Zhihu contents
|
||||
Args:
|
||||
contents:
|
||||
|
||||
@@ -68,7 +68,7 @@ async def batch_update_zhihu_contents(contents: List[ZhihuContent]):
|
||||
|
||||
async def update_zhihu_content(content_item: ZhihuContent):
|
||||
"""
|
||||
更新知乎内容
|
||||
Update Zhihu content
|
||||
Args:
|
||||
content_item:
|
||||
|
||||
@@ -85,7 +85,7 @@ async def update_zhihu_content(content_item: ZhihuContent):
|
||||
|
||||
async def batch_update_zhihu_note_comments(comments: List[ZhihuComment]):
|
||||
"""
|
||||
批量更新知乎内容评论
|
||||
Batch update Zhihu content comments
|
||||
Args:
|
||||
comments:
|
||||
|
||||
@@ -101,7 +101,7 @@ async def batch_update_zhihu_note_comments(comments: List[ZhihuComment]):
|
||||
|
||||
async def update_zhihu_content_comment(comment_item: ZhihuComment):
|
||||
"""
|
||||
更新知乎内容评论
|
||||
Update Zhihu content comment
|
||||
Args:
|
||||
comment_item:
|
||||
|
||||
@@ -116,7 +116,7 @@ async def update_zhihu_content_comment(comment_item: ZhihuComment):
|
||||
|
||||
async def save_creator(creator: ZhihuCreator):
|
||||
"""
|
||||
保存知乎创作者信息
|
||||
Save Zhihu creator information
|
||||
Args:
|
||||
creator:
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : persist1@126.com
|
||||
# @Time : 2025/9/5 19:34
|
||||
# @Desc : 知乎存储实现类
|
||||
# @Desc : Zhihu storage implementation class
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
@@ -43,7 +43,7 @@ from tools.async_file_writer import AsyncFileWriter
|
||||
from database.mongodb_store_base import MongoDBStoreBase
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
|
||||
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
||||
"""Calculate the prefix sorting number for data save files, supporting writing to different files for each run
|
||||
Args:
|
||||
file_store_path;
|
||||
Returns:
|
||||
@@ -202,16 +202,16 @@ class ZhihuSqliteStoreImplement(ZhihuDbStoreImplement):
|
||||
|
||||
|
||||
class ZhihuMongoStoreImplement(AbstractStore):
|
||||
"""知乎MongoDB存储实现"""
|
||||
"""Zhihu MongoDB storage implementation"""
|
||||
|
||||
def __init__(self):
|
||||
self.mongo_store = MongoDBStoreBase(collection_prefix="zhihu")
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
存储内容到MongoDB
|
||||
Store content to MongoDB
|
||||
Args:
|
||||
content_item: 内容数据
|
||||
content_item: Content data
|
||||
"""
|
||||
note_id = content_item.get("note_id")
|
||||
if not note_id:
|
||||
@@ -226,9 +226,9 @@ class ZhihuMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
存储评论到MongoDB
|
||||
Store comment to MongoDB
|
||||
Args:
|
||||
comment_item: 评论数据
|
||||
comment_item: Comment data
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
if not comment_id:
|
||||
@@ -243,9 +243,9 @@ class ZhihuMongoStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator_item: Dict):
|
||||
"""
|
||||
存储创作者信息到MongoDB
|
||||
Store creator information to MongoDB
|
||||
Args:
|
||||
creator_item: 创作者数据
|
||||
creator_item: Creator data
|
||||
"""
|
||||
user_id = creator_item.get("user_id")
|
||||
if not user_id:
|
||||
@@ -260,7 +260,7 @@ class ZhihuMongoStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class ZhihuExcelStoreImplement:
|
||||
"""知乎Excel存储实现 - 全局单例"""
|
||||
"""Zhihu Excel storage implementation - Global singleton"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
from store.excel_store_base import ExcelStoreBase
|
||||
|
||||
@@ -18,32 +18,32 @@
|
||||
|
||||
# @Author : persist-1<persist1@126.com>
|
||||
# @Time : 2025/9/8 00:02
|
||||
# @Desc : 用于将orm映射模型(database/models.py)与两种数据库实际结构进行对比,并进行更新操作(连接数据库->结构比对->差异报告->交互式同步)
|
||||
# @Tips : 该脚本需要安装依赖'pymysql==1.1.0'
|
||||
# @Desc : Used to compare ORM mapping model (database/models.py) with actual database structure and perform update operations (connect database -> structure comparison -> difference report -> interactive synchronization)
|
||||
# @Tips : This script requires dependency 'pymysql==1.1.0'
|
||||
|
||||
import os
|
||||
import sys
|
||||
from sqlalchemy import create_engine, inspect as sqlalchemy_inspect
|
||||
from sqlalchemy.schema import MetaData
|
||||
|
||||
# 将项目根目录添加到 sys.path
|
||||
# Add project root directory to sys.path
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from config.db_config import mysql_db_config, sqlite_db_config
|
||||
from database.models import Base
|
||||
|
||||
def get_mysql_engine():
|
||||
"""创建并返回一个MySQL数据库引擎"""
|
||||
"""Create and return a MySQL database engine"""
|
||||
conn_str = f"mysql+pymysql://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}"
|
||||
return create_engine(conn_str)
|
||||
|
||||
def get_sqlite_engine():
|
||||
"""创建并返回一个SQLite数据库引擎"""
|
||||
"""Create and return a SQLite database engine"""
|
||||
conn_str = f"sqlite:///{sqlite_db_config['db_path']}"
|
||||
return create_engine(conn_str)
|
||||
|
||||
def get_db_schema(engine):
|
||||
"""获取数据库的当前表结构"""
|
||||
"""Get current table structure of the database"""
|
||||
inspector = sqlalchemy_inspect(engine)
|
||||
schema = {}
|
||||
for table_name in inspector.get_table_names():
|
||||
@@ -54,7 +54,7 @@ def get_db_schema(engine):
|
||||
return schema
|
||||
|
||||
def get_orm_schema():
|
||||
"""获取ORM模型的表结构"""
|
||||
"""Get table structure of ORM model"""
|
||||
schema = {}
|
||||
for table_name, table in Base.metadata.tables.items():
|
||||
columns = {}
|
||||
@@ -64,7 +64,7 @@ def get_orm_schema():
|
||||
return schema
|
||||
|
||||
def compare_schemas(db_schema, orm_schema):
|
||||
"""比较数据库结构和ORM模型结构,返回差异"""
|
||||
"""Compare database structure with ORM model structure and return differences"""
|
||||
db_tables = set(db_schema.keys())
|
||||
orm_tables = set(orm_schema.keys())
|
||||
|
||||
@@ -99,42 +99,42 @@ def compare_schemas(db_schema, orm_schema):
|
||||
}
|
||||
|
||||
def print_diff(db_name, diff):
|
||||
"""打印差异报告"""
|
||||
print(f"--- {db_name} 数据库结构差异报告 ---")
|
||||
"""Print difference report"""
|
||||
print(f"--- {db_name} Database Structure Difference Report ---")
|
||||
if not any(diff.values()):
|
||||
print("数据库结构与ORM模型一致,无需同步。")
|
||||
print("Database structure matches ORM model, no synchronization needed.")
|
||||
return
|
||||
|
||||
if diff.get("added_tables"):
|
||||
print("\n[+] 新增的表:")
|
||||
print("\n[+] Added tables:")
|
||||
for table in diff["added_tables"]:
|
||||
print(f" - {table}")
|
||||
|
||||
if diff.get("deleted_tables"):
|
||||
print("\n[-] 删除的表:")
|
||||
print("\n[-] Deleted tables:")
|
||||
for table in diff["deleted_tables"]:
|
||||
print(f" - {table}")
|
||||
|
||||
if diff.get("changed_tables"):
|
||||
print("\n[*] 变动的表:")
|
||||
print("\n[*] Changed tables:")
|
||||
for table, changes in diff["changed_tables"].items():
|
||||
print(f" - {table}:")
|
||||
if changes.get("added"):
|
||||
print(" [+] 新增字段:", ", ".join(changes["added"]))
|
||||
print(" [+] Added fields:", ", ".join(changes["added"]))
|
||||
if changes.get("deleted"):
|
||||
print(" [-] 删除字段:", ", ".join(changes["deleted"]))
|
||||
print(" [-] Deleted fields:", ", ".join(changes["deleted"]))
|
||||
if changes.get("modified"):
|
||||
print(" [*] 修改字段:")
|
||||
print(" [*] Modified fields:")
|
||||
for col, types in changes["modified"].items():
|
||||
print(f" - {col}: {types[0]} -> {types[1]}")
|
||||
print("--- 报告结束 ---")
|
||||
print("--- End of Report ---")
|
||||
|
||||
|
||||
def sync_database(engine, diff):
|
||||
"""将ORM模型同步到数据库"""
|
||||
"""Synchronize ORM model to database"""
|
||||
metadata = Base.metadata
|
||||
|
||||
# Alembic的上下文配置
|
||||
# Alembic context configuration
|
||||
from alembic.migration import MigrationContext
|
||||
from alembic.operations import Operations
|
||||
|
||||
@@ -142,109 +142,109 @@ def sync_database(engine, diff):
|
||||
ctx = MigrationContext.configure(conn)
|
||||
op = Operations(ctx)
|
||||
|
||||
# 处理删除的表
|
||||
# Handle deleted tables
|
||||
for table_name in diff['deleted_tables']:
|
||||
op.drop_table(table_name)
|
||||
print(f"已删除表: {table_name}")
|
||||
print(f"Deleted table: {table_name}")
|
||||
|
||||
# 处理新增的表
|
||||
# Handle added tables
|
||||
for table_name in diff['added_tables']:
|
||||
table = metadata.tables.get(table_name)
|
||||
if table is not None:
|
||||
table.create(engine)
|
||||
print(f"已创建表: {table_name}")
|
||||
print(f"Created table: {table_name}")
|
||||
|
||||
# 处理字段变更
|
||||
# Handle field changes
|
||||
for table_name, changes in diff['changed_tables'].items():
|
||||
# 删除字段
|
||||
# Delete fields
|
||||
for col_name in changes['deleted']:
|
||||
op.drop_column(table_name, col_name)
|
||||
print(f"在表 {table_name} 中已删除字段: {col_name}")
|
||||
# 新增字段
|
||||
print(f"Deleted field in table {table_name}: {col_name}")
|
||||
# Add fields
|
||||
for col_name in changes['added']:
|
||||
table = metadata.tables.get(table_name)
|
||||
column = table.columns.get(col_name)
|
||||
if column is not None:
|
||||
op.add_column(table_name, column)
|
||||
print(f"在表 {table_name} 中已新增字段: {col_name}")
|
||||
print(f"Added field in table {table_name}: {col_name}")
|
||||
|
||||
# 修改字段
|
||||
# Modify fields
|
||||
for col_name, types in changes['modified'].items():
|
||||
table = metadata.tables.get(table_name)
|
||||
if table is not None:
|
||||
column = table.columns.get(col_name)
|
||||
if column is not None:
|
||||
op.alter_column(table_name, col_name, type_=column.type)
|
||||
print(f"在表 {table_name} 中已修改字段: {col_name} (类型变为 {column.type})")
|
||||
print(f"Modified field in table {table_name}: {col_name} (type changed to {column.type})")
|
||||
|
||||
|
||||
def main():
|
||||
"""主函数"""
|
||||
"""Main function"""
|
||||
orm_schema = get_orm_schema()
|
||||
|
||||
# 处理 MySQL
|
||||
# Handle MySQL
|
||||
try:
|
||||
mysql_engine = get_mysql_engine()
|
||||
mysql_schema = get_db_schema(mysql_engine)
|
||||
mysql_diff = compare_schemas(mysql_schema, orm_schema)
|
||||
print_diff("MySQL", mysql_diff)
|
||||
if any(mysql_diff.values()):
|
||||
choice = input(">>> 需要人工确认:是否要将ORM模型同步到MySQL数据库? (y/N): ")
|
||||
choice = input(">>> Manual confirmation required: Synchronize ORM model to MySQL database? (y/N): ")
|
||||
if choice.lower() == 'y':
|
||||
sync_database(mysql_engine, mysql_diff)
|
||||
print("MySQL数据库同步完成。")
|
||||
print("MySQL database synchronization completed.")
|
||||
except Exception as e:
|
||||
print(f"处理MySQL时出错: {e}")
|
||||
print(f"Error processing MySQL: {e}")
|
||||
|
||||
|
||||
# 处理 SQLite
|
||||
# Handle SQLite
|
||||
try:
|
||||
sqlite_engine = get_sqlite_engine()
|
||||
sqlite_schema = get_db_schema(sqlite_engine)
|
||||
sqlite_diff = compare_schemas(sqlite_schema, orm_schema)
|
||||
print_diff("SQLite", sqlite_diff)
|
||||
if any(sqlite_diff.values()):
|
||||
choice = input(">>> 需要人工确认:是否要将ORM模型同步到SQLite数据库? (y/N): ")
|
||||
choice = input(">>> Manual confirmation required: Synchronize ORM model to SQLite database? (y/N): ")
|
||||
if choice.lower() == 'y':
|
||||
# 注意:SQLite不支持ALTER COLUMN来修改字段类型,这里简化处理
|
||||
print("警告:SQLite的字段修改支持有限,此脚本不会执行修改字段类型的操作。")
|
||||
# Note: SQLite does not support ALTER COLUMN to modify field types, simplified handling here
|
||||
print("Warning: SQLite has limited support for field modifications, this script will not execute field type modification operations.")
|
||||
sync_database(sqlite_engine, sqlite_diff)
|
||||
print("SQLite数据库同步完成。")
|
||||
print("SQLite database synchronization completed.")
|
||||
except Exception as e:
|
||||
print(f"处理SQLite时出错: {e}")
|
||||
print(f"Error processing SQLite: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
######################### Feedback example #########################
|
||||
# [*] 变动的表:
|
||||
# [*] Changed tables:
|
||||
# - kuaishou_video:
|
||||
# [*] 修改字段:
|
||||
# [*] Modified fields:
|
||||
# - user_id: TEXT -> VARCHAR(64)
|
||||
# - xhs_note_comment:
|
||||
# [*] 修改字段:
|
||||
# [*] Modified fields:
|
||||
# - comment_id: BIGINT -> VARCHAR(255)
|
||||
# - zhihu_content:
|
||||
# [*] 修改字段:
|
||||
# [*] Modified fields:
|
||||
# - created_time: BIGINT -> VARCHAR(32)
|
||||
# - content_id: BIGINT -> VARCHAR(64)
|
||||
# - zhihu_creator:
|
||||
# [*] 修改字段:
|
||||
# [*] Modified fields:
|
||||
# - user_id: INTEGER -> VARCHAR(64)
|
||||
# - tieba_note:
|
||||
# [*] 修改字段:
|
||||
# [*] Modified fields:
|
||||
# - publish_time: BIGINT -> VARCHAR(255)
|
||||
# - tieba_id: INTEGER -> VARCHAR(255)
|
||||
# - note_id: BIGINT -> VARCHAR(644)
|
||||
# --- 报告结束 ---
|
||||
# >>> 需要人工确认:是否要将ORM模型同步到MySQL数据库? (y/N): y
|
||||
# 在表 kuaishou_video 中已修改字段: user_id (类型变为 VARCHAR(64))
|
||||
# 在表 xhs_note_comment 中已修改字段: comment_id (类型变为 VARCHAR(255))
|
||||
# 在表 zhihu_content 中已修改字段: created_time (类型变为 VARCHAR(32))
|
||||
# 在表 zhihu_content 中已修改字段: content_id (类型变为 VARCHAR(64))
|
||||
# 在表 zhihu_creator 中已修改字段: user_id (类型变为 VARCHAR(64))
|
||||
# 在表 tieba_note 中已修改字段: publish_time (类型变为 VARCHAR(255))
|
||||
# 在表 tieba_note 中已修改字段: tieba_id (类型变为 VARCHAR(255))
|
||||
# 在表 tieba_note 中已修改字段: note_id (类型变为 VARCHAR(644))
|
||||
# MySQL数据库同步完成。
|
||||
# --- End of Report ---
|
||||
# >>> Manual confirmation required: Synchronize ORM model to MySQL database? (y/N): y
|
||||
# Modified field in table kuaishou_video: user_id (type changed to VARCHAR(64))
|
||||
# Modified field in table xhs_note_comment: comment_id (type changed to VARCHAR(255))
|
||||
# Modified field in table zhihu_content: created_time (type changed to VARCHAR(32))
|
||||
# Modified field in table zhihu_content: content_id (type changed to VARCHAR(64))
|
||||
# Modified field in table zhihu_creator: user_id (type changed to VARCHAR(64))
|
||||
# Modified field in table tieba_note: publish_time (type changed to VARCHAR(255))
|
||||
# Modified field in table tieba_note: tieba_id (type changed to VARCHAR(255))
|
||||
# Modified field in table tieba_note: note_id (type changed to VARCHAR(644))
|
||||
# MySQL database synchronization completed.
|
||||
|
||||
@@ -45,9 +45,9 @@ class TestExpiringLocalCache(unittest.TestCase):
|
||||
self.assertIsNone(self.cache.get('key'))
|
||||
|
||||
def test_clear(self):
|
||||
# 设置两个键值对,过期时间为11秒
|
||||
# Set two key-value pairs with expiration time of 11 seconds
|
||||
self.cache.set('key', 'value', 11)
|
||||
# 睡眠12秒,让cache类的定时任务执行一次
|
||||
# Sleep for 12 seconds to let the cache class's scheduled task execute once
|
||||
time.sleep(12)
|
||||
self.assertIsNone(self.cache.get('key'))
|
||||
|
||||
|
||||
@@ -38,14 +38,14 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
conn = MongoDBConnection()
|
||||
asyncio.run(conn._connect())
|
||||
cls.mongodb_available = True
|
||||
print("\n✓ MongoDB连接成功")
|
||||
print("\n✓ MongoDB connection successful")
|
||||
except Exception as e:
|
||||
cls.mongodb_available = False
|
||||
print(f"\n✗ MongoDB连接失败: {e}")
|
||||
print(f"\n✗ MongoDB connection failed: {e}")
|
||||
|
||||
def setUp(self):
|
||||
if not self.mongodb_available:
|
||||
self.skipTest("MongoDB不可用")
|
||||
self.skipTest("MongoDB not available")
|
||||
|
||||
MongoDBConnection._instance = None
|
||||
MongoDBConnection._client = None
|
||||
@@ -82,9 +82,9 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
|
||||
try:
|
||||
asyncio.run(cleanup())
|
||||
print("\n✓ 测试数据清理完成")
|
||||
print("\n✓ Test data cleanup completed")
|
||||
except Exception as e:
|
||||
print(f"\n✗ 清理测试数据时出错: {e}")
|
||||
print(f"\n✗ Error cleaning up test data: {e}")
|
||||
|
||||
def test_real_connection(self):
|
||||
async def test():
|
||||
@@ -106,8 +106,8 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
|
||||
test_data = {
|
||||
"note_id": "test_note_001",
|
||||
"title": "测试笔记",
|
||||
"content": "这是一条测试内容",
|
||||
"title": "Test Note",
|
||||
"content": "This is a test content",
|
||||
"created_at": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
@@ -125,7 +125,7 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
|
||||
self.assertIsNotNone(found)
|
||||
self.assertEqual(found["note_id"], "test_note_001")
|
||||
self.assertEqual(found["title"], "测试笔记")
|
||||
self.assertEqual(found["title"], "Test Note")
|
||||
|
||||
asyncio.run(test())
|
||||
|
||||
@@ -135,7 +135,7 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
|
||||
initial_data = {
|
||||
"note_id": "test_note_002",
|
||||
"title": "初始标题",
|
||||
"title": "Initial Title",
|
||||
"likes": 10
|
||||
}
|
||||
|
||||
@@ -147,7 +147,7 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
|
||||
updated_data = {
|
||||
"note_id": "test_note_002",
|
||||
"title": "更新后的标题",
|
||||
"title": "Updated Title",
|
||||
"likes": 100
|
||||
}
|
||||
|
||||
@@ -162,7 +162,7 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
{"note_id": "test_note_002"}
|
||||
)
|
||||
|
||||
self.assertEqual(found["title"], "更新后的标题")
|
||||
self.assertEqual(found["title"], "Updated Title")
|
||||
self.assertEqual(found["likes"], 100)
|
||||
|
||||
asyncio.run(test())
|
||||
@@ -176,7 +176,7 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
data = {
|
||||
"note_id": f"test_note_{i:03d}",
|
||||
"user_id": test_user_id,
|
||||
"title": f"测试笔记{i}",
|
||||
"title": f"Test Note {i}",
|
||||
"likes": i * 10
|
||||
}
|
||||
await store.save_or_update(
|
||||
@@ -226,9 +226,9 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
note_data = {
|
||||
"note_id": "xhs_test_001",
|
||||
"user_id": "user_001",
|
||||
"nickname": "测试用户",
|
||||
"title": "小红书测试笔记",
|
||||
"desc": "这是一条测试笔记",
|
||||
"nickname": "Test User",
|
||||
"title": "Xiaohongshu Test Note",
|
||||
"desc": "This is a test note",
|
||||
"type": "normal",
|
||||
"liked_count": "100",
|
||||
"collected_count": "50",
|
||||
@@ -240,16 +240,16 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
"comment_id": "comment_001",
|
||||
"note_id": "xhs_test_001",
|
||||
"user_id": "user_002",
|
||||
"nickname": "评论用户",
|
||||
"content": "这是一条测试评论",
|
||||
"nickname": "Comment User",
|
||||
"content": "This is a test comment",
|
||||
"like_count": "10"
|
||||
}
|
||||
await store.store_comment(comment_data)
|
||||
|
||||
creator_data = {
|
||||
"user_id": "user_001",
|
||||
"nickname": "测试创作者",
|
||||
"desc": "这是一个测试创作者",
|
||||
"nickname": "Test Creator",
|
||||
"desc": "This is a test creator",
|
||||
"fans": "1000",
|
||||
"follows": "100"
|
||||
}
|
||||
@@ -259,15 +259,15 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
|
||||
note = await mongo_store.find_one("contents", {"note_id": "xhs_test_001"})
|
||||
self.assertIsNotNone(note)
|
||||
self.assertEqual(note["title"], "小红书测试笔记")
|
||||
self.assertEqual(note["title"], "Xiaohongshu Test Note")
|
||||
|
||||
comment = await mongo_store.find_one("comments", {"comment_id": "comment_001"})
|
||||
self.assertIsNotNone(comment)
|
||||
self.assertEqual(comment["content"], "这是一条测试评论")
|
||||
self.assertEqual(comment["content"], "This is a test comment")
|
||||
|
||||
creator = await mongo_store.find_one("creators", {"user_id": "user_001"})
|
||||
self.assertIsNotNone(creator)
|
||||
self.assertEqual(creator["nickname"], "测试创作者")
|
||||
self.assertEqual(creator["nickname"], "Test Creator")
|
||||
|
||||
asyncio.run(test())
|
||||
|
||||
@@ -278,9 +278,9 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
video_data = {
|
||||
"aweme_id": "dy_test_001",
|
||||
"user_id": "user_001",
|
||||
"nickname": "测试用户",
|
||||
"title": "抖音测试视频",
|
||||
"desc": "这是一条测试视频",
|
||||
"nickname": "Test User",
|
||||
"title": "Douyin Test Video",
|
||||
"desc": "This is a test video",
|
||||
"liked_count": "1000",
|
||||
"comment_count": "100"
|
||||
}
|
||||
@@ -290,15 +290,15 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
"comment_id": "dy_comment_001",
|
||||
"aweme_id": "dy_test_001",
|
||||
"user_id": "user_002",
|
||||
"nickname": "评论用户",
|
||||
"content": "这是一条测试评论"
|
||||
"nickname": "Comment User",
|
||||
"content": "This is a test comment"
|
||||
}
|
||||
await store.store_comment(comment_data)
|
||||
|
||||
creator_data = {
|
||||
"user_id": "user_001",
|
||||
"nickname": "测试创作者",
|
||||
"desc": "这是一个测试创作者"
|
||||
"nickname": "Test Creator",
|
||||
"desc": "This is a test creator"
|
||||
}
|
||||
await store.store_creator(creator_data)
|
||||
|
||||
@@ -306,7 +306,7 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
|
||||
video = await mongo_store.find_one("contents", {"aweme_id": "dy_test_001"})
|
||||
self.assertIsNotNone(video)
|
||||
self.assertEqual(video["title"], "抖音测试视频")
|
||||
self.assertEqual(video["title"], "Douyin Test Video")
|
||||
|
||||
comment = await mongo_store.find_one("comments", {"comment_id": "dy_comment_001"})
|
||||
self.assertIsNotNone(comment)
|
||||
@@ -324,8 +324,8 @@ class TestMongoDBRealConnection(unittest.TestCase):
|
||||
for i in range(10):
|
||||
data = {
|
||||
"note_id": f"concurrent_note_{i:03d}",
|
||||
"title": f"并发测试笔记{i}",
|
||||
"content": f"内容{i}"
|
||||
"title": f"Concurrent Test Note {i}",
|
||||
"content": f"Content {i}"
|
||||
}
|
||||
task = store.save_or_update(
|
||||
"contents",
|
||||
@@ -362,9 +362,9 @@ def run_integration_tests():
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("="*70)
|
||||
print("MongoDB存储集成测试")
|
||||
print("MongoDB Storage Integration Test")
|
||||
print("="*70)
|
||||
print(f"MongoDB配置:")
|
||||
print(f"MongoDB Configuration:")
|
||||
print(f" Host: {db_config.MONGODB_HOST}")
|
||||
print(f" Port: {db_config.MONGODB_PORT}")
|
||||
print(f" Database: {db_config.MONGODB_DB_NAME}")
|
||||
@@ -373,12 +373,12 @@ if __name__ == "__main__":
|
||||
result = run_integration_tests()
|
||||
|
||||
print("\n" + "="*70)
|
||||
print("测试统计:")
|
||||
print(f"总测试数: {result.testsRun}")
|
||||
print(f"成功: {result.testsRun - len(result.failures) - len(result.errors)}")
|
||||
print(f"失败: {len(result.failures)}")
|
||||
print(f"错误: {len(result.errors)}")
|
||||
print(f"跳过: {len(result.skipped)}")
|
||||
print("Test Statistics:")
|
||||
print(f"Total tests: {result.testsRun}")
|
||||
print(f"Passed: {result.testsRun - len(result.failures) - len(result.errors)}")
|
||||
print(f"Failed: {len(result.failures)}")
|
||||
print(f"Errors: {len(result.errors)}")
|
||||
print(f"Skipped: {len(result.skipped)}")
|
||||
print("="*70)
|
||||
|
||||
sys.exit(0 if result.wasSuccessful() else 1)
|
||||
|
||||
@@ -37,109 +37,109 @@ class TestIpPool(IsolatedAsyncioTestCase):
|
||||
for _ in range(3):
|
||||
ip_proxy_info: IpInfoModel = await pool.get_proxy()
|
||||
print(ip_proxy_info)
|
||||
self.assertIsNotNone(ip_proxy_info.ip, msg="验证 ip 是否获取成功")
|
||||
self.assertIsNotNone(ip_proxy_info.ip, msg="Verify if IP is obtained successfully")
|
||||
|
||||
async def test_ip_expiration(self):
|
||||
"""测试IP代理过期检测功能"""
|
||||
print("\n=== 开始测试IP代理过期检测 ===")
|
||||
"""Test IP proxy expiration detection functionality"""
|
||||
print("\n=== Starting IP proxy expiration detection test ===")
|
||||
|
||||
# 1. 创建IP池并获取一个代理
|
||||
# 1. Create IP pool and get a proxy
|
||||
pool = await create_ip_pool(ip_pool_count=2, enable_validate_ip=True)
|
||||
ip_proxy_info: IpInfoModel = await pool.get_proxy()
|
||||
print(f"获取到的代理: {ip_proxy_info.ip}:{ip_proxy_info.port}")
|
||||
print(f"Obtained proxy: {ip_proxy_info.ip}:{ip_proxy_info.port}")
|
||||
|
||||
# 2. 测试未过期的情况
|
||||
# 2. Test non-expired case
|
||||
if ip_proxy_info.expired_time_ts:
|
||||
print(f"代理过期时间戳: {ip_proxy_info.expired_time_ts}")
|
||||
print(f"当前时间戳: {int(time.time())}")
|
||||
print(f"剩余有效时间: {ip_proxy_info.expired_time_ts - int(time.time())} 秒")
|
||||
print(f"Proxy expiration timestamp: {ip_proxy_info.expired_time_ts}")
|
||||
print(f"Current timestamp: {int(time.time())}")
|
||||
print(f"Remaining valid time: {ip_proxy_info.expired_time_ts - int(time.time())} seconds")
|
||||
|
||||
is_expired = ip_proxy_info.is_expired(buffer_seconds=30)
|
||||
print(f"代理是否过期(缓冲30秒): {is_expired}")
|
||||
self.assertFalse(is_expired, msg="新获取的IP应该未过期")
|
||||
print(f"Is proxy expired (30s buffer): {is_expired}")
|
||||
self.assertFalse(is_expired, msg="Newly obtained IP should not be expired")
|
||||
else:
|
||||
print("当前代理未设置过期时间,跳过过期检测")
|
||||
print("Current proxy does not have expiration time set, skipping expiration detection")
|
||||
|
||||
# 3. 测试即将过期的情况(设置为5分钟后过期)
|
||||
# 3. Test about to expire case (set to expire in 5 minutes)
|
||||
current_ts = int(time.time())
|
||||
five_minutes_later = current_ts + 300 # 5分钟 = 300秒
|
||||
five_minutes_later = current_ts + 300 # 5 minutes = 300 seconds
|
||||
ip_proxy_info.expired_time_ts = five_minutes_later
|
||||
print(f"\n设置代理过期时间为5分钟后: {five_minutes_later}")
|
||||
print(f"\nSet proxy expiration time to 5 minutes later: {five_minutes_later}")
|
||||
|
||||
# 不应该过期(缓冲30秒)
|
||||
# Should not be expired (30s buffer)
|
||||
is_expired_30s = ip_proxy_info.is_expired(buffer_seconds=30)
|
||||
print(f"代理是否过期(缓冲30秒): {is_expired_30s}")
|
||||
self.assertFalse(is_expired_30s, msg="5分钟后过期的IP,缓冲30秒不应该过期")
|
||||
print(f"Is proxy expired (30s buffer): {is_expired_30s}")
|
||||
self.assertFalse(is_expired_30s, msg="IP expiring in 5 minutes should not be expired with 30s buffer")
|
||||
|
||||
# 4. 测试已过期的情况(设置为已经过期)
|
||||
expired_ts = current_ts - 60 # 1分钟前已过期
|
||||
# 4. Test already expired case (set to already expired)
|
||||
expired_ts = current_ts - 60 # Expired 1 minute ago
|
||||
ip_proxy_info.expired_time_ts = expired_ts
|
||||
print(f"\n设置代理过期时间为1分钟前: {expired_ts}")
|
||||
print(f"\nSet proxy expiration time to 1 minute ago: {expired_ts}")
|
||||
|
||||
is_expired = ip_proxy_info.is_expired(buffer_seconds=30)
|
||||
print(f"代理是否过期(缓冲30秒): {is_expired}")
|
||||
self.assertTrue(is_expired, msg="已过期的IP应该被检测为过期")
|
||||
print(f"Is proxy expired (30s buffer): {is_expired}")
|
||||
self.assertTrue(is_expired, msg="Expired IP should be detected as expired")
|
||||
|
||||
# 5. 测试临界过期情况(29秒后过期,缓冲30秒应该认为已过期)
|
||||
# 5. Test critical expiration case (expires in 29s, should be considered expired with 30s buffer)
|
||||
almost_expired_ts = current_ts + 29
|
||||
ip_proxy_info.expired_time_ts = almost_expired_ts
|
||||
print(f"\n设置代理过期时间为29秒后: {almost_expired_ts}")
|
||||
print(f"\nSet proxy expiration time to 29 seconds later: {almost_expired_ts}")
|
||||
|
||||
is_expired_critical = ip_proxy_info.is_expired(buffer_seconds=30)
|
||||
print(f"代理是否过期(缓冲30秒): {is_expired_critical}")
|
||||
self.assertTrue(is_expired_critical, msg="29秒后过期的IP,缓冲30秒应该被认为已过期")
|
||||
print(f"Is proxy expired (30s buffer): {is_expired_critical}")
|
||||
self.assertTrue(is_expired_critical, msg="IP expiring in 29s should be considered expired with 30s buffer")
|
||||
|
||||
print("\n=== IP代理过期检测测试完成 ===")
|
||||
print("\n=== IP proxy expiration detection test completed ===")
|
||||
|
||||
async def test_proxy_pool_auto_refresh(self):
|
||||
"""测试代理池自动刷新过期代理的功能"""
|
||||
print("\n=== 开始测试代理池自动刷新功能 ===")
|
||||
"""Test proxy pool auto-refresh expired proxy functionality"""
|
||||
print("\n=== Starting proxy pool auto-refresh test ===")
|
||||
|
||||
# 1. 创建IP池
|
||||
# 1. Create IP pool
|
||||
pool = await create_ip_pool(ip_pool_count=2, enable_validate_ip=True)
|
||||
|
||||
# 2. 获取一个代理
|
||||
# 2. Get a proxy
|
||||
first_proxy = await pool.get_proxy()
|
||||
print(f"首次获取代理: {first_proxy.ip}:{first_proxy.port}")
|
||||
print(f"First proxy obtained: {first_proxy.ip}:{first_proxy.port}")
|
||||
|
||||
# 验证当前代理未过期
|
||||
# Verify current proxy is not expired
|
||||
is_expired = pool.is_current_proxy_expired(buffer_seconds=30)
|
||||
print(f"当前代理是否过期: {is_expired}")
|
||||
print(f"Is current proxy expired: {is_expired}")
|
||||
|
||||
if first_proxy.expired_time_ts:
|
||||
print(f"当前代理过期时间戳: {first_proxy.expired_time_ts}")
|
||||
print(f"Current proxy expiration timestamp: {first_proxy.expired_time_ts}")
|
||||
|
||||
# 3. 手动设置当前代理为已过期
|
||||
# 3. Manually set current proxy as expired
|
||||
current_ts = int(time.time())
|
||||
pool.current_proxy.expired_time_ts = current_ts - 60
|
||||
print(f"\n手动设置代理为已过期(1分钟前)")
|
||||
print(f"\nManually set proxy as expired (1 minute ago)")
|
||||
|
||||
# 4. 检测是否过期
|
||||
# 4. Check if expired
|
||||
is_expired_after = pool.is_current_proxy_expired(buffer_seconds=30)
|
||||
print(f"设置后代理是否过期: {is_expired_after}")
|
||||
self.assertTrue(is_expired_after, msg="手动设置过期后应该被检测为过期")
|
||||
print(f"Is proxy expired after setting: {is_expired_after}")
|
||||
self.assertTrue(is_expired_after, msg="Should be detected as expired after manual setting")
|
||||
|
||||
# 5. 使用 get_or_refresh_proxy 自动刷新
|
||||
print("\n调用 get_or_refresh_proxy 自动刷新过期代理...")
|
||||
# 5. Use get_or_refresh_proxy to auto-refresh
|
||||
print("\nCalling get_or_refresh_proxy to auto-refresh expired proxy...")
|
||||
refreshed_proxy = await pool.get_or_refresh_proxy(buffer_seconds=30)
|
||||
print(f"刷新后的代理: {refreshed_proxy.ip}:{refreshed_proxy.port}")
|
||||
print(f"Refreshed proxy: {refreshed_proxy.ip}:{refreshed_proxy.port}")
|
||||
|
||||
# 6. 验证新代理未过期
|
||||
# 6. Verify new proxy is not expired
|
||||
is_new_expired = pool.is_current_proxy_expired(buffer_seconds=30)
|
||||
print(f"新代理是否过期: {is_new_expired}")
|
||||
self.assertFalse(is_new_expired, msg="刷新后的新代理应该未过期")
|
||||
print(f"Is new proxy expired: {is_new_expired}")
|
||||
self.assertFalse(is_new_expired, msg="Refreshed new proxy should not be expired")
|
||||
|
||||
print("\n=== 代理池自动刷新测试完成 ===")
|
||||
print("\n=== Proxy pool auto-refresh test completed ===")
|
||||
else:
|
||||
print("当前代理未设置过期时间,跳过自动刷新测试")
|
||||
print("Current proxy does not have expiration time set, skipping auto-refresh test")
|
||||
|
||||
async def test_ip_expiration_standalone(self):
|
||||
"""独立测试IP过期检测功能(不依赖真实代理提供商)"""
|
||||
print("\n=== 开始独立测试IP代理过期检测功能 ===")
|
||||
"""Standalone test for IP expiration detection (does not depend on real proxy provider)"""
|
||||
print("\n=== Starting standalone IP proxy expiration detection test ===")
|
||||
|
||||
current_ts = int(time.time())
|
||||
|
||||
# 1. 测试未设置过期时间的IP(永不过期)
|
||||
# 1. Test IP without expiration time set (never expires)
|
||||
ip_no_expire = IpInfoModel(
|
||||
ip="192.168.1.1",
|
||||
port=8080,
|
||||
@@ -147,14 +147,14 @@ class TestIpPool(IsolatedAsyncioTestCase):
|
||||
password="test_pwd",
|
||||
expired_time_ts=None
|
||||
)
|
||||
print(f"\n测试1: IP未设置过期时间")
|
||||
print(f"\nTest 1: IP without expiration time set")
|
||||
is_expired = ip_no_expire.is_expired(buffer_seconds=30)
|
||||
print(f" 代理: {ip_no_expire.ip}:{ip_no_expire.port}")
|
||||
print(f" 过期时间: {ip_no_expire.expired_time_ts}")
|
||||
print(f" 是否过期: {is_expired}")
|
||||
self.assertFalse(is_expired, msg="未设置过期时间的IP应该永不过期")
|
||||
print(f" Proxy: {ip_no_expire.ip}:{ip_no_expire.port}")
|
||||
print(f" Expiration time: {ip_no_expire.expired_time_ts}")
|
||||
print(f" Is expired: {is_expired}")
|
||||
self.assertFalse(is_expired, msg="IP without expiration time should never expire")
|
||||
|
||||
# 2. 测试5分钟后过期的IP(应该未过期)
|
||||
# 2. Test IP expiring in 5 minutes (should not be expired)
|
||||
five_minutes_later = current_ts + 300
|
||||
ip_valid = IpInfoModel(
|
||||
ip="192.168.1.2",
|
||||
@@ -163,16 +163,16 @@ class TestIpPool(IsolatedAsyncioTestCase):
|
||||
password="test_pwd",
|
||||
expired_time_ts=five_minutes_later
|
||||
)
|
||||
print(f"\n测试2: IP将在5分钟后过期")
|
||||
print(f"\nTest 2: IP will expire in 5 minutes")
|
||||
is_expired = ip_valid.is_expired(buffer_seconds=30)
|
||||
print(f" 代理: {ip_valid.ip}:{ip_valid.port}")
|
||||
print(f" 当前时间戳: {current_ts}")
|
||||
print(f" 过期时间戳: {ip_valid.expired_time_ts}")
|
||||
print(f" 剩余时间: {ip_valid.expired_time_ts - current_ts} 秒")
|
||||
print(f" 是否过期(缓冲30秒): {is_expired}")
|
||||
self.assertFalse(is_expired, msg="5分钟后过期的IP,缓冲30秒不应该过期")
|
||||
print(f" Proxy: {ip_valid.ip}:{ip_valid.port}")
|
||||
print(f" Current timestamp: {current_ts}")
|
||||
print(f" Expiration timestamp: {ip_valid.expired_time_ts}")
|
||||
print(f" Remaining time: {ip_valid.expired_time_ts - current_ts} seconds")
|
||||
print(f" Is expired (30s buffer): {is_expired}")
|
||||
self.assertFalse(is_expired, msg="IP expiring in 5 minutes should not be expired with 30s buffer")
|
||||
|
||||
# 3. 测试已过期的IP
|
||||
# 3. Test already expired IP
|
||||
already_expired = current_ts - 60
|
||||
ip_expired = IpInfoModel(
|
||||
ip="192.168.1.3",
|
||||
@@ -181,16 +181,16 @@ class TestIpPool(IsolatedAsyncioTestCase):
|
||||
password="test_pwd",
|
||||
expired_time_ts=already_expired
|
||||
)
|
||||
print(f"\n测试3: IP已经过期(1分钟前)")
|
||||
print(f"\nTest 3: IP already expired (1 minute ago)")
|
||||
is_expired = ip_expired.is_expired(buffer_seconds=30)
|
||||
print(f" 代理: {ip_expired.ip}:{ip_expired.port}")
|
||||
print(f" 当前时间戳: {current_ts}")
|
||||
print(f" 过期时间戳: {ip_expired.expired_time_ts}")
|
||||
print(f" 已过期: {current_ts - ip_expired.expired_time_ts} 秒")
|
||||
print(f" 是否过期(缓冲30秒): {is_expired}")
|
||||
self.assertTrue(is_expired, msg="已过期的IP应该被检测为过期")
|
||||
print(f" Proxy: {ip_expired.ip}:{ip_expired.port}")
|
||||
print(f" Current timestamp: {current_ts}")
|
||||
print(f" Expiration timestamp: {ip_expired.expired_time_ts}")
|
||||
print(f" Expired for: {current_ts - ip_expired.expired_time_ts} seconds")
|
||||
print(f" Is expired (30s buffer): {is_expired}")
|
||||
self.assertTrue(is_expired, msg="Expired IP should be detected as expired")
|
||||
|
||||
# 4. 测试临界过期(29秒后过期,缓冲30秒应该认为已过期)
|
||||
# 4. Test critical expiration (expires in 29s, should be considered expired with 30s buffer)
|
||||
almost_expired = current_ts + 29
|
||||
ip_critical = IpInfoModel(
|
||||
ip="192.168.1.4",
|
||||
@@ -199,16 +199,16 @@ class TestIpPool(IsolatedAsyncioTestCase):
|
||||
password="test_pwd",
|
||||
expired_time_ts=almost_expired
|
||||
)
|
||||
print(f"\n测试4: IP即将过期(29秒后)")
|
||||
print(f"\nTest 4: IP about to expire (in 29 seconds)")
|
||||
is_expired = ip_critical.is_expired(buffer_seconds=30)
|
||||
print(f" 代理: {ip_critical.ip}:{ip_critical.port}")
|
||||
print(f" 当前时间戳: {current_ts}")
|
||||
print(f" 过期时间戳: {ip_critical.expired_time_ts}")
|
||||
print(f" 剩余时间: {ip_critical.expired_time_ts - current_ts} 秒")
|
||||
print(f" 是否过期(缓冲30秒): {is_expired}")
|
||||
self.assertTrue(is_expired, msg="29秒后过期的IP,缓冲30秒应该被认为已过期")
|
||||
print(f" Proxy: {ip_critical.ip}:{ip_critical.port}")
|
||||
print(f" Current timestamp: {current_ts}")
|
||||
print(f" Expiration timestamp: {ip_critical.expired_time_ts}")
|
||||
print(f" Remaining time: {ip_critical.expired_time_ts - current_ts} seconds")
|
||||
print(f" Is expired (30s buffer): {is_expired}")
|
||||
self.assertTrue(is_expired, msg="IP expiring in 29s should be considered expired with 30s buffer")
|
||||
|
||||
# 5. 测试31秒后过期(缓冲30秒应该未过期)
|
||||
# 5. Test expires in 31s (should not be expired with 30s buffer)
|
||||
just_safe = current_ts + 31
|
||||
ip_just_safe = IpInfoModel(
|
||||
ip="192.168.1.5",
|
||||
@@ -217,17 +217,17 @@ class TestIpPool(IsolatedAsyncioTestCase):
|
||||
password="test_pwd",
|
||||
expired_time_ts=just_safe
|
||||
)
|
||||
print(f"\n测试5: IP在安全范围内(31秒后过期)")
|
||||
print(f"\nTest 5: IP within safe range (expires in 31 seconds)")
|
||||
is_expired = ip_just_safe.is_expired(buffer_seconds=30)
|
||||
print(f" 代理: {ip_just_safe.ip}:{ip_just_safe.port}")
|
||||
print(f" 当前时间戳: {current_ts}")
|
||||
print(f" 过期时间戳: {ip_just_safe.expired_time_ts}")
|
||||
print(f" 剩余时间: {ip_just_safe.expired_time_ts - current_ts} 秒")
|
||||
print(f" 是否过期(缓冲30秒): {is_expired}")
|
||||
self.assertFalse(is_expired, msg="31秒后过期的IP,缓冲30秒应该未过期")
|
||||
print(f" Proxy: {ip_just_safe.ip}:{ip_just_safe.port}")
|
||||
print(f" Current timestamp: {current_ts}")
|
||||
print(f" Expiration timestamp: {ip_just_safe.expired_time_ts}")
|
||||
print(f" Remaining time: {ip_just_safe.expired_time_ts - current_ts} seconds")
|
||||
print(f" Is expired (30s buffer): {is_expired}")
|
||||
self.assertFalse(is_expired, msg="IP expiring in 31s should not be expired with 30s buffer")
|
||||
|
||||
# 6. 测试ProxyIpPool的过期检测
|
||||
print(f"\n测试6: ProxyIpPool的过期检测功能")
|
||||
# 6. Test ProxyIpPool expiration detection
|
||||
print(f"\nTest 6: ProxyIpPool expiration detection functionality")
|
||||
mock_provider = MagicMock()
|
||||
mock_provider.get_proxy = AsyncMock(return_value=[])
|
||||
|
||||
@@ -237,35 +237,35 @@ class TestIpPool(IsolatedAsyncioTestCase):
|
||||
ip_provider=mock_provider
|
||||
)
|
||||
|
||||
# 6.1 测试无当前代理时
|
||||
# 6.1 Test when there is no current proxy
|
||||
is_expired = pool.is_current_proxy_expired(buffer_seconds=30)
|
||||
print(f" 无当前代理时是否过期: {is_expired}")
|
||||
self.assertTrue(is_expired, msg="无当前代理时应该返回True")
|
||||
print(f" Is expired when no current proxy: {is_expired}")
|
||||
self.assertTrue(is_expired, msg="Should return True when there is no current proxy")
|
||||
|
||||
# 6.2 设置一个有效的代理
|
||||
# 6.2 Set a valid proxy
|
||||
valid_proxy = IpInfoModel(
|
||||
ip="192.168.1.6",
|
||||
port=8080,
|
||||
user="test_user",
|
||||
password="test_pwd",
|
||||
expired_time_ts=current_ts + 300 # 5分钟后过期
|
||||
expired_time_ts=current_ts + 300 # Expires in 5 minutes
|
||||
)
|
||||
pool.current_proxy = valid_proxy
|
||||
is_expired = pool.is_current_proxy_expired(buffer_seconds=30)
|
||||
print(f" 设置有效代理后是否过期: {is_expired}")
|
||||
self.assertFalse(is_expired, msg="有效的代理应该返回False")
|
||||
print(f" Is expired after setting valid proxy: {is_expired}")
|
||||
self.assertFalse(is_expired, msg="Valid proxy should return False")
|
||||
|
||||
# 6.3 设置一个已过期的代理
|
||||
# 6.3 Set an expired proxy
|
||||
expired_proxy = IpInfoModel(
|
||||
ip="192.168.1.7",
|
||||
port=8080,
|
||||
user="test_user",
|
||||
password="test_pwd",
|
||||
expired_time_ts=current_ts - 60 # 1分钟前已过期
|
||||
expired_time_ts=current_ts - 60 # Expired 1 minute ago
|
||||
)
|
||||
pool.current_proxy = expired_proxy
|
||||
is_expired = pool.is_current_proxy_expired(buffer_seconds=30)
|
||||
print(f" 设置已过期代理后是否过期: {is_expired}")
|
||||
self.assertTrue(is_expired, msg="已过期的代理应该返回True")
|
||||
print(f" Is expired after setting expired proxy: {is_expired}")
|
||||
self.assertTrue(is_expired, msg="Expired proxy should return True")
|
||||
|
||||
print("\n=== 独立IP代理过期检测测试完成 ===\n")
|
||||
print("\n=== Standalone IP proxy expiration detection test completed ===\n")
|
||||
|
||||
@@ -52,7 +52,7 @@ class TestRedisCache(unittest.TestCase):
|
||||
self.assertIn('key2', keys)
|
||||
|
||||
def tearDown(self):
|
||||
# self.redis_cache._redis_client.flushdb() # 清空redis数据库
|
||||
# self.redis_cache._redis_client.flushdb() # Clear redis database
|
||||
pass
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (c) 2025 relakkes@gmail.com
|
||||
#
|
||||
# This file is part of MediaCrawler project.
|
||||
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/tests/conftest.py
|
||||
# GitHub: https://github.com/NanmiCoder
|
||||
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||
#
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
"""
|
||||
Pytest configuration and shared fixtures
|
||||
"""
|
||||
@@ -24,23 +41,23 @@ def sample_xhs_note():
|
||||
return {
|
||||
"note_id": "test_note_123",
|
||||
"type": "normal",
|
||||
"title": "测试标题 Test Title",
|
||||
"desc": "这是一个测试描述 This is a test description",
|
||||
"title": "Test Title",
|
||||
"desc": "This is a test description",
|
||||
"video_url": "",
|
||||
"time": 1700000000,
|
||||
"last_update_time": 1700000000,
|
||||
"user_id": "user_123",
|
||||
"nickname": "测试用户",
|
||||
"nickname": "Test User",
|
||||
"avatar": "https://example.com/avatar.jpg",
|
||||
"liked_count": 100,
|
||||
"collected_count": 50,
|
||||
"comment_count": 25,
|
||||
"share_count": 10,
|
||||
"ip_location": "上海",
|
||||
"ip_location": "Shanghai",
|
||||
"image_list": "https://example.com/img1.jpg,https://example.com/img2.jpg",
|
||||
"tag_list": "测试,编程,Python",
|
||||
"tag_list": "test,programming,Python",
|
||||
"note_url": "https://www.xiaohongshu.com/explore/test_note_123",
|
||||
"source_keyword": "测试关键词",
|
||||
"source_keyword": "test keyword",
|
||||
"xsec_token": "test_token_123"
|
||||
}
|
||||
|
||||
@@ -51,11 +68,11 @@ def sample_xhs_comment():
|
||||
return {
|
||||
"comment_id": "comment_123",
|
||||
"create_time": 1700000000,
|
||||
"ip_location": "北京",
|
||||
"ip_location": "Beijing",
|
||||
"note_id": "test_note_123",
|
||||
"content": "这是一条测试评论 This is a test comment",
|
||||
"content": "This is a test comment",
|
||||
"user_id": "user_456",
|
||||
"nickname": "评论用户",
|
||||
"nickname": "Comment User",
|
||||
"avatar": "https://example.com/avatar2.jpg",
|
||||
"sub_comment_count": 5,
|
||||
"pictures": "",
|
||||
@@ -69,13 +86,13 @@ def sample_xhs_creator():
|
||||
"""Sample Xiaohongshu creator data for testing"""
|
||||
return {
|
||||
"user_id": "creator_123",
|
||||
"nickname": "创作者名称",
|
||||
"gender": "女",
|
||||
"nickname": "Creator Name",
|
||||
"gender": "Female",
|
||||
"avatar": "https://example.com/creator_avatar.jpg",
|
||||
"desc": "这是创作者简介",
|
||||
"ip_location": "广州",
|
||||
"desc": "This is the creator bio",
|
||||
"ip_location": "Guangzhou",
|
||||
"follows": 500,
|
||||
"fans": 10000,
|
||||
"interaction": 50000,
|
||||
"tag_list": '{"profession": "设计师", "interest": "摄影"}'
|
||||
"tag_list": '{"profession": "Designer", "interest": "Photography"}'
|
||||
}
|
||||
|
||||
@@ -41,7 +41,7 @@ def run(
|
||||
try:
|
||||
await asyncio.wait_for(asyncio.shield(app_cleanup()), timeout=cleanup_timeout_seconds)
|
||||
except asyncio.TimeoutError:
|
||||
print(f"[Main] 清理超时({cleanup_timeout_seconds}s),跳过剩余清理。")
|
||||
print(f"[Main] Cleanup timeout ({cleanup_timeout_seconds}s), skipping remaining cleanup.")
|
||||
|
||||
async def _cancel_remaining_tasks(timeout_seconds: float = 2.0) -> None:
|
||||
current = asyncio.current_task()
|
||||
@@ -70,11 +70,11 @@ def run(
|
||||
nonlocal shutdown_requested
|
||||
|
||||
if shutdown_requested:
|
||||
print("[Main] 再次收到中断信号,强制退出。")
|
||||
print("[Main] Received interrupt signal again, force exit.")
|
||||
os._exit(force_exit_code)
|
||||
|
||||
shutdown_requested = True
|
||||
print(f"\n[Main] 收到中断信号 {signum},正在退出(清理最多{cleanup_timeout_seconds}s)...")
|
||||
print(f"\n[Main] Received interrupt signal {signum}, exiting (cleanup max {cleanup_timeout_seconds}s)...")
|
||||
|
||||
if on_first_interrupt is not None:
|
||||
try:
|
||||
@@ -100,7 +100,7 @@ def run(
|
||||
try:
|
||||
await _cleanup_with_timeout()
|
||||
except Exception as e:
|
||||
print(f"[Main] 清理时出错: {e}")
|
||||
print(f"[Main] Error during cleanup: {e}")
|
||||
await _cancel_remaining_tasks()
|
||||
|
||||
if cancelled:
|
||||
|
||||
@@ -33,8 +33,8 @@ from tools import utils
|
||||
|
||||
class BrowserLauncher:
|
||||
"""
|
||||
浏览器启动器,用于检测和启动用户的Chrome/Edge浏览器
|
||||
支持Windows和macOS系统
|
||||
Browser launcher for detecting and launching user's Chrome/Edge browser
|
||||
Supports Windows and macOS systems
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -44,19 +44,19 @@ class BrowserLauncher:
|
||||
|
||||
def detect_browser_paths(self) -> List[str]:
|
||||
"""
|
||||
检测系统中可用的浏览器路径
|
||||
返回按优先级排序的浏览器路径列表
|
||||
Detect available browser paths in system
|
||||
Returns list of browser paths sorted by priority
|
||||
"""
|
||||
paths = []
|
||||
|
||||
if self.system == "Windows":
|
||||
# Windows下的常见Chrome/Edge安装路径
|
||||
# Common Chrome/Edge installation paths on Windows
|
||||
possible_paths = [
|
||||
# Chrome路径
|
||||
# Chrome paths
|
||||
os.path.expandvars(r"%PROGRAMFILES%\Google\Chrome\Application\chrome.exe"),
|
||||
os.path.expandvars(r"%PROGRAMFILES(X86)%\Google\Chrome\Application\chrome.exe"),
|
||||
os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome\Application\chrome.exe"),
|
||||
# Edge路径
|
||||
# Edge paths
|
||||
os.path.expandvars(r"%PROGRAMFILES%\Microsoft\Edge\Application\msedge.exe"),
|
||||
os.path.expandvars(r"%PROGRAMFILES(X86)%\Microsoft\Edge\Application\msedge.exe"),
|
||||
# Chrome Beta/Dev/Canary
|
||||
@@ -65,21 +65,21 @@ class BrowserLauncher:
|
||||
os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome SxS\Application\chrome.exe"),
|
||||
]
|
||||
elif self.system == "Darwin": # macOS
|
||||
# macOS下的常见Chrome/Edge安装路径
|
||||
# Common Chrome/Edge installation paths on macOS
|
||||
possible_paths = [
|
||||
# Chrome路径
|
||||
# Chrome paths
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||
"/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
|
||||
"/Applications/Google Chrome Dev.app/Contents/MacOS/Google Chrome Dev",
|
||||
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
|
||||
# Edge路径
|
||||
# Edge paths
|
||||
"/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
|
||||
"/Applications/Microsoft Edge Beta.app/Contents/MacOS/Microsoft Edge Beta",
|
||||
"/Applications/Microsoft Edge Dev.app/Contents/MacOS/Microsoft Edge Dev",
|
||||
"/Applications/Microsoft Edge Canary.app/Contents/MacOS/Microsoft Edge Canary",
|
||||
]
|
||||
else:
|
||||
# Linux等其他系统
|
||||
# Linux and other systems
|
||||
possible_paths = [
|
||||
"/usr/bin/google-chrome",
|
||||
"/usr/bin/google-chrome-stable",
|
||||
@@ -94,7 +94,7 @@ class BrowserLauncher:
|
||||
"/usr/bin/microsoft-edge-dev",
|
||||
]
|
||||
|
||||
# 检查路径是否存在且可执行
|
||||
# Check if path exists and is executable
|
||||
for path in possible_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
paths.append(path)
|
||||
@@ -103,10 +103,10 @@ class BrowserLauncher:
|
||||
|
||||
def find_available_port(self, start_port: int = 9222) -> int:
|
||||
"""
|
||||
查找可用的端口
|
||||
Find available port
|
||||
"""
|
||||
port = start_port
|
||||
while port < start_port + 100: # 最多尝试100个端口
|
||||
while port < start_port + 100: # Try up to 100 ports
|
||||
try:
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.bind(('localhost', port))
|
||||
@@ -114,18 +114,18 @@ class BrowserLauncher:
|
||||
except OSError:
|
||||
port += 1
|
||||
|
||||
raise RuntimeError(f"无法找到可用的端口,已尝试 {start_port} 到 {port-1}")
|
||||
raise RuntimeError(f"Cannot find available port, tried {start_port} to {port-1}")
|
||||
|
||||
def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
|
||||
user_data_dir: Optional[str] = None) -> subprocess.Popen:
|
||||
"""
|
||||
启动浏览器进程
|
||||
Launch browser process
|
||||
"""
|
||||
# 基本启动参数
|
||||
# Basic launch arguments
|
||||
args = [
|
||||
browser_path,
|
||||
f"--remote-debugging-port={debug_port}",
|
||||
"--remote-debugging-address=0.0.0.0", # 允许远程访问
|
||||
"--remote-debugging-address=0.0.0.0", # Allow remote access
|
||||
"--no-first-run",
|
||||
"--no-default-browser-check",
|
||||
"--disable-background-timer-throttling",
|
||||
@@ -136,36 +136,36 @@ class BrowserLauncher:
|
||||
"--disable-hang-monitor",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-sync",
|
||||
"--disable-dev-shm-usage", # 避免共享内存问题
|
||||
"--no-sandbox", # 在CDP模式下关闭沙箱
|
||||
# 🔥 关键反检测参数
|
||||
"--disable-blink-features=AutomationControlled", # 禁用自动化控制标记
|
||||
"--exclude-switches=enable-automation", # 排除自动化开关
|
||||
"--disable-infobars", # 禁用信息栏
|
||||
"--disable-dev-shm-usage", # Avoid shared memory issues
|
||||
"--no-sandbox", # Disable sandbox in CDP mode
|
||||
# Key anti-detection arguments
|
||||
"--disable-blink-features=AutomationControlled", # Disable automation control flag
|
||||
"--exclude-switches=enable-automation", # Exclude automation switch
|
||||
"--disable-infobars", # Disable info bars
|
||||
]
|
||||
|
||||
# 无头模式
|
||||
# Headless mode
|
||||
if headless:
|
||||
args.extend([
|
||||
"--headless=new", # 使用新的headless模式
|
||||
"--headless=new", # Use new headless mode
|
||||
"--disable-gpu",
|
||||
])
|
||||
else:
|
||||
# 非无头模式的额外参数
|
||||
# Extra arguments for non-headless mode
|
||||
args.extend([
|
||||
"--start-maximized", # 最大化窗口,更像真实用户
|
||||
"--start-maximized", # Maximize window, more like real user
|
||||
])
|
||||
|
||||
# 用户数据目录
|
||||
# User data directory
|
||||
if user_data_dir:
|
||||
args.append(f"--user-data-dir={user_data_dir}")
|
||||
|
||||
utils.logger.info(f"[BrowserLauncher] 启动浏览器: {browser_path}")
|
||||
utils.logger.info(f"[BrowserLauncher] 调试端口: {debug_port}")
|
||||
utils.logger.info(f"[BrowserLauncher] 无头模式: {headless}")
|
||||
utils.logger.info(f"[BrowserLauncher] Launching browser: {browser_path}")
|
||||
utils.logger.info(f"[BrowserLauncher] Debug port: {debug_port}")
|
||||
utils.logger.info(f"[BrowserLauncher] Headless mode: {headless}")
|
||||
|
||||
try:
|
||||
# 在Windows上,使用CREATE_NEW_PROCESS_GROUP避免Ctrl+C影响子进程
|
||||
# On Windows, use CREATE_NEW_PROCESS_GROUP to prevent Ctrl+C from affecting subprocess
|
||||
if self.system == "Windows":
|
||||
process = subprocess.Popen(
|
||||
args,
|
||||
@@ -178,21 +178,21 @@ class BrowserLauncher:
|
||||
args,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
preexec_fn=os.setsid # 创建新的进程组
|
||||
preexec_fn=os.setsid # Create new process group
|
||||
)
|
||||
|
||||
self.browser_process = process
|
||||
return process
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BrowserLauncher] 启动浏览器失败: {e}")
|
||||
utils.logger.error(f"[BrowserLauncher] Failed to launch browser: {e}")
|
||||
raise
|
||||
|
||||
def wait_for_browser_ready(self, debug_port: int, timeout: int = 30) -> bool:
|
||||
"""
|
||||
等待浏览器准备就绪
|
||||
Wait for browser to be ready
|
||||
"""
|
||||
utils.logger.info(f"[BrowserLauncher] 等待浏览器在端口 {debug_port} 上准备就绪...")
|
||||
utils.logger.info(f"[BrowserLauncher] Waiting for browser to be ready on port {debug_port}...")
|
||||
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < timeout:
|
||||
@@ -201,19 +201,19 @@ class BrowserLauncher:
|
||||
s.settimeout(1)
|
||||
result = s.connect_ex(('localhost', debug_port))
|
||||
if result == 0:
|
||||
utils.logger.info(f"[BrowserLauncher] 浏览器已在端口 {debug_port} 上准备就绪")
|
||||
utils.logger.info(f"[BrowserLauncher] Browser is ready on port {debug_port}")
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
time.sleep(0.5)
|
||||
|
||||
utils.logger.error(f"[BrowserLauncher] 浏览器在 {timeout} 秒内未能准备就绪")
|
||||
utils.logger.error(f"[BrowserLauncher] Browser failed to be ready within {timeout} seconds")
|
||||
return False
|
||||
|
||||
def get_browser_info(self, browser_path: str) -> Tuple[str, str]:
|
||||
"""
|
||||
获取浏览器信息(名称和版本)
|
||||
Get browser info (name and version)
|
||||
"""
|
||||
try:
|
||||
if "chrome" in browser_path.lower():
|
||||
@@ -225,7 +225,7 @@ class BrowserLauncher:
|
||||
else:
|
||||
name = "Unknown Browser"
|
||||
|
||||
# 尝试获取版本信息
|
||||
# Try to get version info
|
||||
try:
|
||||
result = subprocess.run([browser_path, "--version"],
|
||||
capture_output=True, text=True, timeout=5)
|
||||
@@ -240,7 +240,7 @@ class BrowserLauncher:
|
||||
|
||||
def cleanup(self):
|
||||
"""
|
||||
清理资源,关闭浏览器进程
|
||||
Cleanup resources, close browser process
|
||||
"""
|
||||
if not self.browser_process:
|
||||
return
|
||||
@@ -248,20 +248,20 @@ class BrowserLauncher:
|
||||
process = self.browser_process
|
||||
|
||||
if process.poll() is not None:
|
||||
utils.logger.info("[BrowserLauncher] 浏览器进程已退出,无需清理")
|
||||
utils.logger.info("[BrowserLauncher] Browser process already exited, no cleanup needed")
|
||||
self.browser_process = None
|
||||
return
|
||||
|
||||
utils.logger.info("[BrowserLauncher] 正在关闭浏览器进程...")
|
||||
utils.logger.info("[BrowserLauncher] Closing browser process...")
|
||||
|
||||
try:
|
||||
if self.system == "Windows":
|
||||
# 先尝试正常终止
|
||||
# First try normal termination
|
||||
process.terminate()
|
||||
try:
|
||||
process.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
utils.logger.warning("[BrowserLauncher] 正常终止超时,使用taskkill强制结束")
|
||||
utils.logger.warning("[BrowserLauncher] Normal termination timeout, using taskkill to force kill")
|
||||
subprocess.run(
|
||||
["taskkill", "/F", "/T", "/PID", str(process.pid)],
|
||||
capture_output=True,
|
||||
@@ -273,17 +273,17 @@ class BrowserLauncher:
|
||||
try:
|
||||
os.killpg(pgid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
utils.logger.info("[BrowserLauncher] 浏览器进程组不存在,可能已退出")
|
||||
utils.logger.info("[BrowserLauncher] Browser process group does not exist, may have exited")
|
||||
else:
|
||||
try:
|
||||
process.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
utils.logger.warning("[BrowserLauncher] 优雅关闭超时,发送SIGKILL")
|
||||
utils.logger.warning("[BrowserLauncher] Graceful shutdown timeout, sending SIGKILL")
|
||||
os.killpg(pgid, signal.SIGKILL)
|
||||
process.wait(timeout=5)
|
||||
|
||||
utils.logger.info("[BrowserLauncher] 浏览器进程已关闭")
|
||||
utils.logger.info("[BrowserLauncher] Browser process closed")
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[BrowserLauncher] 关闭浏览器进程时出错: {e}")
|
||||
utils.logger.warning(f"[BrowserLauncher] Error closing browser process: {e}")
|
||||
finally:
|
||||
self.browser_process = None
|
||||
|
||||
@@ -34,7 +34,7 @@ from tools import utils
|
||||
|
||||
class CDPBrowserManager:
|
||||
"""
|
||||
CDP浏览器管理器,负责启动和管理通过CDP连接的浏览器
|
||||
CDP browser manager, responsible for launching and managing browsers connected via CDP
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -46,27 +46,27 @@ class CDPBrowserManager:
|
||||
|
||||
def _register_cleanup_handlers(self):
|
||||
"""
|
||||
注册清理处理器,确保程序退出时清理浏览器进程
|
||||
Register cleanup handlers to ensure browser process cleanup on program exit
|
||||
"""
|
||||
if self._cleanup_registered:
|
||||
return
|
||||
|
||||
def sync_cleanup():
|
||||
"""同步清理函数,用于atexit"""
|
||||
"""Synchronous cleanup function for atexit"""
|
||||
if self.launcher and self.launcher.browser_process:
|
||||
utils.logger.info("[CDPBrowserManager] atexit: 清理浏览器进程")
|
||||
utils.logger.info("[CDPBrowserManager] atexit: Cleaning up browser process")
|
||||
self.launcher.cleanup()
|
||||
|
||||
# 注册atexit清理
|
||||
# Register atexit cleanup
|
||||
atexit.register(sync_cleanup)
|
||||
|
||||
# 注册信号处理器(仅在没有自定义处理器时注册,避免覆盖主入口的信号处理逻辑)
|
||||
# Register signal handlers (only when no custom handlers exist, to avoid overriding main entry signal handling logic)
|
||||
prev_sigint = signal.getsignal(signal.SIGINT)
|
||||
prev_sigterm = signal.getsignal(signal.SIGTERM)
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
"""信号处理器"""
|
||||
utils.logger.info(f"[CDPBrowserManager] 收到信号 {signum},清理浏览器进程")
|
||||
"""Signal handler"""
|
||||
utils.logger.info(f"[CDPBrowserManager] Received signal {signum}, cleaning up browser process")
|
||||
if self.launcher and self.launcher.browser_process:
|
||||
self.launcher.cleanup()
|
||||
|
||||
@@ -80,19 +80,19 @@ class CDPBrowserManager:
|
||||
install_sigint = prev_sigint in (signal.default_int_handler, signal.SIG_DFL)
|
||||
install_sigterm = prev_sigterm == signal.SIG_DFL
|
||||
|
||||
# 注册SIGINT (Ctrl+C) 和 SIGTERM
|
||||
# Register SIGINT (Ctrl+C) and SIGTERM
|
||||
if install_sigint:
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
else:
|
||||
utils.logger.info("[CDPBrowserManager] 已存在SIGINT处理器,跳过注册以避免覆盖")
|
||||
utils.logger.info("[CDPBrowserManager] SIGINT handler already exists, skipping registration to avoid override")
|
||||
|
||||
if install_sigterm:
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
else:
|
||||
utils.logger.info("[CDPBrowserManager] 已存在SIGTERM处理器,跳过注册以避免覆盖")
|
||||
utils.logger.info("[CDPBrowserManager] SIGTERM handler already exists, skipping registration to avoid override")
|
||||
|
||||
self._cleanup_registered = True
|
||||
utils.logger.info("[CDPBrowserManager] 清理处理器已注册")
|
||||
utils.logger.info("[CDPBrowserManager] Cleanup handlers registered")
|
||||
|
||||
async def launch_and_connect(
|
||||
self,
|
||||
@@ -102,25 +102,25 @@ class CDPBrowserManager:
|
||||
headless: bool = False,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
启动浏览器并通过CDP连接
|
||||
Launch browser and connect via CDP
|
||||
"""
|
||||
try:
|
||||
# 1. 检测浏览器路径
|
||||
# 1. Detect browser path
|
||||
browser_path = await self._get_browser_path()
|
||||
|
||||
# 2. 获取可用端口
|
||||
# 2. Get available port
|
||||
self.debug_port = self.launcher.find_available_port(config.CDP_DEBUG_PORT)
|
||||
|
||||
# 3. 启动浏览器
|
||||
# 3. Launch browser
|
||||
await self._launch_browser(browser_path, headless)
|
||||
|
||||
# 4. 注册清理处理器(确保异常退出时也能清理)
|
||||
# 4. Register cleanup handlers (ensure cleanup on abnormal exit)
|
||||
self._register_cleanup_handlers()
|
||||
|
||||
# 5. 通过CDP连接
|
||||
# 5. Connect via CDP
|
||||
await self._connect_via_cdp(playwright)
|
||||
|
||||
# 5. 创建浏览器上下文
|
||||
# 6. Create browser context
|
||||
browser_context = await self._create_browser_context(
|
||||
playwright_proxy, user_agent
|
||||
)
|
||||
@@ -129,68 +129,68 @@ class CDPBrowserManager:
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[CDPBrowserManager] CDP浏览器启动失败: {e}")
|
||||
utils.logger.error(f"[CDPBrowserManager] CDP browser launch failed: {e}")
|
||||
await self.cleanup()
|
||||
raise
|
||||
|
||||
async def _get_browser_path(self) -> str:
|
||||
"""
|
||||
获取浏览器路径
|
||||
Get browser path
|
||||
"""
|
||||
# 优先使用用户自定义路径
|
||||
# Prefer user-defined path
|
||||
if config.CUSTOM_BROWSER_PATH and os.path.isfile(config.CUSTOM_BROWSER_PATH):
|
||||
utils.logger.info(
|
||||
f"[CDPBrowserManager] 使用自定义浏览器路径: {config.CUSTOM_BROWSER_PATH}"
|
||||
f"[CDPBrowserManager] Using custom browser path: {config.CUSTOM_BROWSER_PATH}"
|
||||
)
|
||||
return config.CUSTOM_BROWSER_PATH
|
||||
|
||||
# 自动检测浏览器路径
|
||||
# Auto-detect browser path
|
||||
browser_paths = self.launcher.detect_browser_paths()
|
||||
|
||||
if not browser_paths:
|
||||
raise RuntimeError(
|
||||
"未找到可用的浏览器。请确保已安装Chrome或Edge浏览器,"
|
||||
"或在配置文件中设置CUSTOM_BROWSER_PATH指定浏览器路径。"
|
||||
"No available browser found. Please ensure Chrome or Edge browser is installed, "
|
||||
"or set CUSTOM_BROWSER_PATH in config file to specify browser path."
|
||||
)
|
||||
|
||||
browser_path = browser_paths[0] # 使用第一个找到的浏览器
|
||||
browser_path = browser_paths[0] # Use the first browser found
|
||||
browser_name, browser_version = self.launcher.get_browser_info(browser_path)
|
||||
|
||||
utils.logger.info(
|
||||
f"[CDPBrowserManager] 检测到浏览器: {browser_name} ({browser_version})"
|
||||
f"[CDPBrowserManager] Detected browser: {browser_name} ({browser_version})"
|
||||
)
|
||||
utils.logger.info(f"[CDPBrowserManager] 浏览器路径: {browser_path}")
|
||||
utils.logger.info(f"[CDPBrowserManager] Browser path: {browser_path}")
|
||||
|
||||
return browser_path
|
||||
|
||||
async def _test_cdp_connection(self, debug_port: int) -> bool:
|
||||
"""
|
||||
测试CDP连接是否可用
|
||||
Test if CDP connection is available
|
||||
"""
|
||||
try:
|
||||
# 简单的socket连接测试
|
||||
# Simple socket connection test
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.settimeout(5)
|
||||
result = s.connect_ex(("localhost", debug_port))
|
||||
if result == 0:
|
||||
utils.logger.info(
|
||||
f"[CDPBrowserManager] CDP端口 {debug_port} 可访问"
|
||||
f"[CDPBrowserManager] CDP port {debug_port} is accessible"
|
||||
)
|
||||
return True
|
||||
else:
|
||||
utils.logger.warning(
|
||||
f"[CDPBrowserManager] CDP端口 {debug_port} 不可访问"
|
||||
f"[CDPBrowserManager] CDP port {debug_port} is not accessible"
|
||||
)
|
||||
return False
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[CDPBrowserManager] CDP连接测试失败: {e}")
|
||||
utils.logger.warning(f"[CDPBrowserManager] CDP connection test failed: {e}")
|
||||
return False
|
||||
|
||||
async def _launch_browser(self, browser_path: str, headless: bool):
|
||||
"""
|
||||
启动浏览器进程
|
||||
Launch browser process
|
||||
"""
|
||||
# 设置用户数据目录(如果启用了保存登录状态)
|
||||
# Set user data directory (if save login state is enabled)
|
||||
user_data_dir = None
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
user_data_dir = os.path.join(
|
||||
@@ -199,9 +199,9 @@ class CDPBrowserManager:
|
||||
f"cdp_{config.USER_DATA_DIR % config.PLATFORM}",
|
||||
)
|
||||
os.makedirs(user_data_dir, exist_ok=True)
|
||||
utils.logger.info(f"[CDPBrowserManager] 用户数据目录: {user_data_dir}")
|
||||
utils.logger.info(f"[CDPBrowserManager] User data directory: {user_data_dir}")
|
||||
|
||||
# 启动浏览器
|
||||
# Launch browser
|
||||
self.launcher.browser_process = self.launcher.launch_browser(
|
||||
browser_path=browser_path,
|
||||
debug_port=self.debug_port,
|
||||
@@ -209,24 +209,24 @@ class CDPBrowserManager:
|
||||
user_data_dir=user_data_dir,
|
||||
)
|
||||
|
||||
# 等待浏览器准备就绪
|
||||
# Wait for browser to be ready
|
||||
if not self.launcher.wait_for_browser_ready(
|
||||
self.debug_port, config.BROWSER_LAUNCH_TIMEOUT
|
||||
):
|
||||
raise RuntimeError(f"浏览器在 {config.BROWSER_LAUNCH_TIMEOUT} 秒内未能启动")
|
||||
raise RuntimeError(f"Browser failed to start within {config.BROWSER_LAUNCH_TIMEOUT} seconds")
|
||||
|
||||
# 额外等待一秒让CDP服务完全启动
|
||||
# Extra wait for CDP service to fully start
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# 测试CDP连接
|
||||
# Test CDP connection
|
||||
if not await self._test_cdp_connection(self.debug_port):
|
||||
utils.logger.warning(
|
||||
"[CDPBrowserManager] CDP连接测试失败,但将继续尝试连接"
|
||||
"[CDPBrowserManager] CDP connection test failed, but will continue to try connecting"
|
||||
)
|
||||
|
||||
async def _get_browser_websocket_url(self, debug_port: int) -> str:
|
||||
"""
|
||||
获取浏览器的WebSocket连接URL
|
||||
Get browser WebSocket connection URL
|
||||
"""
|
||||
try:
|
||||
async with httpx.AsyncClient() as client:
|
||||
@@ -238,196 +238,196 @@ class CDPBrowserManager:
|
||||
ws_url = data.get("webSocketDebuggerUrl")
|
||||
if ws_url:
|
||||
utils.logger.info(
|
||||
f"[CDPBrowserManager] 获取到浏览器WebSocket URL: {ws_url}"
|
||||
f"[CDPBrowserManager] Got browser WebSocket URL: {ws_url}"
|
||||
)
|
||||
return ws_url
|
||||
else:
|
||||
raise RuntimeError("未找到webSocketDebuggerUrl")
|
||||
raise RuntimeError("webSocketDebuggerUrl not found")
|
||||
else:
|
||||
raise RuntimeError(f"HTTP {response.status_code}: {response.text}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[CDPBrowserManager] 获取WebSocket URL失败: {e}")
|
||||
utils.logger.error(f"[CDPBrowserManager] Failed to get WebSocket URL: {e}")
|
||||
raise
|
||||
|
||||
async def _connect_via_cdp(self, playwright: Playwright):
    """
    Connect to browser via CDP

    Looks up the browser's WebSocket debugger URL for ``self.debug_port``,
    connects Playwright to it and stores the result in ``self.browser``.

    Args:
        playwright: Playwright instance whose ``chromium.connect_over_cdp``
            is used to open the connection.

    Raises:
        RuntimeError: If the browser object reports it is not connected
            right after ``connect_over_cdp`` returns.
        Exception: Any failure is logged at error level and re-raised.
    """
    try:
        # Get correct WebSocket URL
        ws_url = await self._get_browser_websocket_url(self.debug_port)
        utils.logger.info(f"[CDPBrowserManager] Connecting to browser via CDP: {ws_url}")

        # Use Playwright's connectOverCDP method to connect
        self.browser = await playwright.chromium.connect_over_cdp(ws_url)

        if self.browser.is_connected():
            utils.logger.info("[CDPBrowserManager] Successfully connected to browser")
            utils.logger.info(
                f"[CDPBrowserManager] Browser contexts count: {len(self.browser.contexts)}"
            )
        else:
            raise RuntimeError("CDP connection failed")

    except Exception as e:
        utils.logger.error(f"[CDPBrowserManager] CDP connection failed: {e}")
        raise
|
||||
|
||||
async def _create_browser_context(
    self, playwright_proxy: Optional[Dict] = None, user_agent: Optional[str] = None
) -> BrowserContext:
    """
    Create or get browser context

    If the connected browser already exposes at least one context, the first
    one is reused as-is (``user_agent`` is not applied in that case).
    Otherwise a new context is created with a 1920x1080 viewport and
    downloads enabled.

    Args:
        playwright_proxy: Proxy settings. They are never passed to the new
            context; a warning is logged instead when this is truthy.
        user_agent: User agent applied only when a new context is created.

    Returns:
        The reused or newly created browser context.

    Raises:
        RuntimeError: If ``self.browser`` is not set.
    """
    if not self.browser:
        raise RuntimeError("Browser not connected")

    # Get existing context or create new context
    contexts = self.browser.contexts

    if contexts:
        # Use existing first context
        browser_context = contexts[0]
        utils.logger.info("[CDPBrowserManager] Using existing browser context")
    else:
        # Create new context
        context_options = {
            "viewport": {"width": 1920, "height": 1080},
            "accept_downloads": True,
        }

        # Set user agent
        if user_agent:
            context_options["user_agent"] = user_agent
            utils.logger.info(f"[CDPBrowserManager] Setting user agent: {user_agent}")

        # Note: Proxy settings may not work in CDP mode since browser is already launched
        if playwright_proxy:
            utils.logger.warning(
                "[CDPBrowserManager] Warning: Proxy settings may not work in CDP mode, "
                "recommend configuring system proxy or browser proxy extension before launching browser"
            )

        browser_context = await self.browser.new_context(**context_options)
        utils.logger.info("[CDPBrowserManager] Created new browser context")

    return browser_context
|
||||
|
||||
async def add_stealth_script(self, script_path: str = "libs/stealth.min.js"):
    """
    Add anti-detection script

    Registers ``script_path`` as an init script on ``self.browser_context``.
    Does nothing when there is no browser context or the file does not
    exist. Failures are logged as warnings and not raised (best effort).

    Args:
        script_path: Path of the JavaScript file to inject.
    """
    if self.browser_context and os.path.exists(script_path):
        try:
            await self.browser_context.add_init_script(path=script_path)
            utils.logger.info(
                f"[CDPBrowserManager] Added anti-detection script: {script_path}"
            )
        except Exception as e:
            utils.logger.warning(f"[CDPBrowserManager] Failed to add anti-detection script: {e}")
|
||||
|
||||
async def add_cookies(self, cookies: list):
    """
    Add cookies

    Adds ``cookies`` to ``self.browser_context``. Does nothing when there is
    no browser context. Failures are logged as warnings and not raised.

    Args:
        cookies: Cookie entries forwarded unchanged to
            ``browser_context.add_cookies``.
    """
    if self.browser_context:
        try:
            await self.browser_context.add_cookies(cookies)
            utils.logger.info(f"[CDPBrowserManager] Added {len(cookies)} cookies")
        except Exception as e:
            utils.logger.warning(f"[CDPBrowserManager] Failed to add cookies: {e}")
|
||||
|
||||
async def get_cookies(self) -> list:
    """
    Get current cookies

    Returns:
        The cookies reported by ``self.browser_context``. An empty list is
        returned when there is no browser context or when reading the
        cookies fails (the failure is logged as a warning).
    """
    if self.browser_context:
        try:
            cookies = await self.browser_context.cookies()
            return cookies
        except Exception as e:
            utils.logger.warning(f"[CDPBrowserManager] Failed to get cookies: {e}")
            return []
    return []
|
||||
|
||||
async def cleanup(self, force: bool = False):
    """
    Cleanup resources

    Runs three steps in order: close the browser context, close the browser
    connection, then (optionally) clean up the launched browser process.
    ``self.browser_context`` and ``self.browser`` are always reset to
    ``None`` once their step has run. Errors are logged, never raised.

    Args:
        force: Whether to force cleanup browser process (ignoring AUTO_CLOSE_BROWSER config)
    """
    try:
        # Close browser context
        if self.browser_context:
            try:
                # Original intent: if reading the page list fails, the context
                # is treated as already closed by the handler below.
                pages = self.browser_context.pages
                if pages is not None:
                    await self.browser_context.close()
                    utils.logger.info("[CDPBrowserManager] Browser context closed")
            except Exception as context_error:
                # Only log warning if error is not due to already being closed.
                # No inner bare ``except:`` here: it would swallow task
                # cancellation / KeyboardInterrupt and hide real failures.
                error_msg = str(context_error).lower()
                if "closed" not in error_msg and "disconnected" not in error_msg:
                    utils.logger.warning(
                        f"[CDPBrowserManager] Failed to close browser context: {context_error}"
                    )
                else:
                    utils.logger.debug(f"[CDPBrowserManager] Browser context already closed: {context_error}")
            finally:
                self.browser_context = None

        # Disconnect browser
        if self.browser:
            try:
                # Check if browser is still connected
                if self.browser.is_connected():
                    await self.browser.close()
                    utils.logger.info("[CDPBrowserManager] Browser connection disconnected")
                else:
                    utils.logger.debug("[CDPBrowserManager] Browser connection already disconnected")
            except Exception as browser_error:
                # Only log warning if error is not due to already being closed
                error_msg = str(browser_error).lower()
                if "closed" not in error_msg and "disconnected" not in error_msg:
                    utils.logger.warning(
                        f"[CDPBrowserManager] Failed to close browser connection: {browser_error}"
                    )
                else:
                    utils.logger.debug(f"[CDPBrowserManager] Browser connection already closed: {browser_error}")
            finally:
                self.browser = None

        # Close browser process
        # force=True means force close, ignoring AUTO_CLOSE_BROWSER config
        # Used for handling abnormal exit or manual cleanup
        if force or config.AUTO_CLOSE_BROWSER:
            if self.launcher and self.launcher.browser_process:
                self.launcher.cleanup()
            else:
                utils.logger.debug("[CDPBrowserManager] No browser process to cleanup")
        else:
            utils.logger.info(
                "[CDPBrowserManager] Browser process kept running (AUTO_CLOSE_BROWSER=False)"
            )

    except Exception as e:
        utils.logger.error(f"[CDPBrowserManager] Error during resource cleanup: {e}")
|
||||
|
||||
def is_connected(self) -> bool:
    """
    Check if connected to browser

    Returns:
        True only when ``self.browser`` is set and reports itself connected.
    """
    return self.browser is not None and self.browser.is_connected()
|
||||
|
||||
async def get_browser_info(self) -> Dict[str, Any]:
|
||||
"""
|
||||
获取浏览器信息
|
||||
Get browser info
|
||||
"""
|
||||
if not self.browser:
|
||||
return {}
|
||||
@@ -443,5 +443,5 @@ class CDPBrowserManager:
|
||||
"is_connected": self.is_connected(),
|
||||
}
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[CDPBrowserManager] 获取浏览器信息失败: {e}")
|
||||
utils.logger.warning(f"[CDPBrowserManager] Failed to get browser info: {e}")
|
||||
return {}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 12:53
|
||||
# @Desc : 爬虫相关的工具函数
|
||||
# @Desc : Crawler utility functions
|
||||
|
||||
import base64
|
||||
import json
|
||||
@@ -73,13 +73,13 @@ async def find_qrcode_img_from_canvas(page: Page, canvas_selector: str) -> str:
|
||||
|
||||
"""
|
||||
|
||||
# 等待Canvas元素加载完成
|
||||
# Wait for Canvas element to load
|
||||
canvas = await page.wait_for_selector(canvas_selector)
|
||||
|
||||
# 截取Canvas元素的截图
|
||||
# Take screenshot of Canvas element
|
||||
screenshot = await canvas.screenshot()
|
||||
|
||||
# 将截图转换为base64格式
|
||||
# Convert screenshot to base64 format
|
||||
base64_image = base64.b64encode(screenshot).decode('utf-8')
|
||||
return base64_image
|
||||
|
||||
@@ -185,7 +185,7 @@ def format_proxy_info(ip_proxy_info) -> Tuple[Optional[Dict], Optional[str]]:
|
||||
"username": ip_proxy_info.user,
|
||||
"password": ip_proxy_info.password,
|
||||
}
|
||||
# httpx 0.28.1 需要直接传入代理URL字符串,而不是字典
|
||||
# httpx 0.28.1 requires passing proxy URL string directly, not a dictionary
|
||||
if ip_proxy_info.user and ip_proxy_info.password:
|
||||
httpx_proxy = f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
||||
else:
|
||||
|
||||
@@ -17,13 +17,13 @@
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
"""
|
||||
文件头版权声明管理工具
|
||||
File header copyright declaration management tool
|
||||
|
||||
功能:
|
||||
- 自动为Python文件添加标准化的版权声明和免责声明
|
||||
- 智能检测现有文件头(编码声明、作者信息、免责声明等)
|
||||
- 在合适位置插入版权信息,不破坏现有内容
|
||||
- 支持批量处理和单文件检查模式
|
||||
Features:
|
||||
- Automatically add standardized copyright declaration and disclaimer to Python files
|
||||
- Intelligently detect existing file headers (encoding declaration, author info, disclaimer, etc.)
|
||||
- Insert copyright info at appropriate position without breaking existing content
|
||||
- Support batch processing and single file check mode
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -31,14 +31,14 @@ import re
|
||||
import sys
|
||||
from typing import List, Tuple
|
||||
|
||||
# Project configuration
REPO_URL = "https://github.com/NanmiCoder/MediaCrawler"
GITHUB_PROFILE = "https://github.com/NanmiCoder"
EMAIL = "relakkes@gmail.com"
COPYRIGHT_YEAR = "2025"
LICENSE_TYPE = "NON-COMMERCIAL LEARNING LICENSE 1.1"
|
||||
|
||||
# 免责声明标准文本
|
||||
# Disclaimer standard text
|
||||
DISCLAIMER = """# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
@@ -52,27 +52,27 @@ DISCLAIMER = """# 声明:本代码仅供学习和研究目的使用。使用
|
||||
|
||||
def get_file_relative_path(file_path: str, project_root: str) -> str:
    """
    Get file path relative to project root

    Args:
        file_path: File absolute path
        project_root: Project root directory

    Returns:
        Relative path string
    """
    return os.path.relpath(file_path, project_root)
|
||||
|
||||
|
||||
def generate_copyright_header(relative_path: str) -> str:
|
||||
"""
|
||||
生成版权声明头部
|
||||
Generate copyright declaration header
|
||||
|
||||
Args:
|
||||
relative_path: 文件相对于项目根目录的路径
|
||||
relative_path: File path relative to project root
|
||||
|
||||
Returns:
|
||||
格式化的版权声明字符串
|
||||
Formatted copyright declaration string
|
||||
"""
|
||||
file_url = f"{REPO_URL}/blob/main/{relative_path}"
|
||||
|
||||
@@ -89,53 +89,53 @@ def generate_copyright_header(relative_path: str) -> str:
|
||||
|
||||
def has_copyright_header(content: str) -> bool:
    """
    Check if file already contains copyright declaration

    Args:
        content: File content

    Returns:
        True if already contains copyright declaration
    """
    # Both markers must be present anywhere in the content
    return "Copyright (c)" in content and "MediaCrawler project" in content
|
||||
|
||||
|
||||
def has_disclaimer(content: str) -> bool:
    """
    Check if file already contains disclaimer

    Args:
        content: File content

    Returns:
        True if already contains disclaimer
    """
    # The marker is the opening sentence of the Chinese disclaimer text and
    # must stay untranslated so that existing headers are still recognised.
    return "声明:本代码仅供学习和研究目的使用" in content
|
||||
|
||||
|
||||
def find_insert_position(lines: List[str]) -> Tuple[int, bool]:
|
||||
"""
|
||||
找到插入版权声明的位置
|
||||
Find position to insert copyright declaration
|
||||
|
||||
Args:
|
||||
lines: 文件内容行列表
|
||||
lines: List of file content lines
|
||||
|
||||
Returns:
|
||||
(插入行号, 是否需要在前面添加编码声明)
|
||||
(insert line number, whether encoding declaration needs to be added)
|
||||
"""
|
||||
insert_pos = 0
|
||||
has_encoding = False
|
||||
|
||||
# 检查第一行是否是shebang
|
||||
# Check if first line is shebang
|
||||
if lines and lines[0].startswith('#!'):
|
||||
insert_pos = 1
|
||||
|
||||
# 检查编码声明(通常在第1或2行)
|
||||
# Check encoding declaration (usually on line 1 or 2)
|
||||
for i in range(insert_pos, min(insert_pos + 2, len(lines))):
|
||||
if i < len(lines):
|
||||
line = lines[i].strip()
|
||||
# 匹配 # -*- coding: utf-8 -*- 或 # coding: utf-8 等格式
|
||||
# Match # -*- coding: utf-8 -*- or # coding: utf-8 etc.
|
||||
if re.match(r'#.*coding[:=]\s*([-\w.]+)', line):
|
||||
has_encoding = True
|
||||
insert_pos = i + 1
|
||||
@@ -146,59 +146,59 @@ def find_insert_position(lines: List[str]) -> Tuple[int, bool]:
|
||||
|
||||
def process_file(file_path: str, project_root: str, dry_run: bool = False) -> Tuple[bool, str]:
|
||||
"""
|
||||
处理单个Python文件
|
||||
Process single Python file
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
project_root: 项目根目录
|
||||
dry_run: 仅检查不修改
|
||||
file_path: File path
|
||||
project_root: Project root directory
|
||||
dry_run: Check only without modification
|
||||
|
||||
Returns:
|
||||
(是否需要修改, 状态消息)
|
||||
(whether modification needed, status message)
|
||||
"""
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
lines = content.splitlines(keepends=True)
|
||||
|
||||
# 如果已经有版权声明,跳过
|
||||
# Skip if already has copyright header
|
||||
if has_copyright_header(content):
|
||||
return False, f"✓ Already has copyright header: {file_path}"
|
||||
|
||||
# 获取相对路径
|
||||
# Get relative path
|
||||
relative_path = get_file_relative_path(file_path, project_root)
|
||||
|
||||
# 生成版权声明
|
||||
# Generate copyright header
|
||||
copyright_header = generate_copyright_header(relative_path)
|
||||
|
||||
# 查找插入位置
|
||||
# Find insert position
|
||||
insert_pos, has_encoding = find_insert_position(lines)
|
||||
|
||||
# 构建新的文件内容
|
||||
# Build new file content
|
||||
new_lines = []
|
||||
|
||||
# 如果没有编码声明,添加一个
|
||||
# Add encoding declaration if not present
|
||||
if not has_encoding:
|
||||
new_lines.append("# -*- coding: utf-8 -*-\n")
|
||||
|
||||
# 添加前面的部分(shebang和编码声明)
|
||||
# Add front part (shebang and encoding declaration)
|
||||
new_lines.extend(lines[:insert_pos])
|
||||
|
||||
# 添加版权声明
|
||||
# Add copyright header
|
||||
new_lines.append(copyright_header + "\n")
|
||||
|
||||
# 如果文件没有免责声明,添加免责声明
|
||||
# Add disclaimer if file doesn't have one
|
||||
if not has_disclaimer(content):
|
||||
new_lines.append(DISCLAIMER + "\n")
|
||||
|
||||
# 添加一个空行(如果下一行不是空行)
|
||||
# Add empty line (if next line is not empty)
|
||||
if insert_pos < len(lines) and lines[insert_pos].strip():
|
||||
new_lines.append("\n")
|
||||
|
||||
# 添加剩余的内容
|
||||
# Add remaining content
|
||||
new_lines.extend(lines[insert_pos:])
|
||||
|
||||
# 如果不是dry run,写入文件
|
||||
# Write to file if not dry run
|
||||
if not dry_run:
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.writelines(new_lines)
|
||||
@@ -212,14 +212,14 @@ def process_file(file_path: str, project_root: str, dry_run: bool = False) -> Tu
|
||||
|
||||
def find_python_files(root_dir: str, exclude_patterns: List[str] = None) -> List[str]:
|
||||
"""
|
||||
查找所有Python文件
|
||||
Find all Python files
|
||||
|
||||
Args:
|
||||
root_dir: 根目录
|
||||
exclude_patterns: 排除的目录模式
|
||||
root_dir: Root directory
|
||||
exclude_patterns: Directory patterns to exclude
|
||||
|
||||
Returns:
|
||||
Python文件路径列表
|
||||
List of Python file paths
|
||||
"""
|
||||
if exclude_patterns is None:
|
||||
exclude_patterns = ['venv', '.venv', 'node_modules', '__pycache__', '.git', 'build', 'dist', '.eggs']
|
||||
@@ -227,7 +227,7 @@ def find_python_files(root_dir: str, exclude_patterns: List[str] = None) -> List
|
||||
python_files = []
|
||||
|
||||
for root, dirs, files in os.walk(root_dir):
|
||||
# 排除特定目录
|
||||
# Exclude specific directories
|
||||
dirs[:] = [d for d in dirs if d not in exclude_patterns and not d.startswith('.')]
|
||||
|
||||
for file in files:
|
||||
@@ -238,39 +238,39 @@ def find_python_files(root_dir: str, exclude_patterns: List[str] = None) -> List
|
||||
|
||||
|
||||
def main():
|
||||
"""主函数"""
|
||||
"""Main function"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Python文件头版权声明管理工具')
|
||||
parser.add_argument('files', nargs='*', help='要处理的文件路径(可选,默认处理所有.py文件)')
|
||||
parser.add_argument('--dry-run', action='store_true', help='仅检查不修改文件')
|
||||
parser.add_argument('--project-root', default=None, help='项目根目录(默认为当前目录)')
|
||||
parser.add_argument('--check', action='store_true', help='检查模式,如果有文件缺少版权声明则返回非零退出码')
|
||||
parser = argparse.ArgumentParser(description='Python file header copyright declaration management tool')
|
||||
parser.add_argument('files', nargs='*', help='File paths to process (optional, defaults to all .py files)')
|
||||
parser.add_argument('--dry-run', action='store_true', help='Check only without modifying files')
|
||||
parser.add_argument('--project-root', default=None, help='Project root directory (defaults to current directory)')
|
||||
parser.add_argument('--check', action='store_true', help='Check mode, return non-zero exit code if files missing copyright declaration')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# 确定项目根目录
|
||||
# Determine project root directory
|
||||
if args.project_root:
|
||||
project_root = os.path.abspath(args.project_root)
|
||||
else:
|
||||
# 假设此脚本在 tools/ 目录下
|
||||
# Assume this script is in tools/ directory
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
print(f"Project root: {project_root}")
|
||||
print(f"Mode: {'DRY RUN' if args.dry_run else 'UPDATE'}")
|
||||
print("-" * 60)
|
||||
|
||||
# 获取要处理的文件列表
|
||||
# Get list of files to process
|
||||
if args.files:
|
||||
# 处理指定的文件
|
||||
# Process specified files
|
||||
files_to_process = [os.path.abspath(f) for f in args.files if f.endswith('.py')]
|
||||
else:
|
||||
# 处理所有Python文件
|
||||
# Process all Python files
|
||||
files_to_process = find_python_files(project_root)
|
||||
|
||||
print(f"Found {len(files_to_process)} Python files to process\n")
|
||||
|
||||
# 处理文件
|
||||
# Process files
|
||||
updated_count = 0
|
||||
skipped_count = 0
|
||||
error_count = 0
|
||||
@@ -286,7 +286,7 @@ def main():
|
||||
else:
|
||||
skipped_count += 1
|
||||
|
||||
# 打印汇总
|
||||
# Print summary
|
||||
print("\n" + "=" * 60)
|
||||
print(f"Summary:")
|
||||
print(f" Total files: {len(files_to_process)}")
|
||||
@@ -295,7 +295,7 @@ def main():
|
||||
print(f" Errors: {error_count}")
|
||||
print("=" * 60)
|
||||
|
||||
# 如果是check模式且有文件需要更新,返回非零退出码
|
||||
# Return non-zero exit code in check mode if files need update
|
||||
if args.check and updated_count > 0:
|
||||
sys.exit(1)
|
||||
elif error_count > 0:
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 12:55
|
||||
# @Desc : 滑块相关的工具包
|
||||
# @Desc : Slider verification utility package
|
||||
import os
|
||||
from typing import List
|
||||
from urllib.parse import urlparse
|
||||
@@ -38,8 +38,8 @@ class Slide:
|
||||
"""
|
||||
def __init__(self, gap, bg, gap_size=None, bg_size=None, out=None):
|
||||
"""
|
||||
:param gap: 缺口图片链接或者url
|
||||
:param bg: 带缺口的图片链接或者url
|
||||
:param gap: Gap image path or url
|
||||
:param bg: Background image with gap path or url
|
||||
"""
|
||||
self.img_dir = os.path.join(os.getcwd(), 'temp_image')
|
||||
if not os.path.exists(self.img_dir):
|
||||
@@ -76,13 +76,13 @@ class Slide:
|
||||
cv2.imwrite(img_path, image)
|
||||
return img_path
|
||||
else:
|
||||
raise Exception(f"保存{img_type}图片失败")
|
||||
raise Exception(f"Failed to save {img_type} image")
|
||||
else:
|
||||
return img
|
||||
|
||||
@staticmethod
|
||||
def clear_white(img):
|
||||
"""清除图片的空白区域,这里主要清除滑块的空白"""
|
||||
"""Clear whitespace from image, mainly clearing slider whitespace"""
|
||||
img = cv2.imread(img)
|
||||
rows, cols, channel = img.shape
|
||||
min_x = 255
|
||||
@@ -108,16 +108,16 @@ class Slide:
|
||||
def template_match(self, tpl, target):
    """
    Locate ``tpl`` inside ``target`` and return the x-coordinate of the match.

    Side effects: draws the matched rectangle onto ``target`` in place and
    writes the annotated image to ``self.out``.

    :param tpl: Template image (the slider piece)
    :param target: Image to search in (the background with the gap)
    :return: x-coordinate of the top-left corner of the best match
    """
    th, tw = tpl.shape[:2]
    result = cv2.matchTemplate(target, tpl, cv2.TM_CCOEFF_NORMED)
    # Find min and max value positions in matrix; only the max location is
    # used as the best match for TM_CCOEFF_NORMED
    _, _, _, max_loc = cv2.minMaxLoc(result)
    tl = max_loc
    br = (tl[0] + tw, tl[1] + th)
    # Draw rectangle border to mark the matched area
    # target: target image
    # tl: rectangle top-left corner
    # br: rectangle bottom-right corner
    # (0,0,255): rectangle border color
    # 2: rectangle border thickness
    cv2.rectangle(target, tl, br, (0, 0, 255), 2)
    cv2.imwrite(self.out, target)
    return tl[0]
|
||||
@@ -138,39 +138,39 @@ class Slide:
|
||||
slide_pic = cv2.cvtColor(slide, cv2.COLOR_GRAY2RGB)
|
||||
back_pic = cv2.cvtColor(back, cv2.COLOR_GRAY2RGB)
|
||||
x = self.template_match(slide_pic, back_pic)
|
||||
# 输出横坐标, 即 滑块在图片上的位置
|
||||
# Output x-coordinate, i.e., slider position on image
|
||||
return x
|
||||
|
||||
|
||||
def get_track_simple(distance) -> List[int]:
    """
    Build a simple accelerate-then-decelerate movement track for a slider.

    Some detection checks movement speed - constant speed will be detected,
    so the track accelerates for the first 4/5 of the distance and
    decelerates afterwards.

    :param distance: Total distance to move
    :return: List of per-step moves, each rounded to an integer. Because of
        rounding and the final overshooting step, the sum is not guaranteed
        to equal ``distance``. Empty when ``distance`` is not positive.
    """
    # Movement track
    track: List[int] = []
    # Current displacement
    current = 0
    # Deceleration threshold
    mid = distance * 4 / 5
    # Time interval
    t = 0.2
    # Initial velocity
    v = 1

    while current < distance:
        if current < mid:
            # Acceleration = 4
            a = 4
        else:
            # Acceleration = -3
            a = -3
        v0 = v
        # Current velocity
        v = v0 + a * t  # type: ignore
        # Movement distance
        move = v0 * t + 1 / 2 * a * t * t
        # Current displacement
        current += move  # type: ignore
        # Add to track
        track.append(round(move))
    return track
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 12:52
|
||||
# @Desc : 时间相关的工具函数
|
||||
# @Desc : Time utility functions
|
||||
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
@@ -29,7 +29,7 @@ from datetime import datetime, timedelta, timezone
|
||||
|
||||
def get_current_timestamp() -> int:
    """
    Get current timestamp (13 digits): 1701493264496
    :return: Current Unix time in milliseconds
    """
    return int(time.time() * 1000)
|
||||
@@ -37,21 +37,21 @@ def get_current_timestamp() -> int:
|
||||
|
||||
def get_current_time() -> str:
    """
    Get current time: '2023-12-02 13:01:23'
    :return: Local time formatted as 'YYYY-MM-DD HH:MM:SS'
    """
    # Use explicit %H:%M:%S rather than %X: %X is locale-dependent and may
    # not produce the documented 24-hour layout under non-C locales.
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
|
||||
|
||||
def get_current_time_hour() -> str:
    """
    Get current time with hour: '2023-12-02-13'
    :return: Local time formatted as 'YYYY-MM-DD-HH'
    """
    return time.strftime('%Y-%m-%d-%H', time.localtime())
|
||||
|
||||
def get_current_date() -> str:
    """
    Get current date: '2023-12-02'
    :return: Local date formatted as 'YYYY-MM-DD'
    """
    return time.strftime('%Y-%m-%d', time.localtime())
|
||||
@@ -59,7 +59,7 @@ def get_current_date() -> str:
|
||||
|
||||
def get_time_str_from_unix_time(unixtime):
|
||||
"""
|
||||
unix 整数类型时间戳 ==> 字符串日期时间
|
||||
Unix integer timestamp ==> datetime string
|
||||
:param unixtime:
|
||||
:return:
|
||||
"""
|
||||
@@ -70,7 +70,7 @@ def get_time_str_from_unix_time(unixtime):
|
||||
|
||||
def get_date_str_from_unix_time(unixtime):
|
||||
"""
|
||||
unix 整数类型时间戳 ==> 字符串日期
|
||||
Unix integer timestamp ==> date string
|
||||
:param unixtime:
|
||||
:return:
|
||||
"""
|
||||
@@ -81,7 +81,7 @@ def get_date_str_from_unix_time(unixtime):
|
||||
|
||||
def get_unix_time_from_time_str(time_str):
|
||||
"""
|
||||
字符串时间 ==> unix 整数类型时间戳,精确到秒
|
||||
Time string ==> Unix integer timestamp, precise to seconds
|
||||
:param time_str:
|
||||
:return:
|
||||
"""
|
||||
@@ -99,34 +99,34 @@ def get_unix_timestamp():
|
||||
|
||||
|
||||
def rfc2822_to_china_datetime(rfc2822_time):
    """
    Parse a time string such as 'Sat Dec 23 17:12:54 +0800 2023' and return
    it as a timezone-aware datetime in the China timezone (UTC+8).

    :param rfc2822_time: Time string in '%a %b %d %H:%M:%S %z %Y' layout
    :return: Aware datetime converted to UTC+8
    """
    # Define RFC 2822 format
    rfc2822_format = "%a %b %d %H:%M:%S %z %Y"

    # Convert RFC 2822 time string to datetime object
    dt_object = datetime.strptime(rfc2822_time, rfc2822_format)

    # Convert datetime object timezone to China timezone
    dt_object_china = dt_object.astimezone(timezone(timedelta(hours=8)))
    return dt_object_china
|
||||
|
||||
|
||||
def rfc2822_to_timestamp(rfc2822_time):
    """
    Parse a time string such as 'Sat Dec 23 17:12:54 +0800 2023' and return
    the corresponding Unix timestamp in seconds.

    :param rfc2822_time: Time string in '%a %b %d %H:%M:%S %z %Y' layout
    :return: Unix timestamp (int, seconds) of the instant the string denotes
    """
    # Define RFC 2822 format
    rfc2822_format = "%a %b %d %H:%M:%S %z %Y"

    # Convert RFC 2822 time string to datetime object
    dt_object = datetime.strptime(rfc2822_time, rfc2822_format)

    # Convert datetime object to UTC time.
    # astimezone() converts the instant; replace(tzinfo=...) would only
    # relabel the wall-clock time and shift the result by the UTC offset.
    dt_utc = dt_object.astimezone(timezone.utc)

    # Calculate Unix timestamp from UTC time
    timestamp = int(dt_utc.timestamp())

    return timestamp
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Example usage
    _rfc2822_time = "Sat Dec 23 17:12:54 +0800 2023"
    print(rfc2822_to_china_datetime(_rfc2822_time))
|
||||
|
||||
@@ -36,7 +36,7 @@ def init_loging_config():
|
||||
_logger = logging.getLogger("MediaCrawler")
|
||||
_logger.setLevel(level)
|
||||
|
||||
# 关闭 httpx 的 INFO 日志
|
||||
# Disable httpx INFO level logs
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
|
||||
return _logger
|
||||
|
||||
Reference in New Issue
Block a user