feat: cdp browser cleanup after crawler done

This commit is contained in:
程序员阿江(Relakkes)
2025-11-17 12:21:53 +08:00
parent a1c5e07df8
commit e89a6d5781
2 changed files with 136 additions and 20 deletions

58
main.py
View File

@@ -11,6 +11,7 @@
import asyncio
import sys
import signal
from typing import Optional
import cmd_arg
@@ -87,15 +88,62 @@ async def main():
print(f"Error generating wordcloud: {e}")
def cleanup():
if crawler:
pass
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
asyncio.run(db.close())
async def async_cleanup():
"""异步清理函数用于处理CDP浏览器等异步资源"""
global crawler
if crawler:
# 检查并清理CDP浏览器
if hasattr(crawler, 'cdp_manager') and crawler.cdp_manager:
try:
await crawler.cdp_manager.cleanup(force=True) # 强制清理浏览器进程
except Exception as e:
# 只在非预期错误时打印
error_msg = str(e).lower()
if "closed" not in error_msg and "disconnected" not in error_msg:
print(f"[Main] 清理CDP浏览器时出错: {e}")
# 检查并清理标准浏览器上下文仅在非CDP模式下
elif hasattr(crawler, 'browser_context') and crawler.browser_context:
try:
# 检查上下文是否仍然打开
if hasattr(crawler.browser_context, 'pages'):
await crawler.browser_context.close()
except Exception as e:
# 只在非预期错误时打印
error_msg = str(e).lower()
if "closed" not in error_msg and "disconnected" not in error_msg:
print(f"[Main] 关闭浏览器上下文时出错: {e}")
# 关闭数据库连接
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
await db.close()
def cleanup():
"""同步清理函数"""
try:
# 创建新的事件循环来执行异步清理
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(async_cleanup())
loop.close()
except Exception as e:
print(f"[Main] 清理时出错: {e}")
def signal_handler(signum, _frame):
"""信号处理器处理Ctrl+C等中断信号"""
print(f"\n[Main] 收到中断信号 {signum},正在清理资源...")
cleanup()
sys.exit(0)
if __name__ == "__main__":
# 注册信号处理器
signal.signal(signal.SIGINT, signal_handler) # Ctrl+C
signal.signal(signal.SIGTERM, signal_handler) # 终止信号
try:
asyncio.get_event_loop().run_until_complete(main())
except KeyboardInterrupt:
print("\n[Main] 收到键盘中断,正在清理资源...")
finally:
cleanup()

View File

@@ -13,6 +13,8 @@ import os
import asyncio
import socket
import httpx
import signal
import atexit
from typing import Optional, Dict, Any
from playwright.async_api import Browser, BrowserContext, Playwright
@@ -31,6 +33,40 @@ class CDPBrowserManager:
self.browser: Optional[Browser] = None
self.browser_context: Optional[BrowserContext] = None
self.debug_port: Optional[int] = None
self._cleanup_registered = False
def _register_cleanup_handlers(self):
"""
注册清理处理器,确保程序退出时清理浏览器进程
"""
if self._cleanup_registered:
return
def sync_cleanup():
"""同步清理函数用于atexit"""
if self.launcher and self.launcher.browser_process:
utils.logger.info("[CDPBrowserManager] atexit: 清理浏览器进程")
self.launcher.cleanup()
# 注册atexit清理
atexit.register(sync_cleanup)
# 注册信号处理器
def signal_handler(signum, frame):
"""信号处理器"""
utils.logger.info(f"[CDPBrowserManager] 收到信号 {signum},清理浏览器进程")
if self.launcher and self.launcher.browser_process:
self.launcher.cleanup()
# 重新引发KeyboardInterrupt以便正常退出流程
if signum == signal.SIGINT:
raise KeyboardInterrupt
# 注册SIGINT (Ctrl+C) 和 SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
self._cleanup_registered = True
utils.logger.info("[CDPBrowserManager] 清理处理器已注册")
async def launch_and_connect(
self,
@@ -52,7 +88,10 @@ class CDPBrowserManager:
# 3. 启动浏览器
await self._launch_browser(browser_path, headless)
# 4. 通过CDP连接
# 4. 注册清理处理器(确保异常退出时也能清理)
self._register_cleanup_handlers()
# 5. 通过CDP连接
await self._connect_via_cdp(playwright)
# 5. 创建浏览器上下文
@@ -285,38 +324,67 @@ class CDPBrowserManager:
return []
return []
async def cleanup(self):
async def cleanup(self, force: bool = False):
"""
清理资源
Args:
force: 是否强制清理浏览器进程忽略AUTO_CLOSE_BROWSER配置
"""
try:
# 关闭浏览器上下文
if self.browser_context:
try:
await self.browser_context.close()
utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
# 检查上下文是否已经关闭
# 尝试获取页面列表,如果失败说明已经关闭
try:
pages = self.browser_context.pages
if pages is not None:
await self.browser_context.close()
utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
except:
utils.logger.debug("[CDPBrowserManager] 浏览器上下文已经被关闭")
except Exception as context_error:
utils.logger.warning(
f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}"
)
# 只在错误不是因为已关闭时才记录警告
error_msg = str(context_error).lower()
if "closed" not in error_msg and "disconnected" not in error_msg:
utils.logger.warning(
f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}"
)
else:
utils.logger.debug(f"[CDPBrowserManager] 浏览器上下文已关闭: {context_error}")
finally:
self.browser_context = None
# 断开浏览器连接
if self.browser:
try:
await self.browser.close()
utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
# 检查浏览器是否仍然连接
if self.browser.is_connected():
await self.browser.close()
utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
else:
utils.logger.debug("[CDPBrowserManager] 浏览器连接已经断开")
except Exception as browser_error:
utils.logger.warning(
f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}"
)
# 只在错误不是因为已关闭时才记录警告
error_msg = str(browser_error).lower()
if "closed" not in error_msg and "disconnected" not in error_msg:
utils.logger.warning(
f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}"
)
else:
utils.logger.debug(f"[CDPBrowserManager] 浏览器连接已关闭: {browser_error}")
finally:
self.browser = None
# 关闭浏览器进程(如果配置为自动关闭)
if config.AUTO_CLOSE_BROWSER:
self.launcher.cleanup()
# 关闭浏览器进程
# force=True 时强制关闭,忽略AUTO_CLOSE_BROWSER配置
# 这用于处理异常退出或手动清理的情况
if force or config.AUTO_CLOSE_BROWSER:
if self.launcher and self.launcher.browser_process:
self.launcher.cleanup()
else:
utils.logger.debug("[CDPBrowserManager] 没有需要清理的浏览器进程")
else:
utils.logger.info(
"[CDPBrowserManager] 浏览器进程保持运行AUTO_CLOSE_BROWSER=False"