feat: cdp browser cleanup after crawler done

This commit is contained in:
程序员阿江(Relakkes)
2025-11-17 12:21:53 +08:00
parent a1c5e07df8
commit e89a6d5781
2 changed files with 136 additions and 20 deletions

58
main.py
View File

@@ -11,6 +11,7 @@
import asyncio import asyncio
import sys import sys
import signal
from typing import Optional from typing import Optional
import cmd_arg import cmd_arg
@@ -87,15 +88,62 @@ async def main():
print(f"Error generating wordcloud: {e}") print(f"Error generating wordcloud: {e}")
async def async_cleanup():
    """Release the crawler's browser resources, then close the database.

    If the module-level ``crawler`` has a truthy ``cdp_manager``, its
    ``cleanup(force=True)`` is awaited. Otherwise, if it has a truthy
    ``browser_context``, that context is closed. Exceptions from either
    step are swallowed; they are printed only when their message contains
    neither "closed" nor "disconnected".

    Afterwards ``db.close()`` is awaited when ``config.SAVE_DATA_OPTION``
    is "db" or "sqlite".
    """
    global crawler

    def _is_benign(exc):
        # Errors mentioning "closed"/"disconnected" are treated as the
        # resource already being gone, so they are not reported.
        text = str(exc).lower()
        return "closed" in text or "disconnected" in text

    if crawler:
        if hasattr(crawler, 'cdp_manager') and crawler.cdp_manager:
            # CDP mode: force the manager to clean up the browser process.
            try:
                await crawler.cdp_manager.cleanup(force=True)
            except Exception as e:
                if not _is_benign(e):
                    print(f"[Main] 清理CDP浏览器时出错: {e}")
        elif hasattr(crawler, 'browser_context') and crawler.browser_context:
            # Non-CDP mode: close the standard browser context.
            try:
                if hasattr(crawler.browser_context, 'pages'):
                    await crawler.browser_context.close()
            except Exception as e:
                if not _is_benign(e):
                    print(f"[Main] 关闭浏览器上下文时出错: {e}")

    # Close the database connection for the DB-backed storage options.
    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
        await db.close()
def cleanup():
    """Synchronous wrapper that runs ``async_cleanup`` on a fresh event loop.

    A new loop is created, installed as the current loop, used to run
    ``async_cleanup()`` to completion, and then closed. The loop is closed
    in a ``finally`` clause so it is released even when running the
    coroutine fails (asyncio refuses to run a loop while another one is
    already running in the same thread, e.g. when this is invoked from a
    signal handler during the main loop).

    Any ``Exception`` raised along the way is printed, not propagated.
    """
    try:
        # A dedicated loop is used to drive the asynchronous cleanup.
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            loop.run_until_complete(async_cleanup())
        finally:
            # Always release the loop, even if the cleanup itself failed.
            loop.close()
    except Exception as e:
        print(f"[Main] 清理时出错: {e}")
def signal_handler(signum, _frame):
    """Signal callback: announce the signal, run ``cleanup()``, exit with 0.

    Args:
        signum: Number of the received signal (included in the message).
        _frame: Current stack frame supplied by the signal machinery; unused.

    Raises:
        SystemExit: Always, with exit status 0, after cleanup has run.
    """
    print(f"\n[Main] 收到中断信号 {signum},正在清理资源...")
    cleanup()
    raise SystemExit(0)
if __name__ == "__main__":
    # Install handlers for Ctrl+C (SIGINT) and termination (SIGTERM)
    # before the crawler starts.
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Create and install the loop explicitly: asyncio.get_event_loop() is
    # deprecated when there is no current loop and raises RuntimeError on
    # Python 3.14+.
    main_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(main_loop)
    try:
        main_loop.run_until_complete(main())
    except KeyboardInterrupt:
        print("\n[Main] 收到键盘中断,正在清理资源...")
    finally:
        # Runs on normal completion, on errors and on interrupts.
        cleanup()

View File

@@ -13,6 +13,8 @@ import os
import asyncio import asyncio
import socket import socket
import httpx import httpx
import signal
import atexit
from typing import Optional, Dict, Any from typing import Optional, Dict, Any
from playwright.async_api import Browser, BrowserContext, Playwright from playwright.async_api import Browser, BrowserContext, Playwright
@@ -31,6 +33,40 @@ class CDPBrowserManager:
self.browser: Optional[Browser] = None self.browser: Optional[Browser] = None
self.browser_context: Optional[BrowserContext] = None self.browser_context: Optional[BrowserContext] = None
self.debug_port: Optional[int] = None self.debug_port: Optional[int] = None
self._cleanup_registered = False
def _register_cleanup_handlers(self):
    """
    Register exit hooks that clean up the launched browser process.

    Installs an ``atexit`` callback and SIGINT/SIGTERM handlers. Each calls
    ``self.launcher.cleanup()`` when the launcher has a browser process.
    Does nothing if this instance has already registered its handlers.

    After cleaning up, the signal handler:
      * raises ``KeyboardInterrupt`` for SIGINT so the normal interrupt
        flow continues;
      * for any other signal, delegates to the handler that was installed
        before this one if it is callable, returns if that signal was
        being ignored, and otherwise raises ``SystemExit(128 + signum)``
        so the signal still terminates the process.

    ``signal.signal`` raises ``ValueError`` when called outside the main
    thread; in that case a warning is logged and only the ``atexit`` hook
    is active.
    """
    if self._cleanup_registered:
        return

    def sync_cleanup():
        """Synchronous cleanup callback used by atexit."""
        if self.launcher and self.launcher.browser_process:
            utils.logger.info("[CDPBrowserManager] atexit: 清理浏览器进程")
            self.launcher.cleanup()

    # Clean up on normal interpreter shutdown.
    atexit.register(sync_cleanup)

    # Handlers that were active before ours, keyed by signal number, so
    # the application's own termination policy is not silently replaced.
    previous_handlers = {}

    def signal_handler(signum, frame):
        """Clean up the browser process, then continue signal delivery."""
        utils.logger.info(f"[CDPBrowserManager] 收到信号 {signum},清理浏览器进程")
        if self.launcher and self.launcher.browser_process:
            self.launcher.cleanup()

        # Re-raise KeyboardInterrupt so the normal exit flow proceeds.
        if signum == signal.SIGINT:
            raise KeyboardInterrupt

        previous = previous_handlers.get(signum)
        if callable(previous):
            previous(signum, frame)
            return
        if previous == signal.SIG_IGN:
            # The signal was being ignored before; keep ignoring it.
            return
        # No earlier Python-level handler: terminate, as the signal intends.
        raise SystemExit(128 + signum)

    # Register for SIGINT (Ctrl+C) and SIGTERM.
    try:
        for sig in (signal.SIGINT, signal.SIGTERM):
            previous_handlers[sig] = signal.signal(sig, signal_handler)
    except ValueError as e:
        # signal.signal() only works in the main thread.
        utils.logger.warning(
            f"[CDPBrowserManager] 无法注册信号处理器(非主线程): {e}"
        )

    self._cleanup_registered = True
    utils.logger.info("[CDPBrowserManager] 清理处理器已注册")
async def launch_and_connect( async def launch_and_connect(
self, self,
@@ -52,7 +88,10 @@ class CDPBrowserManager:
# 3. 启动浏览器 # 3. 启动浏览器
await self._launch_browser(browser_path, headless) await self._launch_browser(browser_path, headless)
# 4. 通过CDP连接 # 4. 注册清理处理器(确保异常退出时也能清理)
self._register_cleanup_handlers()
# 5. 通过CDP连接
await self._connect_via_cdp(playwright) await self._connect_via_cdp(playwright)
# 5. 创建浏览器上下文 # 5. 创建浏览器上下文
@@ -285,38 +324,67 @@ class CDPBrowserManager:
return [] return []
return [] return []
async def cleanup(self): async def cleanup(self, force: bool = False):
""" """
清理资源 清理资源
Args:
force: 是否强制清理浏览器进程忽略AUTO_CLOSE_BROWSER配置
""" """
try: try:
# 关闭浏览器上下文 # 关闭浏览器上下文
if self.browser_context: if self.browser_context:
try: try:
await self.browser_context.close() # 检查上下文是否已经关闭
utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭") # 尝试获取页面列表,如果失败说明已经关闭
try:
pages = self.browser_context.pages
if pages is not None:
await self.browser_context.close()
utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
except:
utils.logger.debug("[CDPBrowserManager] 浏览器上下文已经被关闭")
except Exception as context_error: except Exception as context_error:
utils.logger.warning( # 只在错误不是因为已关闭时才记录警告
f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}" error_msg = str(context_error).lower()
) if "closed" not in error_msg and "disconnected" not in error_msg:
utils.logger.warning(
f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}"
)
else:
utils.logger.debug(f"[CDPBrowserManager] 浏览器上下文已关闭: {context_error}")
finally: finally:
self.browser_context = None self.browser_context = None
# 断开浏览器连接 # 断开浏览器连接
if self.browser: if self.browser:
try: try:
await self.browser.close() # 检查浏览器是否仍然连接
utils.logger.info("[CDPBrowserManager] 浏览器连接已断开") if self.browser.is_connected():
await self.browser.close()
utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
else:
utils.logger.debug("[CDPBrowserManager] 浏览器连接已经断开")
except Exception as browser_error: except Exception as browser_error:
utils.logger.warning( # 只在错误不是因为已关闭时才记录警告
f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}" error_msg = str(browser_error).lower()
) if "closed" not in error_msg and "disconnected" not in error_msg:
utils.logger.warning(
f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}"
)
else:
utils.logger.debug(f"[CDPBrowserManager] 浏览器连接已关闭: {browser_error}")
finally: finally:
self.browser = None self.browser = None
# 关闭浏览器进程(如果配置为自动关闭) # 关闭浏览器进程
if config.AUTO_CLOSE_BROWSER: # force=True 时强制关闭,忽略AUTO_CLOSE_BROWSER配置
self.launcher.cleanup() # 这用于处理异常退出或手动清理的情况
if force or config.AUTO_CLOSE_BROWSER:
if self.launcher and self.launcher.browser_process:
self.launcher.cleanup()
else:
utils.logger.debug("[CDPBrowserManager] 没有需要清理的浏览器进程")
else: else:
utils.logger.info( utils.logger.info(
"[CDPBrowserManager] 浏览器进程保持运行AUTO_CLOSE_BROWSER=False" "[CDPBrowserManager] 浏览器进程保持运行AUTO_CLOSE_BROWSER=False"