# Set once cleanup() has actually run, so that a second call (for example the
# signal handler followed by the ``finally`` clause below) is a no-op.
_cleanup_done = False


def _is_benign_close_error(exc):
    """Return True when *exc* only reports an already closed/disconnected target."""
    message = str(exc).lower()
    return "closed" in message or "disconnected" in message


async def async_cleanup():
    """Release the crawler's browser resources and the database connection.

    When the crawler has a truthy ``cdp_manager`` it is cleaned up with
    ``force=True``; otherwise a truthy ``browser_context`` is closed.  Errors
    whose message contains "closed" or "disconnected" are suppressed, any
    other error is printed.  The database is closed afterwards when
    ``config.SAVE_DATA_OPTION`` is ``"db"`` or ``"sqlite"``.
    """
    if crawler:
        if getattr(crawler, "cdp_manager", None):
            try:
                # force=True asks the manager to terminate the browser process
                # even when AUTO_CLOSE_BROWSER is disabled.
                await crawler.cdp_manager.cleanup(force=True)
            except Exception as e:
                if not _is_benign_close_error(e):
                    print(f"[Main] 清理CDP浏览器时出错: {e}")
        # The plain browser context is only handled in non-CDP mode.
        elif getattr(crawler, "browser_context", None):
            try:
                if hasattr(crawler.browser_context, "pages"):
                    await crawler.browser_context.close()
            except Exception as e:
                if not _is_benign_close_error(e):
                    print(f"[Main] 关闭浏览器上下文时出错: {e}")

    # Close the database connection.
    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
        await db.close()


def cleanup():
    """Run :func:`async_cleanup` to completion from synchronous code.

    The cleanup runs at most once per process.  If an event loop is currently
    running in this thread (which is the case when this is reached from a
    signal handler that interrupted the main loop) a second loop cannot be
    driven, so the call returns without doing anything and without marking the
    cleanup as done; a later call made after the loop has stopped performs it.
    Exceptions raised by the cleanup are printed, never propagated.
    """
    global _cleanup_done
    if _cleanup_done:
        return

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        pass  # No loop is running in this thread: safe to drive our own.
    else:
        # run_until_complete() would raise "Cannot run the event loop while
        # another loop is running"; defer to the caller's ``finally``.
        return

    _cleanup_done = True
    # A fresh loop is used because the loop that ran main() may be unusable.
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        loop.run_until_complete(async_cleanup())
    except Exception as e:
        print(f"[Main] 清理时出错: {e}")
    finally:
        # Always release the temporary loop, even when the cleanup failed.
        loop.close()


def signal_handler(signum, _frame):
    """Handle SIGINT/SIGTERM: attempt cleanup, then exit with status 0.

    ``sys.exit`` raises ``SystemExit``, which unwinds through the entry
    point's ``finally`` clause; that is where the cleanup really happens when
    the signal arrived while the event loop was running.
    """
    print(f"\n[Main] 收到中断信号 {signum},正在清理资源...")
    cleanup()
    sys.exit(0)


if __name__ == "__main__":
    # Register the signal handlers.
    signal.signal(signal.SIGINT, signal_handler)   # Ctrl+C
    signal.signal(signal.SIGTERM, signal_handler)  # termination request

    try:
        asyncio.get_event_loop().run_until_complete(main())
    except KeyboardInterrupt:
        print("\n[Main] 收到键盘中断,正在清理资源...")
    finally:
        cleanup()
def _register_cleanup_handlers(self):
    """
    Register hooks that terminate the launched browser process on exit.

    An ``atexit`` callback and SIGINT/SIGTERM handlers are installed; each
    calls ``self.launcher.cleanup()`` when a browser process is present.
    Registration happens at most once per manager instance.

    After cleaning up, the SIGINT handler raises ``KeyboardInterrupt``.
    For any other signal the handler that was installed before this call is
    invoked; when there was none the process exits via ``SystemExit`` so the
    termination request is not silently swallowed.
    """
    # getattr keeps this safe on instances whose __init__ did not set the flag.
    if getattr(self, "_cleanup_registered", False):
        return

    def sync_cleanup():
        """Synchronous cleanup used as the atexit callback."""
        if self.launcher and self.launcher.browser_process:
            utils.logger.info("[CDPBrowserManager] atexit: 清理浏览器进程")
            self.launcher.cleanup()

    atexit.register(sync_cleanup)

    # Handlers that were active before ours, keyed by signal number.
    previous_handlers = {}

    def signal_handler(signum, frame):
        """Kill the browser process, then continue the normal exit flow."""
        utils.logger.info(f"[CDPBrowserManager] 收到信号 {signum},清理浏览器进程")
        if self.launcher and self.launcher.browser_process:
            self.launcher.cleanup()

        # Re-raise KeyboardInterrupt so the regular Ctrl+C flow continues.
        if signum == signal.SIGINT:
            raise KeyboardInterrupt

        previous = previous_handlers.get(signum)
        if callable(previous):
            # Let the handler we replaced (e.g. the application's) decide.
            previous(signum, frame)
        elif previous != signal.SIG_IGN:
            # Default disposition: terminate with the conventional status.
            raise SystemExit(128 + signum)

    try:
        # signal.signal() returns the handler it replaces.
        for signum in (signal.SIGINT, signal.SIGTERM):
            previous_handlers[signum] = signal.signal(signum, signal_handler)
    except ValueError as e:
        # signal.signal() only works in the main thread; the atexit hook
        # registered above still covers normal interpreter shutdown.
        utils.logger.warning(f"[CDPBrowserManager] 无法注册信号处理器: {e}")

    self._cleanup_registered = True
    utils.logger.info("[CDPBrowserManager] 清理处理器已注册")
创建浏览器上下文 @@ -285,38 +324,67 @@ class CDPBrowserManager: return [] return [] - async def cleanup(self): + async def cleanup(self, force: bool = False): """ 清理资源 + + Args: + force: 是否强制清理浏览器进程(忽略AUTO_CLOSE_BROWSER配置) """ try: # 关闭浏览器上下文 if self.browser_context: try: - await self.browser_context.close() - utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭") + # 检查上下文是否已经关闭 + # 尝试获取页面列表,如果失败说明已经关闭 + try: + pages = self.browser_context.pages + if pages is not None: + await self.browser_context.close() + utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭") + except: + utils.logger.debug("[CDPBrowserManager] 浏览器上下文已经被关闭") except Exception as context_error: - utils.logger.warning( - f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}" - ) + # 只在错误不是因为已关闭时才记录警告 + error_msg = str(context_error).lower() + if "closed" not in error_msg and "disconnected" not in error_msg: + utils.logger.warning( + f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}" + ) + else: + utils.logger.debug(f"[CDPBrowserManager] 浏览器上下文已关闭: {context_error}") finally: self.browser_context = None # 断开浏览器连接 if self.browser: try: - await self.browser.close() - utils.logger.info("[CDPBrowserManager] 浏览器连接已断开") + # 检查浏览器是否仍然连接 + if self.browser.is_connected(): + await self.browser.close() + utils.logger.info("[CDPBrowserManager] 浏览器连接已断开") + else: + utils.logger.debug("[CDPBrowserManager] 浏览器连接已经断开") except Exception as browser_error: - utils.logger.warning( - f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}" - ) + # 只在错误不是因为已关闭时才记录警告 + error_msg = str(browser_error).lower() + if "closed" not in error_msg and "disconnected" not in error_msg: + utils.logger.warning( + f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}" + ) + else: + utils.logger.debug(f"[CDPBrowserManager] 浏览器连接已关闭: {browser_error}") finally: self.browser = None - # 关闭浏览器进程(如果配置为自动关闭) - if config.AUTO_CLOSE_BROWSER: - self.launcher.cleanup() + # 关闭浏览器进程 + # force=True 时强制关闭,忽略AUTO_CLOSE_BROWSER配置 + # 这用于处理异常退出或手动清理的情况 + if force or 
config.AUTO_CLOSE_BROWSER: + if self.launcher and self.launcher.browser_process: + self.launcher.cleanup() + else: + utils.logger.debug("[CDPBrowserManager] 没有需要清理的浏览器进程") else: utils.logger.info( "[CDPBrowserManager] 浏览器进程保持运行(AUTO_CLOSE_BROWSER=False)"