mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-02-06 23:21:33 +08:00
feat: cdp browser cleanup after crawler done
This commit is contained in:
58
main.py
58
main.py
@@ -11,6 +11,7 @@
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import signal
|
||||
from typing import Optional
|
||||
|
||||
import cmd_arg
|
||||
@@ -87,15 +88,62 @@ async def main():
|
||||
print(f"Error generating wordcloud: {e}")
|
||||
|
||||
|
||||
def cleanup():
    """Close the database connection on exit when a database-backed store is configured."""
    if crawler:
        # Placeholder: the crawler itself has nothing to release here.
        pass

    if config.SAVE_DATA_OPTION not in ["db", "sqlite"]:
        return
    asyncio.run(db.close())
|
||||
async def async_cleanup():
    """Release asynchronous resources: the browser first, then the database.

    When the crawler exposes a truthy ``cdp_manager``, its ``cleanup`` is
    awaited with ``force=True``; otherwise a truthy ``browser_context`` is
    closed directly. Errors raised while closing are printed only when their
    message mentions neither "closed" nor "disconnected". Finally the database
    connection is closed when ``config.SAVE_DATA_OPTION`` is "db" or "sqlite".
    """

    def _should_report(exc):
        # Messages containing "closed" / "disconnected" are not printed.
        text = str(exc).lower()
        return not ("closed" in text or "disconnected" in text)

    if crawler:
        cdp_manager = getattr(crawler, "cdp_manager", None)
        if cdp_manager:
            # CDP mode: ask the manager to clean up, forcing browser shutdown.
            try:
                await cdp_manager.cleanup(force=True)
            except Exception as exc:
                if _should_report(exc):
                    print(f"[Main] 清理CDP浏览器时出错: {exc}")
        else:
            # Non-CDP mode: close the standard browser context, if any.
            context = getattr(crawler, "browser_context", None)
            if context:
                try:
                    if hasattr(context, "pages"):
                        await context.close()
                except Exception as exc:
                    if _should_report(exc):
                        print(f"[Main] 关闭浏览器上下文时出错: {exc}")

    # Close the database connection.
    uses_database = config.SAVE_DATA_OPTION in ["db", "sqlite"]
    if uses_database:
        await db.close()
|
||||
|
||||
def cleanup():
    """Synchronous cleanup entry point.

    Runs ``async_cleanup()`` to completion on a freshly created event loop and
    prints any ``Exception`` it raises instead of propagating it.

    The loop is closed in a ``finally`` clause so it is released even when the
    cleanup coroutine fails, or when ``run_until_complete`` refuses to start
    (for example, when this is called while another loop is already running
    in the same thread).
    """
    loop = None
    try:
        # Use a dedicated event loop for the asynchronous cleanup.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(async_cleanup())
    except Exception as e:
        print(f"[Main] 清理时出错: {e}")
    finally:
        if loop is not None:
            loop.close()
|
||||
|
||||
|
||||
def signal_handler(signum, _frame):
    """Handle interrupt signals such as Ctrl+C by requesting process exit.

    Args:
        signum: Number of the received signal.
        _frame: Current stack frame (unused).

    Raises:
        SystemExit: Always, with exit code 0.

    Cleanup is deliberately NOT run here. The handler fires while the main
    event loop is running, and ``cleanup()`` drives a second loop with
    ``run_until_complete``, which asyncio rejects in that state. The
    ``SystemExit`` raised below unwinds through the ``finally: cleanup()``
    clause of the ``__main__`` block, where cleanup runs exactly once after
    the main loop has stopped.
    """
    print(f"\n[Main] 收到中断信号 {signum},正在清理资源...")
    sys.exit(0)
|
||||
|
||||
if __name__ == "__main__":
    # Register signal handlers.
    signal.signal(signal.SIGINT, signal_handler)   # Ctrl+C
    signal.signal(signal.SIGTERM, signal_handler)  # termination signal

    # Create the loop explicitly: asyncio.get_event_loop() is deprecated when
    # no loop is current and raises RuntimeError on newer Python versions.
    main_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(main_loop)

    try:
        main_loop.run_until_complete(main())
    except KeyboardInterrupt:
        print("\n[Main] 收到键盘中断,正在清理资源...")
    finally:
        # Always release browser and database resources, however we exit.
        cleanup()
|
||||
|
||||
@@ -13,6 +13,8 @@ import os
|
||||
import asyncio
|
||||
import socket
|
||||
import httpx
|
||||
import signal
|
||||
import atexit
|
||||
from typing import Optional, Dict, Any
|
||||
from playwright.async_api import Browser, BrowserContext, Playwright
|
||||
|
||||
@@ -31,6 +33,40 @@ class CDPBrowserManager:
|
||||
self.browser: Optional[Browser] = None
|
||||
self.browser_context: Optional[BrowserContext] = None
|
||||
self.debug_port: Optional[int] = None
|
||||
self._cleanup_registered = False
|
||||
|
||||
def _register_cleanup_handlers(self):
    """
    Register handlers that clean up the browser process when the program exits.

    Installs an ``atexit`` callback and SIGINT/SIGTERM handlers that call
    ``self.launcher.cleanup()`` when a browser process is present. The
    registration is performed at most once per manager instance.
    """
    if self._cleanup_registered:
        return

    def sync_cleanup():
        """Synchronous cleanup callback for atexit."""
        if self.launcher and self.launcher.browser_process:
            utils.logger.info("[CDPBrowserManager] atexit: 清理浏览器进程")
            self.launcher.cleanup()

    # Register the atexit cleanup.
    atexit.register(sync_cleanup)

    def signal_handler(signum, frame):
        """Clean up the browser process, then let the process terminate."""
        utils.logger.info(f"[CDPBrowserManager] 收到信号 {signum},清理浏览器进程")
        if self.launcher and self.launcher.browser_process:
            self.launcher.cleanup()
        # Re-raise KeyboardInterrupt so the normal exit flow proceeds.
        if signum == signal.SIGINT:
            raise KeyboardInterrupt
        # For any other signal (SIGTERM) the handler must not simply return:
        # that would swallow the termination request and leave the process
        # running without its browser. Exit with the conventional 128+signum.
        raise SystemExit(128 + signum)

    # Register SIGINT (Ctrl+C) and SIGTERM. signal.signal() raises ValueError
    # outside the main thread; the atexit hook above still applies then, so
    # do not let that abort the browser launch.
    try:
        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)
    except ValueError as e:
        utils.logger.warning(f"[CDPBrowserManager] 无法注册信号处理器(非主线程): {e}")

    self._cleanup_registered = True
    utils.logger.info("[CDPBrowserManager] 清理处理器已注册")
|
||||
|
||||
async def launch_and_connect(
|
||||
self,
|
||||
@@ -52,7 +88,10 @@ class CDPBrowserManager:
|
||||
# 3. 启动浏览器
|
||||
await self._launch_browser(browser_path, headless)
|
||||
|
||||
# 4. 通过CDP连接
|
||||
# 4. 注册清理处理器(确保异常退出时也能清理)
|
||||
self._register_cleanup_handlers()
|
||||
|
||||
# 5. 通过CDP连接
|
||||
await self._connect_via_cdp(playwright)
|
||||
|
||||
# 5. 创建浏览器上下文
|
||||
@@ -285,38 +324,67 @@ class CDPBrowserManager:
|
||||
return []
|
||||
return []
|
||||
|
||||
async def cleanup(self):
|
||||
async def cleanup(self, force: bool = False):
|
||||
"""
|
||||
清理资源
|
||||
|
||||
Args:
|
||||
force: 是否强制清理浏览器进程(忽略AUTO_CLOSE_BROWSER配置)
|
||||
"""
|
||||
try:
|
||||
# 关闭浏览器上下文
|
||||
if self.browser_context:
|
||||
try:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
|
||||
# 检查上下文是否已经关闭
|
||||
# 尝试获取页面列表,如果失败说明已经关闭
|
||||
try:
|
||||
pages = self.browser_context.pages
|
||||
if pages is not None:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
|
||||
except:
|
||||
utils.logger.debug("[CDPBrowserManager] 浏览器上下文已经被关闭")
|
||||
except Exception as context_error:
|
||||
utils.logger.warning(
|
||||
f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}"
|
||||
)
|
||||
# 只在错误不是因为已关闭时才记录警告
|
||||
error_msg = str(context_error).lower()
|
||||
if "closed" not in error_msg and "disconnected" not in error_msg:
|
||||
utils.logger.warning(
|
||||
f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}"
|
||||
)
|
||||
else:
|
||||
utils.logger.debug(f"[CDPBrowserManager] 浏览器上下文已关闭: {context_error}")
|
||||
finally:
|
||||
self.browser_context = None
|
||||
|
||||
# 断开浏览器连接
|
||||
if self.browser:
|
||||
try:
|
||||
await self.browser.close()
|
||||
utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
|
||||
# 检查浏览器是否仍然连接
|
||||
if self.browser.is_connected():
|
||||
await self.browser.close()
|
||||
utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
|
||||
else:
|
||||
utils.logger.debug("[CDPBrowserManager] 浏览器连接已经断开")
|
||||
except Exception as browser_error:
|
||||
utils.logger.warning(
|
||||
f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}"
|
||||
)
|
||||
# 只在错误不是因为已关闭时才记录警告
|
||||
error_msg = str(browser_error).lower()
|
||||
if "closed" not in error_msg and "disconnected" not in error_msg:
|
||||
utils.logger.warning(
|
||||
f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}"
|
||||
)
|
||||
else:
|
||||
utils.logger.debug(f"[CDPBrowserManager] 浏览器连接已关闭: {browser_error}")
|
||||
finally:
|
||||
self.browser = None
|
||||
|
||||
# 关闭浏览器进程(如果配置为自动关闭)
|
||||
if config.AUTO_CLOSE_BROWSER:
|
||||
self.launcher.cleanup()
|
||||
# 关闭浏览器进程
|
||||
# force=True 时强制关闭,忽略AUTO_CLOSE_BROWSER配置
|
||||
# 这用于处理异常退出或手动清理的情况
|
||||
if force or config.AUTO_CLOSE_BROWSER:
|
||||
if self.launcher and self.launcher.browser_process:
|
||||
self.launcher.cleanup()
|
||||
else:
|
||||
utils.logger.debug("[CDPBrowserManager] 没有需要清理的浏览器进程")
|
||||
else:
|
||||
utils.logger.info(
|
||||
"[CDPBrowserManager] 浏览器进程保持运行(AUTO_CLOSE_BROWSER=False)"
|
||||
|
||||
Reference in New Issue
Block a user