feat: support Playwright connecting to a local Chrome browser via the CDP protocol

docs: add documentation on managing Python dependencies with uv
Author: 程序员阿江(Relakkes)
Date: 2025-06-25 23:22:39 +08:00
parent fbc9788d54
commit e83b2422d9
10 changed files with 991 additions and 52 deletions
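The core of the CDP feature is Playwright's `connect_over_cdp` API: instead of launching its own bundled Chromium, the crawler attaches to a Chrome instance that is already running with its DevTools endpoint open, so the real browser profile (cookies, extensions, fingerprint) is reused. A minimal sketch of that connection flow, assuming a local Chrome was started beforehand with `--remote-debugging-port=9222` (the port and target URL here are illustrative assumptions, not values taken from this commit):

```python
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        # Attach to a Chrome that was started separately, e.g.:
        #   chrome --remote-debugging-port=9222
        browser = await p.chromium.connect_over_cdp("http://localhost:9222")

        # Reuse the existing default context so the real profile's cookies are kept.
        context = browser.contexts[0] if browser.contexts else await browser.new_context()
        page = await context.new_page()
        await page.goto("https://www.xiaohongshu.com")
        print(await page.title())


asyncio.run(main())
```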


@@ -16,7 +16,7 @@ import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from playwright.async_api import BrowserContext, BrowserType, Page, async_playwright
from playwright.async_api import BrowserContext, BrowserType, Page, Playwright, async_playwright
from tenacity import RetryError
import config
@@ -26,6 +26,7 @@ from model.m_xiaohongshu import NoteUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from var import crawler_type_var, source_keyword_var
from .client import XiaoHongShuClient
@@ -39,11 +40,13 @@ class XiaoHongShuCrawler(AbstractCrawler):
context_page: Page
xhs_client: XiaoHongShuClient
browser_context: BrowserContext
cdp_manager: Optional[CDPBrowserManager]
def __init__(self) -> None:
self.index_url = "https://www.xiaohongshu.com"
# self.user_agent = utils.get_user_agent()
self.user_agent = config.UA if config.UA else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
self.cdp_manager = None
async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None
@@ -57,11 +60,20 @@ class XiaoHongShuCrawler(AbstractCrawler):
)
async with async_playwright() as playwright:
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(
chromium, None, self.user_agent, headless=config.HEADLESS
)
# Choose the launch mode based on configuration
if config.ENABLE_CDP_MODE:
utils.logger.info("[XiaoHongShuCrawler] Launching browser in CDP mode")
self.browser_context = await self.launch_browser_with_cdp(
playwright, playwright_proxy_format, self.user_agent,
headless=config.CDP_HEADLESS
)
else:
utils.logger.info("[XiaoHongShuCrawler] 使用标准模式启动浏览器")
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(
chromium, playwright_proxy_format, self.user_agent, headless=config.HEADLESS
)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
# add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
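The branch above keys off the new configuration switches. A sketch of what they might look like in the project's config module; the flag names (ENABLE_CDP_MODE, CDP_HEADLESS, HEADLESS, UA) come from this diff, while the defaults and comments shown are assumptions:

```python
# Configuration sketch: only the names are taken from the diff, defaults are assumptions.
ENABLE_CDP_MODE = True   # True: attach to a locally running Chrome over CDP; False: standard launch
CDP_HEADLESS = False     # headless flag used on the CDP code path
HEADLESS = False         # headless flag used on the standard Playwright launch path
UA = ""                  # optional user-agent override; empty string falls back to the built-in default
```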
@@ -292,6 +304,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
else:
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
try:
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
# Try to fetch the web version of the note detail directly, carrying cookies
note_detail_from_html: Optional[Dict] = (
await self.xhs_client.get_note_by_id_from_html(
@@ -449,9 +462,40 @@ class XiaoHongShuCrawler(AbstractCrawler):
)
return browser_context
async def launch_browser_with_cdp(self, playwright: Playwright, playwright_proxy: Optional[Dict],
user_agent: Optional[str], headless: bool = True) -> BrowserContext:
"""
Launch the browser in CDP mode
"""
try:
self.cdp_manager = CDPBrowserManager()
browser_context = await self.cdp_manager.launch_and_connect(
playwright=playwright,
playwright_proxy=playwright_proxy,
user_agent=user_agent,
headless=headless
)
# Log the browser information
browser_info = await self.cdp_manager.get_browser_info()
utils.logger.info(f"[XiaoHongShuCrawler] CDP浏览器信息: {browser_info}")
return browser_context
except Exception as e:
utils.logger.error(f"[XiaoHongShuCrawler] CDP mode failed to start, falling back to standard mode: {e}")
# Fall back to standard mode
chromium = playwright.chromium
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
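The `CDPBrowserManager` used here is added by this commit in `tools/cdp_browser.py`, which is not shown in this hunk. A rough sketch of the general launch-and-connect flow; the class and method names match the call sites above, but the body below is an assumption rather than the actual implementation:

```python
import asyncio
import subprocess
from typing import Dict, Optional

from playwright.async_api import Browser, BrowserContext, Playwright


class CDPBrowserManager:
    """Sketch only: start a local Chrome with remote debugging and attach Playwright to it."""

    def __init__(self, cdp_port: int = 9222) -> None:
        self.cdp_port = cdp_port
        self.browser: Optional[Browser] = None
        self.chrome_process: Optional[subprocess.Popen] = None

    async def launch_and_connect(self, playwright: Playwright,
                                 playwright_proxy: Optional[Dict],
                                 user_agent: Optional[str],
                                 headless: bool = True) -> BrowserContext:
        # Start a local Chrome with its DevTools endpoint open (binary name is an assumption).
        args = ["google-chrome", f"--remote-debugging-port={self.cdp_port}"]
        if headless:
            args.append("--headless=new")
        self.chrome_process = subprocess.Popen(args)
        await asyncio.sleep(2)  # crude wait for the DevTools endpoint to come up

        # Attach Playwright to the already running browser over CDP.
        self.browser = await playwright.chromium.connect_over_cdp(
            f"http://localhost:{self.cdp_port}"
        )
        # Prefer the existing default context so the real profile (cookies, fingerprint) is reused.
        if self.browser.contexts:
            return self.browser.contexts[0]
        return await self.browser.new_context(user_agent=user_agent, proxy=playwright_proxy)

    async def get_browser_info(self) -> Dict:
        return {"version": self.browser.version, "contexts": len(self.browser.contexts)}

    async def cleanup(self) -> None:
        # Disconnect Playwright and stop the Chrome process we started.
        if self.browser:
            await self.browser.close()
        if self.chrome_process:
            self.chrome_process.terminate()
```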
async def close(self):
"""Close browser context"""
await self.browser_context.close()
# CDP mode requires special cleanup handling
if self.cdp_manager:
await self.cdp_manager.cleanup()
self.cdp_manager = None
else:
await self.browser_context.close()
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
async def get_notice_media(self, note_detail: Dict):