mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-23 11:07:25 +08:00
fix: 避免复用浏览器时跨域 Cookie 过长导致请求失败
连接已有 Chrome 会把整个浏览器上下文的 cookie 带入平台 client。 除 xhs 外,多数平台仍直接读取全量 cookies,导致请求头过长并放大跨域污染。 本次将各平台的 cookie 读取统一收口到平台域名,并补上基础回归测试。 Constraint: 必须继续复用用户真实浏览器里的平台登录态 Rejected: 仅修复 xhs | 其他平台在连接已有浏览器时仍会携带超长 Cookie Confidence: high Scope-risk: moderate Reversibility: clean Directive: 后续新增平台或调整 update_cookies 和 create client 流程时,只按平台域名读取 cookies Tested: uv run pytest test/test_utils.py; python3 -m compileall tools/crawler_util.py media_platform/douyin/core.py media_platform/douyin/client.py media_platform/kuaishou/core.py media_platform/kuaishou/client.py media_platform/bilibili/core.py media_platform/bilibili/client.py media_platform/zhihu/core.py media_platform/zhihu/client.py media_platform/tieba/core.py media_platform/tieba/client.py media_platform/xhs/core.py media_platform/xhs/client.py media_platform/weibo/core.py media_platform/weibo/client.py test/test_utils.py Not-tested: 各平台在真实 CDP 浏览器连接下的端到端抓取流程
This commit is contained in:
@@ -34,7 +34,7 @@ from typing import Dict, List, Optional, Tuple, cast
|
||||
|
||||
import httpx
|
||||
from PIL import Image, ImageDraw, ImageShow
|
||||
from playwright.async_api import Cookie, Page
|
||||
from playwright.async_api import BrowserContext, Cookie, Page
|
||||
|
||||
from . import utils
|
||||
from .httpx_util import make_async_client
|
||||
@@ -145,6 +145,17 @@ def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
|
||||
return cookies_str, cookie_dict
|
||||
|
||||
|
||||
async def convert_browser_context_cookies(
    browser_context: BrowserContext, urls: Optional[List[str]] = None
) -> Tuple[str, Dict]:
    """Read cookies from a browser context and convert them with ``convert_cookies``.

    Args:
        browser_context: Playwright browser context to read cookies from.
        urls: Optional list of URLs forwarded to ``browser_context.cookies``
            as its ``urls`` filter. When ``None`` or empty, the context is
            queried without any filter.

    Returns:
        The ``(cookie_string, cookie_dict)`` pair produced by
        ``convert_cookies``.
    """
    if not urls:
        # No URL filter supplied: query the context without restrictions.
        unfiltered = await browser_context.cookies()
        return convert_cookies(unfiltered)

    # Restrict the lookup to the requested URLs.
    scoped = await browser_context.cookies(urls=urls)
    return convert_cookies(scoped)
|
||||
|
||||
|
||||
def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
|
||||
cookie_dict: Dict[str, str] = dict()
|
||||
if not cookie_str:
|
||||
|
||||
Reference in New Issue
Block a user