mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-27 21:17:27 +08:00
fix: 避免复用浏览器时跨域 Cookie 过长导致请求失败
连接已有 Chrome 会把整个浏览器上下文的 cookie 带入平台 client。
除 xhs 外,多数平台仍直接读取全量 cookies,导致请求头过长并放大跨域污染。
本次将各平台的 cookie 读取统一收口到平台域名,并补上基础回归测试。

Constraint: 必须继续复用用户真实浏览器里的平台登录态
Rejected: 仅修复 xhs | 其他平台在连接已有浏览器时仍会携带超长 Cookie
Confidence: high
Scope-risk: moderate
Reversibility: clean
Directive: 后续新增平台或调整 update_cookies 和 create client 流程时,只按平台域名读取 cookies
Tested: uv run pytest test/test_utils.py; python3 -m compileall tools/crawler_util.py media_platform/douyin/core.py media_platform/douyin/client.py media_platform/kuaishou/core.py media_platform/kuaishou/client.py media_platform/bilibili/core.py media_platform/bilibili/client.py media_platform/zhihu/core.py media_platform/zhihu/client.py media_platform/tieba/core.py media_platform/tieba/client.py media_platform/xhs/core.py media_platform/xhs/client.py media_platform/weibo/core.py media_platform/weibo/client.py test/test_utils.py
Not-tested: 各平台在真实 CDP 浏览器连接下的端到端抓取流程
This commit is contained in:
@@ -60,6 +60,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://api.bilibili.com"
|
||||
self.cookie_urls = ["https://www.bilibili.com"]
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
@@ -145,8 +146,11 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None):
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=urls or self.cookie_urls,
|
||||
)
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
|
||||
@@ -62,6 +62,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
|
||||
def __init__(self):
|
||||
self.index_url = "https://www.bilibili.com"
|
||||
self.cookie_urls = [self.index_url]
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
@@ -105,7 +106,10 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.bili_client.update_cookies(browser_context=self.browser_context)
|
||||
await self.bili_client.update_cookies(
|
||||
browser_context=self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
@@ -462,7 +466,10 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
:return: bilibili client
|
||||
"""
|
||||
utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
bilibili_client_obj = BilibiliClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
|
||||
Reference in New Issue
Block a user