mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-10 12:37:37 +08:00
fix: 避免复用浏览器时跨域 Cookie 过长导致请求失败
连接已有 Chrome 会把整个浏览器上下文的 cookie 带入平台 client。
除 xhs 外,多数平台仍直接读取全量 cookies,导致请求头过长并放大跨域污染。
本次将各平台的 cookie 读取统一收口到平台域名,并补上基础回归测试。

Constraint: 必须继续复用用户真实浏览器里的平台登录态
Rejected: 仅修复 xhs | 其他平台在连接已有浏览器时仍会携带超长 Cookie
Confidence: high
Scope-risk: moderate
Reversibility: clean
Directive: 后续新增平台或调整 update_cookies 和 create client 流程时,只按平台域名读取 cookies
Tested: uv run pytest test/test_utils.py; python3 -m compileall tools/crawler_util.py media_platform/douyin/core.py media_platform/douyin/client.py media_platform/kuaishou/core.py media_platform/kuaishou/client.py media_platform/bilibili/core.py media_platform/bilibili/client.py media_platform/zhihu/core.py media_platform/zhihu/client.py media_platform/tieba/core.py media_platform/tieba/client.py media_platform/xhs/core.py media_platform/xhs/client.py media_platform/weibo/core.py media_platform/weibo/client.py test/test_utils.py
Not-tested: 各平台在真实 CDP 浏览器连接下的端到端抓取流程
This commit is contained in:
@@ -60,6 +60,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://api.bilibili.com"
|
||||
self.cookie_urls = ["https://www.bilibili.com"]
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
@@ -145,8 +146,11 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None):
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=urls or self.cookie_urls,
|
||||
)
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
|
||||
@@ -62,6 +62,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
|
||||
def __init__(self):
|
||||
self.index_url = "https://www.bilibili.com"
|
||||
self.cookie_urls = [self.index_url]
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
@@ -105,7 +106,10 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.bili_client.update_cookies(browser_context=self.browser_context)
|
||||
await self.bili_client.update_cookies(
|
||||
browser_context=self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
@@ -462,7 +466,10 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
:return: bilibili client
|
||||
"""
|
||||
utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
bilibili_client_obj = BilibiliClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
|
||||
@@ -56,6 +56,13 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://www.douyin.com"
|
||||
self.cookie_urls = [
|
||||
"https://douyin.com",
|
||||
self._host,
|
||||
"https://creator.douyin.com",
|
||||
"https://douhot.douyin.com",
|
||||
"https://live.douyin.com",
|
||||
]
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
@@ -145,11 +152,17 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
if local_storage.get("HasUserLogin", "") == "1":
|
||||
return True
|
||||
|
||||
_, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
_, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
return cookie_dict.get("LOGIN_STATUS") == "1"
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None):
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=urls or self.cookie_urls,
|
||||
)
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
|
||||
@@ -54,6 +54,13 @@ class DouYinCrawler(AbstractCrawler):
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://www.douyin.com"
|
||||
self.cookie_urls = [
|
||||
"https://douyin.com",
|
||||
self.index_url,
|
||||
"https://creator.douyin.com",
|
||||
"https://douhot.douyin.com",
|
||||
"https://live.douyin.com",
|
||||
]
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh
|
||||
|
||||
@@ -100,7 +107,10 @@ class DouYinCrawler(AbstractCrawler):
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.dy_client.update_cookies(browser_context=self.browser_context)
|
||||
await self.dy_client.update_cookies(
|
||||
browser_context=self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
@@ -298,7 +308,10 @@ class DouYinCrawler(AbstractCrawler):
|
||||
|
||||
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient:
|
||||
"""Create douyin client"""
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
) # type: ignore
|
||||
douyin_client = DouYinClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
|
||||
@@ -56,6 +56,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.headers = headers
|
||||
self._host = "https://www.kuaishou.com/graphql"
|
||||
self._rest_host = "https://www.kuaishou.com"
|
||||
self.cookie_urls = [self._rest_host]
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
self.graphql = KuaiShouGraphQL()
|
||||
@@ -133,8 +134,11 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None):
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=urls or self.cookie_urls,
|
||||
)
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
|
||||
@@ -56,6 +56,7 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
|
||||
def __init__(self):
|
||||
self.index_url = "https://www.kuaishou.com"
|
||||
self.cookie_urls = [self.index_url]
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # Proxy IP pool, used for automatic proxy refresh
|
||||
@@ -107,7 +108,8 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.ks_client.update_cookies(
|
||||
browser_context=self.browser_context
|
||||
browser_context=self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
@@ -296,7 +298,8 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
time.sleep(20)
|
||||
await self.context_page.goto(f"{self.index_url}?isHome=1")
|
||||
await self.ks_client.update_cookies(
|
||||
browser_context=self.browser_context
|
||||
browser_context=self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
|
||||
@@ -304,8 +307,9 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
utils.logger.info(
|
||||
"[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ..."
|
||||
)
|
||||
cookie_str, cookie_dict = utils.convert_cookies(
|
||||
await self.browser_context.cookies()
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
ks_client_obj = KuaiShouClient(
|
||||
proxy=httpx_proxy,
|
||||
|
||||
@@ -54,6 +54,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
"Cookie": "",
|
||||
}
|
||||
self._host = "https://tieba.baidu.com"
|
||||
self.cookie_urls = [self._host]
|
||||
self._page_extractor = TieBaExtractor()
|
||||
self.default_ip_proxy = default_ip_proxy
|
||||
self.playwright_page = playwright_page # Playwright page object
|
||||
@@ -209,7 +210,10 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
try:
|
||||
# Get cookies from browser and check key login cookies
|
||||
_, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
_, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
# Baidu Tieba login identifiers: STOKEN or PTOKEN
|
||||
stoken = cookie_dict.get("STOKEN")
|
||||
@@ -227,7 +231,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
utils.logger.error(f"[BaiduTieBaClient.pong] Check login state failed: {e}, assume not logged in")
|
||||
return False
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None):
|
||||
"""
|
||||
Update cookies method provided by API client, usually called after successful login
|
||||
Args:
|
||||
@@ -236,7 +240,10 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=urls or self.cookie_urls,
|
||||
)
|
||||
self.headers["Cookie"] = cookie_str
|
||||
utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")
|
||||
|
||||
|
||||
@@ -54,6 +54,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://tieba.baidu.com"
|
||||
self.cookie_urls = [self.index_url]
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self._page_extractor = TieBaExtractor()
|
||||
self.cdp_manager = None
|
||||
@@ -123,7 +124,10 @@ class TieBaCrawler(AbstractCrawler):
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.tieba_client.update_cookies(browser_context=self.browser_context)
|
||||
await self.tieba_client.update_cookies(
|
||||
browser_context=self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
@@ -560,7 +564,10 @@ class TieBaCrawler(AbstractCrawler):
|
||||
user_agent = await self.context_page.evaluate("() => navigator.userAgent")
|
||||
utils.logger.info(f"[TieBaCrawler.create_tieba_client] Extracted User-Agent from browser: {user_agent}")
|
||||
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
# Build complete browser request headers, simulating real browser behavior
|
||||
tieba_client = BaiduTieBaClient(
|
||||
|
||||
@@ -62,6 +62,7 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://m.weibo.cn"
|
||||
self.cookie_urls = [self._host]
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
self._image_agent_host = "https://i1.wp.com/"
|
||||
@@ -137,17 +138,16 @@ class WeiboClient(ProxyRefreshMixin):
|
||||
:param urls: Optional list of URLs to filter cookies (e.g., ["https://m.weibo.cn"])
|
||||
If provided, only cookies for these URLs will be retrieved
|
||||
"""
|
||||
if urls:
|
||||
cookies = await browser_context.cookies(urls=urls)
|
||||
utils.logger.info(f"[WeiboClient.update_cookies] Updating cookies for specific URLs: {urls}")
|
||||
else:
|
||||
cookies = await browser_context.cookies()
|
||||
utils.logger.info("[WeiboClient.update_cookies] Updating all cookies")
|
||||
|
||||
cookie_str, cookie_dict = utils.convert_cookies(cookies)
|
||||
cookie_urls = urls or self.cookie_urls
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=cookie_urls,
|
||||
)
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
utils.logger.info(f"[WeiboClient.update_cookies] Cookie updated successfully, total: {len(cookie_dict)} cookies")
|
||||
utils.logger.info(
|
||||
f"[WeiboClient.update_cookies] Cookie updated successfully for {cookie_urls}, total: {len(cookie_dict)} cookies"
|
||||
)
|
||||
|
||||
async def get_note_by_keyword(
|
||||
self,
|
||||
|
||||
@@ -60,6 +60,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
def __init__(self):
|
||||
self.index_url = "https://www.weibo.com"
|
||||
self.mobile_index_url = "https://m.weibo.cn"
|
||||
self.cookie_urls = [self.mobile_index_url]
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.mobile_user_agent = utils.get_mobile_user_agent()
|
||||
self.cdp_manager = None
|
||||
@@ -116,7 +117,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
# Only get mobile cookies to avoid confusion between PC and mobile cookies
|
||||
await self.wb_client.update_cookies(
|
||||
browser_context=self.browser_context,
|
||||
urls=[self.mobile_index_url]
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
@@ -338,7 +339,10 @@ class WeiboCrawler(AbstractCrawler):
|
||||
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
|
||||
"""Create xhs client"""
|
||||
utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies(urls=[self.mobile_index_url]))
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
weibo_client_obj = WeiboClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
|
||||
@@ -63,6 +63,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
else:
|
||||
self._host = "https://edith.xiaohongshu.com"
|
||||
self._domain = "https://www.xiaohongshu.com"
|
||||
self.cookie_urls = [self._domain]
|
||||
self.IP_ERROR_STR = "Network connection error, please check network settings or restart"
|
||||
self.IP_ERROR_CODE = 300012
|
||||
self.NOTE_NOT_FOUND_CODE = -510000
|
||||
@@ -260,7 +261,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
utils.logger.info(f"[XiaoHongShuClient.pong] Login state result: {ping_flag}")
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None):
|
||||
"""
|
||||
Update cookies method provided by API client, usually called after successful login
|
||||
Args:
|
||||
@@ -269,7 +270,10 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=urls or self.cookie_urls,
|
||||
)
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
|
||||
@@ -56,6 +56,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://www.rednote.com" if config.XHS_INTERNATIONAL else "https://www.xiaohongshu.com"
|
||||
self.cookie_urls = [self.index_url]
|
||||
# self.user_agent = utils.get_user_agent()
|
||||
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
||||
self.cdp_manager = None
|
||||
@@ -105,7 +106,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.xhs_client.update_cookies(browser_context=self.browser_context)
|
||||
await self.xhs_client.update_cookies(
|
||||
browser_context=self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
@@ -356,8 +360,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
|
||||
"""Create Xiaohongshu client"""
|
||||
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create Xiaohongshu API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(
|
||||
await self.browser_context.cookies(self.index_url)
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
xhs_client_obj = XiaoHongShuClient(
|
||||
proxy=httpx_proxy,
|
||||
|
||||
@@ -59,6 +59,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
self.default_headers = headers
|
||||
self.cookie_urls = ["https://www.zhihu.com"]
|
||||
self.cookie_dict = cookie_dict
|
||||
self._extractor = ZhihuExtractor()
|
||||
# Initialize proxy pool (from ProxyRefreshMixin)
|
||||
@@ -160,7 +161,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None):
|
||||
"""
|
||||
Update cookies method provided by API client, typically called after successful login
|
||||
Args:
|
||||
@@ -169,7 +170,10 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=urls or self.cookie_urls,
|
||||
)
|
||||
self.default_headers["cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
|
||||
@@ -57,6 +57,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://www.zhihu.com"
|
||||
self.cookie_urls = [self.index_url]
|
||||
# self.user_agent = utils.get_user_agent()
|
||||
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
|
||||
self._extractor = ZhihuExtractor()
|
||||
@@ -114,7 +115,8 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.zhihu_client.update_cookies(
|
||||
browser_context=self.browser_context
|
||||
browser_context=self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
# Zhihu's search API requires opening the search page first to access cookies, homepage alone won't work
|
||||
@@ -125,7 +127,10 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
|
||||
)
|
||||
await asyncio.sleep(5)
|
||||
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
|
||||
await self.zhihu_client.update_cookies(
|
||||
browser_context=self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
@@ -393,8 +398,9 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
utils.logger.info(
|
||||
"[ZhihuCrawler.create_zhihu_client] Begin create zhihu API client ..."
|
||||
)
|
||||
cookie_str, cookie_dict = utils.convert_cookies(
|
||||
await self.browser_context.cookies()
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
self.browser_context,
|
||||
urls=self.cookie_urls,
|
||||
)
|
||||
zhihu_client_obj = ZhiHuClient(
|
||||
proxy=httpx_proxy,
|
||||
|
||||
@@ -20,6 +20,10 @@
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from tools import utils
|
||||
|
||||
|
||||
@@ -28,3 +32,18 @@ def test_convert_cookies():
|
||||
cookie_dict = utils.convert_str_cookie_to_dict(xhs_cookies)
|
||||
assert cookie_dict.get("webId") == "1190c4d3cxxxx125xxx"
|
||||
assert cookie_dict.get("a1") == "x000101360"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_browser_context_cookies_uses_url_filter():
|
||||
browser_context = AsyncMock()
|
||||
browser_context.cookies.return_value = [{"name": "sessionid", "value": "abc"}]
|
||||
|
||||
cookie_str, cookie_dict = await utils.convert_browser_context_cookies(
|
||||
browser_context,
|
||||
urls=["https://www.douyin.com"],
|
||||
)
|
||||
|
||||
browser_context.cookies.assert_awaited_once_with(urls=["https://www.douyin.com"])
|
||||
assert cookie_str == "sessionid=abc"
|
||||
assert cookie_dict == {"sessionid": "abc"}
|
||||
|
||||
@@ -34,7 +34,7 @@ from typing import Dict, List, Optional, Tuple, cast
|
||||
|
||||
import httpx
|
||||
from PIL import Image, ImageDraw, ImageShow
|
||||
from playwright.async_api import Cookie, Page
|
||||
from playwright.async_api import BrowserContext, Cookie, Page
|
||||
|
||||
from . import utils
|
||||
from .httpx_util import make_async_client
|
||||
@@ -145,6 +145,17 @@ def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
|
||||
return cookies_str, cookie_dict
|
||||
|
||||
|
||||
async def convert_browser_context_cookies(
|
||||
browser_context: BrowserContext, urls: Optional[List[str]] = None
|
||||
) -> Tuple[str, Dict]:
|
||||
cookies = (
|
||||
await browser_context.cookies(urls=urls)
|
||||
if urls
|
||||
else await browser_context.cookies()
|
||||
)
|
||||
return convert_cookies(cookies)
|
||||
|
||||
|
||||
def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
|
||||
cookie_dict: Dict[str, str] = dict()
|
||||
if not cookie_str:
|
||||
|
||||
Reference in New Issue
Block a user