diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index ab87dfc..a9c3262 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -60,6 +60,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): self.timeout = timeout self.headers = headers self._host = "https://api.bilibili.com" + self.cookie_urls = ["https://www.bilibili.com"] self.playwright_page = playwright_page self.cookie_dict = cookie_dict # Initialize proxy pool (from ProxyRefreshMixin) @@ -145,8 +146,11 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): ping_flag = False return ping_flag - async def update_cookies(self, browser_context: BrowserContext): - cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None): + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=urls or self.cookie_urls, + ) self.headers["Cookie"] = cookie_str self.cookie_dict = cookie_dict diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 39e8261..c051fd6 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -62,6 +62,7 @@ class BilibiliCrawler(AbstractCrawler): def __init__(self): self.index_url = "https://www.bilibili.com" + self.cookie_urls = [self.index_url] self.user_agent = utils.get_user_agent() self.cdp_manager = None self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh @@ -105,7 +106,10 @@ class BilibiliCrawler(AbstractCrawler): cookie_str=config.COOKIES, ) await login_obj.begin() - await self.bili_client.update_cookies(browser_context=self.browser_context) + await self.bili_client.update_cookies( + browser_context=self.browser_context, + urls=self.cookie_urls, + ) crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": @@ -462,7 +466,10 @@ class BilibiliCrawler(AbstractCrawler): :return: bilibili client """ utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...") - cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + self.browser_context, + urls=self.cookie_urls, + ) bilibili_client_obj = BilibiliClient( proxy=httpx_proxy, headers={ diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py index ce98ae7..c9c2818 100644 --- a/media_platform/douyin/client.py +++ b/media_platform/douyin/client.py @@ -56,6 +56,13 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin): self.timeout = timeout self.headers = headers self._host = "https://www.douyin.com" + self.cookie_urls = [ + "https://douyin.com", + self._host, + "https://creator.douyin.com", + "https://douhot.douyin.com", + "https://live.douyin.com", + ] self.playwright_page = playwright_page self.cookie_dict = cookie_dict # Initialize proxy pool (from ProxyRefreshMixin) @@ -145,11 +152,17 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin): if local_storage.get("HasUserLogin", "") == "1": return True - _, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + _, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=self.cookie_urls, + ) return cookie_dict.get("LOGIN_STATUS") == "1" - async def update_cookies(self, browser_context: BrowserContext): - cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None): + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=urls or self.cookie_urls, + ) self.headers["Cookie"] = cookie_str self.cookie_dict = cookie_dict diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index 66dfeb4..c0e9372 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -54,6 +54,13 @@ class DouYinCrawler(AbstractCrawler): def __init__(self) -> None: self.index_url = "https://www.douyin.com" + self.cookie_urls = [ + "https://douyin.com", + self.index_url, + "https://creator.douyin.com", + "https://douhot.douyin.com", + "https://live.douyin.com", + ] self.cdp_manager = None self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh @@ -100,7 +107,10 @@ class DouYinCrawler(AbstractCrawler): cookie_str=config.COOKIES, ) await login_obj.begin() - await self.dy_client.update_cookies(browser_context=self.browser_context) + await self.dy_client.update_cookies( + browser_context=self.browser_context, + urls=self.cookie_urls, + ) crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": # Search for notes and retrieve their comment information. @@ -298,7 +308,10 @@ class DouYinCrawler(AbstractCrawler): async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient: """Create douyin client""" - cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + self.browser_context, + urls=self.cookie_urls, + ) # type: ignore douyin_client = DouYinClient( proxy=httpx_proxy, headers={ diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index 6cd18cb..2de2405 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -56,6 +56,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin): self.headers = headers self._host = "https://www.kuaishou.com/graphql" self._rest_host = "https://www.kuaishou.com" + self.cookie_urls = [self._rest_host] self.playwright_page = playwright_page self.cookie_dict = cookie_dict self.graphql = KuaiShouGraphQL() @@ -133,8 +134,11 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin): ping_flag = False return ping_flag - async def update_cookies(self, browser_context: BrowserContext): - cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None): + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=urls or self.cookie_urls, + ) self.headers["Cookie"] = cookie_str self.cookie_dict = cookie_dict diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index a7c7c27..40e9b4e 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -56,6 +56,7 @@ class KuaishouCrawler(AbstractCrawler): def __init__(self): self.index_url = "https://www.kuaishou.com" + self.cookie_urls = [self.index_url] self.user_agent = utils.get_user_agent() self.cdp_manager = None self.ip_proxy_pool = None # Proxy IP pool, used for automatic proxy refresh @@ -107,7 +108,8 @@ class KuaishouCrawler(AbstractCrawler): ) await login_obj.begin() await self.ks_client.update_cookies( - browser_context=self.browser_context + browser_context=self.browser_context, + urls=self.cookie_urls, ) crawler_type_var.set(config.CRAWLER_TYPE) @@ -296,7 +298,8 @@ class KuaishouCrawler(AbstractCrawler): time.sleep(20) await self.context_page.goto(f"{self.index_url}?isHome=1") await self.ks_client.update_cookies( - browser_context=self.browser_context + browser_context=self.browser_context, + urls=self.cookie_urls, ) async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient: @@ -304,8 +307,9 @@ class KuaishouCrawler(AbstractCrawler): utils.logger.info( "[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ..." ) - cookie_str, cookie_dict = utils.convert_cookies( - await self.browser_context.cookies() + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + self.browser_context, + urls=self.cookie_urls, ) ks_client_obj = KuaiShouClient( proxy=httpx_proxy, diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index 79cffb5..5315294 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -54,6 +54,7 @@ class BaiduTieBaClient(AbstractApiClient): "Cookie": "", } self._host = "https://tieba.baidu.com" + self.cookie_urls = [self._host] self._page_extractor = TieBaExtractor() self.default_ip_proxy = default_ip_proxy self.playwright_page = playwright_page # Playwright page object @@ -209,7 +210,10 @@ class BaiduTieBaClient(AbstractApiClient): try: # Get cookies from browser and check key login cookies - _, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + _, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=self.cookie_urls, + ) # Baidu Tieba login identifiers: STOKEN or PTOKEN stoken = cookie_dict.get("STOKEN") @@ -227,7 +231,7 @@ class BaiduTieBaClient(AbstractApiClient): utils.logger.error(f"[BaiduTieBaClient.pong] Check login state failed: {e}, assume not logged in") return False - async def update_cookies(self, browser_context: BrowserContext): + async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None): """ Update cookies method provided by API client, usually called after successful login Args: @@ -236,7 +240,10 @@ class BaiduTieBaClient(AbstractApiClient): Returns: """ - cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=urls or self.cookie_urls, + ) self.headers["Cookie"] = cookie_str utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated") diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index 74d4816..8fbdb0a 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -54,6 +54,7 @@ class TieBaCrawler(AbstractCrawler): def __init__(self) -> None: self.index_url = "https://tieba.baidu.com" + self.cookie_urls = [self.index_url] self.user_agent = utils.get_user_agent() self._page_extractor = TieBaExtractor() self.cdp_manager = None @@ -123,7 +124,10 @@ class TieBaCrawler(AbstractCrawler): cookie_str=config.COOKIES, ) await login_obj.begin() - await self.tieba_client.update_cookies(browser_context=self.browser_context) + await self.tieba_client.update_cookies( + browser_context=self.browser_context, + urls=self.cookie_urls, + ) crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": @@ -560,7 +564,10 @@ class TieBaCrawler(AbstractCrawler): user_agent = await self.context_page.evaluate("() => navigator.userAgent") utils.logger.info(f"[TieBaCrawler.create_tieba_client] Extracted User-Agent from browser: {user_agent}") - cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + self.browser_context, + urls=self.cookie_urls, + ) # Build complete browser request headers, simulating real browser behavior tieba_client = BaiduTieBaClient( diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py index dac7de1..21f8d1c 100644 --- a/media_platform/weibo/client.py +++ b/media_platform/weibo/client.py @@ -62,6 +62,7 @@ class WeiboClient(ProxyRefreshMixin): self.timeout = timeout self.headers = headers self._host = "https://m.weibo.cn" + self.cookie_urls = [self._host] self.playwright_page = playwright_page self.cookie_dict = cookie_dict self._image_agent_host = "https://i1.wp.com/" @@ -137,17 +138,16 @@ class WeiboClient(ProxyRefreshMixin): :param urls: Optional list of URLs to filter cookies (e.g., ["https://m.weibo.cn"]) If provided, only cookies for these URLs will be retrieved """ - if urls: - cookies = await browser_context.cookies(urls=urls) - utils.logger.info(f"[WeiboClient.update_cookies] Updating cookies for specific URLs: {urls}") - else: - cookies = await browser_context.cookies() - utils.logger.info("[WeiboClient.update_cookies] Updating all cookies") - - cookie_str, cookie_dict = utils.convert_cookies(cookies) + cookie_urls = urls or self.cookie_urls + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=cookie_urls, + ) self.headers["Cookie"] = cookie_str self.cookie_dict = cookie_dict - utils.logger.info(f"[WeiboClient.update_cookies] Cookie updated successfully, total: {len(cookie_dict)} cookies") + utils.logger.info( + f"[WeiboClient.update_cookies] Cookie updated successfully for {cookie_urls}, total: {len(cookie_dict)} cookies" + ) async def get_note_by_keyword( self, diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index 8cb56e3..4f34da5 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -60,6 +60,7 @@ class WeiboCrawler(AbstractCrawler): def __init__(self): self.index_url = "https://www.weibo.com" self.mobile_index_url = "https://m.weibo.cn" + self.cookie_urls = [self.mobile_index_url] self.user_agent = utils.get_user_agent() self.mobile_user_agent = utils.get_mobile_user_agent() self.cdp_manager = None @@ -116,7 +117,7 @@ class WeiboCrawler(AbstractCrawler): # Only get mobile cookies to avoid confusion between PC and mobile cookies await self.wb_client.update_cookies( browser_context=self.browser_context, - urls=[self.mobile_index_url] + urls=self.cookie_urls, ) crawler_type_var.set(config.CRAWLER_TYPE) @@ -338,7 +339,10 @@ class WeiboCrawler(AbstractCrawler): async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient: """Create xhs client""" utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...") - cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies(urls=[self.mobile_index_url])) + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + self.browser_context, + urls=self.cookie_urls, + ) weibo_client_obj = WeiboClient( proxy=httpx_proxy, headers={ diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 0630e28..1e07b08 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -63,6 +63,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): else: self._host = "https://edith.xiaohongshu.com" self._domain = "https://www.xiaohongshu.com" + self.cookie_urls = [self._domain] self.IP_ERROR_STR = "Network connection error, please check network settings or restart" self.IP_ERROR_CODE = 300012 self.NOTE_NOT_FOUND_CODE = -510000 @@ -260,7 +261,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): utils.logger.info(f"[XiaoHongShuClient.pong] Login state result: {ping_flag}") return ping_flag - async def update_cookies(self, browser_context: BrowserContext): + async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None): """ Update cookies method provided by API client, usually called after successful login Args: @@ -269,7 +270,10 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): Returns: """ - cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=urls or self.cookie_urls, + ) self.headers["Cookie"] = cookie_str self.cookie_dict = cookie_dict diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index ebd8f1b..83e0b0b 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -56,6 +56,7 @@ class XiaoHongShuCrawler(AbstractCrawler): def __init__(self) -> None: self.index_url = "https://www.rednote.com" if config.XHS_INTERNATIONAL else "https://www.xiaohongshu.com" + self.cookie_urls = [self.index_url] # self.user_agent = utils.get_user_agent() self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36" self.cdp_manager = None @@ -105,7 +106,10 @@ class XiaoHongShuCrawler(AbstractCrawler): cookie_str=config.COOKIES, ) await login_obj.begin() - await self.xhs_client.update_cookies(browser_context=self.browser_context) + await self.xhs_client.update_cookies( + browser_context=self.browser_context, + urls=self.cookie_urls, + ) crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": @@ -356,8 +360,9 @@ class XiaoHongShuCrawler(AbstractCrawler): async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient: """Create Xiaohongshu client""" utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create Xiaohongshu API client ...") - cookie_str, cookie_dict = utils.convert_cookies( - await self.browser_context.cookies(self.index_url) + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + self.browser_context, + urls=self.cookie_urls, ) xhs_client_obj = XiaoHongShuClient( proxy=httpx_proxy, diff --git a/media_platform/zhihu/client.py b/media_platform/zhihu/client.py index 4f62cf1..7e11991 100644 --- a/media_platform/zhihu/client.py +++ b/media_platform/zhihu/client.py @@ -59,6 +59,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin): self.proxy = proxy self.timeout = timeout self.default_headers = headers + self.cookie_urls = ["https://www.zhihu.com"] self.cookie_dict = cookie_dict self._extractor = ZhihuExtractor() # Initialize proxy pool (from ProxyRefreshMixin) @@ -160,7 +161,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin): ping_flag = False return ping_flag - async def update_cookies(self, browser_context: BrowserContext): + async def update_cookies(self, browser_context: BrowserContext, urls: Optional[list[str]] = None): """ Update cookies method provided by API client, typically called after successful login Args: @@ -169,7 +170,10 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin): Returns: """ - cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=urls or self.cookie_urls, + ) self.default_headers["cookie"] = cookie_str self.cookie_dict = cookie_dict diff --git a/media_platform/zhihu/core.py b/media_platform/zhihu/core.py index 2f73161..a7ea9df 100644 --- a/media_platform/zhihu/core.py +++ b/media_platform/zhihu/core.py @@ -57,6 +57,7 @@ class ZhihuCrawler(AbstractCrawler): def __init__(self) -> None: self.index_url = "https://www.zhihu.com" + self.cookie_urls = [self.index_url] # self.user_agent = utils.get_user_agent() self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36" self._extractor = ZhihuExtractor() @@ -114,7 +115,8 @@ class ZhihuCrawler(AbstractCrawler): ) await login_obj.begin() await self.zhihu_client.update_cookies( - browser_context=self.browser_context + browser_context=self.browser_context, + urls=self.cookie_urls, ) # Zhihu's search API requires opening the search page first to access cookies, homepage alone won't work @@ -125,7 +127,10 @@ class ZhihuCrawler(AbstractCrawler): f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content" ) await asyncio.sleep(5) - await self.zhihu_client.update_cookies(browser_context=self.browser_context) + await self.zhihu_client.update_cookies( + browser_context=self.browser_context, + urls=self.cookie_urls, + ) crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": @@ -393,8 +398,9 @@ class ZhihuCrawler(AbstractCrawler): utils.logger.info( "[ZhihuCrawler.create_zhihu_client] Begin create zhihu API client ..." ) - cookie_str, cookie_dict = utils.convert_cookies( - await self.browser_context.cookies() + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + self.browser_context, + urls=self.cookie_urls, ) zhihu_client_obj = ZhiHuClient( proxy=httpx_proxy, diff --git a/test/test_utils.py b/test/test_utils.py index f2f0462..cd0b08a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -20,6 +20,10 @@ # -*- coding: utf-8 -*- +from unittest.mock import AsyncMock + +import pytest + from tools import utils @@ -28,3 +32,18 @@ def test_convert_cookies(): cookie_dict = utils.convert_str_cookie_to_dict(xhs_cookies) assert cookie_dict.get("webId") == "1190c4d3cxxxx125xxx" assert cookie_dict.get("a1") == "x000101360" + + +@pytest.mark.asyncio +async def test_convert_browser_context_cookies_uses_url_filter(): + browser_context = AsyncMock() + browser_context.cookies.return_value = [{"name": "sessionid", "value": "abc"}] + + cookie_str, cookie_dict = await utils.convert_browser_context_cookies( + browser_context, + urls=["https://www.douyin.com"], + ) + + browser_context.cookies.assert_awaited_once_with(urls=["https://www.douyin.com"]) + assert cookie_str == "sessionid=abc" + assert cookie_dict == {"sessionid": "abc"} diff --git a/tools/crawler_util.py b/tools/crawler_util.py index aca38d5..c816377 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -34,7 +34,7 @@ from typing import Dict, List, Optional, Tuple, cast import httpx from PIL import Image, ImageDraw, ImageShow -from playwright.async_api import Cookie, Page +from playwright.async_api import BrowserContext, Cookie, Page from . import utils from .httpx_util import make_async_client @@ -145,6 +145,17 @@ def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]: return cookies_str, cookie_dict +async def convert_browser_context_cookies( + browser_context: BrowserContext, urls: Optional[List[str]] = None +) -> Tuple[str, Dict]: + cookies = ( + await browser_context.cookies(urls=urls) + if urls + else await browser_context.cookies() + ) + return convert_cookies(cookies) + + def convert_str_cookie_to_dict(cookie_str: str) -> Dict: cookie_dict: Dict[str, str] = dict() if not cookie_str: