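refactor: xhs update — enable CDP mode by default, drop config.UA and the webId cookie, fetch note detail via API first with HTML fallback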

2026-05-30 14:37:25 +08:00 · 2025-07-21 21:26:16 +08:00
parent 26a43358cb
commit a4d9aaa34a
5 changed files with 44 additions and 78 deletions
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -39,7 +39,7 @@ SAVE_LOGIN_STATE = True
 # 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取，提供更好的反检测能力
 # 启用后将自动检测并启动用户的Chrome/Edge浏览器，通过CDP协议进行控制
 # 这种方式使用真实的浏览器环境，包括用户的扩展、Cookie和设置，大大降低被检测的风险
-ENABLE_CDP_MODE = False
+ENABLE_CDP_MODE = True

 # CDP调试端口，用于与浏览器通信
 # 如果端口被占用，系统会自动尝试下一个可用端口
--- a/config/xhs_config.py
+++ b/config/xhs_config.py
@@ -15,9 +15,6 @@
 # 排序方式，具体的枚举值在media_platform/xhs/field.py中
 SORT_TYPE = "popularity_descending"

-# 用户代理，xhs自定义User-Agent
-UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
-
 # 指定笔记URL列表, 必须要携带xsec_token参数
 XHS_SPECIFIED_NOTE_URL_LIST = [
    "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
--- a/main.py
+++ b/main.py
@@ -67,7 +67,8 @@ async def main():

 def cleanup():
    if crawler:
-        asyncio.run(crawler.close())
+        # asyncio.run(crawler.close())
+        pass
    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
        asyncio.run(db.close())

--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -95,7 +95,6 @@ class XiaoHongShuClient(AbstractApiClient):
        """
        # return response.text
        return_response = kwargs.pop("return_response", False)
-
        async with httpx.AsyncClient(proxies=self.proxies) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)

@@ -103,9 +102,9 @@ class XiaoHongShuClient(AbstractApiClient):
            # someday someone maybe will bypass captcha
            verify_type = response.headers["Verifytype"]
            verify_uuid = response.headers["Verifyuuid"]
-            raise Exception(
-                f"出现验证码，请求失败，Verifytype: {verify_type}，Verifyuuid: {verify_uuid}, Response: {response}"
-            )
+            msg = f"出现验证码，请求失败，Verifytype: {verify_type}，Verifyuuid: {verify_uuid}, Response: {response}"
+            utils.logger.error(msg)
+            raise Exception(msg)

        if return_response:
            return response.text
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -51,11 +51,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
    def __init__(self) -> None:
        self.index_url = "https://www.xiaohongshu.com"
        # self.user_agent = utils.get_user_agent()
-        self.user_agent = (
-            config.UA
-            if config.UA
-            else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
-        )
+        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
        self.cdp_manager = None

    async def start(self) -> None:
@@ -91,17 +87,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
                )
            # stealth.min.js is a js script to prevent the website from detecting the crawler.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
-            # add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
-            await self.browser_context.add_cookies(
-                [
-                    {
-                        "name": "webId",
-                        "value": "xxx123",  # any value
-                        "domain": ".xiaohongshu.com",
-                        "path": "/",
-                    }
-                ]
-            )
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.index_url)

@@ -152,7 +137,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
            page = 1
            search_id = get_search_id()
            while (
-                page - start_page + 1
+                    page - start_page + 1
            ) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
@@ -294,11 +279,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
        await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)

    async def get_note_detail_async_task(
-        self,
-        note_id: str,
-        xsec_source: str,
-        xsec_token: str,
-        semaphore: asyncio.Semaphore,
+            self,
+            note_id: str,
+            xsec_source: str,
+            xsec_token: str,
+            semaphore: asyncio.Semaphore,
    ) -> Optional[Dict]:
        """Get note detail

@@ -311,47 +296,31 @@ class XiaoHongShuCrawler(AbstractCrawler):
        Returns:
            Dict: note detail
        """
-        note_detail_from_html, note_detail_from_api = None, None
+        note_detail = None
        async with semaphore:
-            # When proxy is not enabled, increase the crawling interval
-            if config.ENABLE_IP_PROXY:
-                crawl_interval = random.random()
-            else:
-                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
            try:
                utils.logger.info(
                    f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
                )
-                # 尝试直接获取网页版笔记详情，携带cookie
-                note_detail_from_html: Optional[Dict] = (
-                    await self.xhs_client.get_note_by_id_from_html(
-                        note_id, xsec_source, xsec_token, enable_cookie=True
+
+                try:
+                    note_detail = await self.xhs_client.get_note_by_id(
+                        note_id, xsec_source, xsec_token
                    )
+                except RetryError as e:
+                    pass
+
+                if not note_detail:
+                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
+                                                                                 enable_cookie=False)
+                    if not note_detail:
+                        raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
+
+                note_detail.update(
+                    {"xsec_token": xsec_token, "xsec_source": xsec_source}
                )
-                time.sleep(crawl_interval)
-                if not note_detail_from_html:
-                    # 如果网页版笔记详情获取失败，则尝试不使用cookie获取
-                    note_detail_from_html = (
-                        await self.xhs_client.get_note_by_id_from_html(
-                            note_id, xsec_source, xsec_token, enable_cookie=False
-                        )
-                    )
-                    utils.logger.error(
-                        f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}"
-                    )
-                if not note_detail_from_html:
-                    # 如果网页版笔记详情获取失败，则尝试API获取
-                    note_detail_from_api: Optional[Dict] = (
-                        await self.xhs_client.get_note_by_id(
-                            note_id, xsec_source, xsec_token
-                        )
-                    )
-                note_detail = note_detail_from_html or note_detail_from_api
-                if note_detail:
-                    note_detail.update(
-                        {"xsec_token": xsec_token, "xsec_source": xsec_source}
-                    )
-                    return note_detail
+                return note_detail
+
            except DataFetchError as ex:
                utils.logger.error(
                    f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
@@ -364,7 +333,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
                return None

    async def batch_get_note_comments(
-        self, note_list: List[str], xsec_tokens: List[str]
+            self, note_list: List[str], xsec_tokens: List[str]
    ):
        """Batch get note comments"""
        if not config.ENABLE_GET_COMMENTS:
@@ -389,7 +358,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
        await asyncio.gather(*task_list)

    async def get_comments(
-        self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
+            self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
    ):
        """Get note comments with keyword filtering and quantity limitation"""
        async with semaphore:
@@ -411,7 +380,7 @@ class XiaoHongShuCrawler(AbstractCrawler):

    @staticmethod
    def format_proxy_info(
-        ip_proxy_info: IpInfoModel,
+            ip_proxy_info: IpInfoModel,
    ) -> Tuple[Optional[Dict], Optional[Dict]]:
        """format proxy info for playwright and httpx"""
        playwright_proxy = {
@@ -447,11 +416,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
        return xhs_client_obj

    async def launch_browser(
-        self,
-        chromium: BrowserType,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
+            self,
+            chromium: BrowserType,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True,
    ) -> BrowserContext:
        """Launch browser and create browser context"""
        utils.logger.info(
@@ -480,11 +449,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
            return browser_context

    async def launch_browser_with_cdp(
-        self,
-        playwright: Playwright,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
+            self,
+            playwright: Playwright,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True,
    ) -> BrowserContext:
        """
        使用CDP模式启动浏览器