feat: 小红书增加手机号自动登录模式

2026-06-09 03:17:25 +08:00 · 2023-06-16 19:35:43 +08:00
parent c0a783fa35
commit 8206f83639
11 changed files with 347 additions and 71 deletions
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -163,13 +163,15 @@ class XHSClient:
        }
        return await self.get(uri, params)

-    async def get_note_all_comments(self, note_id: str, crawl_interval: int = 1):
-        """get note all comments include sub comments
-
-        :param crawl_interval:
-        :param note_id: note id you want to fetch
-        :type note_id: str
+    async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False):
        """
+        Get all comments of a note; sub comments are included only when is_fetch_sub_comments is True.
+        :param note_id:
+        :param crawl_interval:
+        :param is_fetch_sub_comments:
+        :return:
+        """
+
        result = []
        comments_has_more = True
        comments_cursor = ""
@@ -178,6 +180,10 @@ class XHSClient:
            comments_has_more = comments_res.get("has_more", False)
            comments_cursor = comments_res.get("cursor", "")
            comments = comments_res["comments"]
+            if not is_fetch_sub_comments:
+                result.extend(comments)
+                continue
+            # handle get sub comments
            for comment in comments:
                result.append(comment)
                cur_sub_comment_count = int(comment["sub_comment_count"])
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -1,32 +1,45 @@
 import sys
+import random
 import asyncio
+from asyncio import Task
 from typing import Optional, List, Dict

+import aioredis
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_fixed,
+    retry_if_result
+)
 from playwright.async_api import Page
 from playwright.async_api import Cookie
 from playwright.async_api import BrowserContext
 from playwright.async_api import async_playwright

 import utils
+import config
 from .client import XHSClient
 from base_crawler import Crawler
+from models import xhs as xhs_model


 class XiaoHongShuCrawler(Crawler):
    def __init__(self):
+        self.login_phone = None
+        self.login_type = None
        self.keywords = None
-        self.scan_qrcode_time = None
        self.cookies: Optional[List[Cookie]] = None
        self.browser_context: Optional[BrowserContext] = None
        self.context_page: Optional[Page] = None
        self.proxy: Optional[Dict] = None
        self.user_agent = utils.get_user_agent()
        self.xhs_client: Optional[XHSClient] = None
-        self.login_url = "https://www.xiaohongshu.com"
-        self.scan_qrcode_time = 20  # second
+        self.index_url = "https://www.xiaohongshu.com"

    def init_config(self, **kwargs):
        self.keywords = kwargs.get("keywords")
+        self.login_type = kwargs.get("login_type")
+        self.login_phone = kwargs.get("login_phone")

    async def update_cookies(self):
        self.cookies = await self.browser_context.cookies()
@@ -35,7 +48,7 @@ class XiaoHongShuCrawler(Crawler):
        async with async_playwright() as playwright:
            # launch browser and create single browser context
            chromium = playwright.chromium
-            browser = await chromium.launch(headless=True)
+            browser = await chromium.launch(headless=False)
            self.browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=self.user_agent,
@@ -45,7 +58,7 @@ class XiaoHongShuCrawler(Crawler):
            # execute JS to bypass anti automation/crawler detection
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
-            await self.context_page.goto(self.login_url)
+            await self.context_page.goto(self.index_url)

            # scan qrcode login
            await self.login()
@@ -67,59 +80,106 @@ class XiaoHongShuCrawler(Crawler):
            )

            # Search for notes and retrieve their comment information.
-            note_res = await self.search_posts()
-            for post_item in note_res.get("items"):
-                note_id = post_item.get("id")
-                await self.get_comments(note_id=note_id)
-                await asyncio.sleep(1)
+            await self.search_posts()

            # block main crawler coroutine
            await asyncio.Event().wait()

    async def login(self):
        """login xiaohongshu website and keep webdriver login state"""
-        print("Begin login xiaohongshu ...")
+        # There are two ways to log in, selected by self.login_type:
+        # 1. "qrcode" (semi-automatic): a person scans the QR code shown by the crawler.
+        # 2. "phone" (fully automatic): the phone number is filled in and the SMS verification
+        #    code is read from Redis, where a forwarded text message notification stores it.
+        if self.login_type == "qrcode":
+            await self.login_by_qrcode()
+        elif self.login_type == "phone":
+            await self.login_by_mobile()
+        else:
+            pass
+
+    async def login_by_mobile(self):
+        print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
+        login_container_ele = await self.context_page.wait_for_selector("div.login-container")
+        # Fill login phone
+        input_ele = await login_container_ele.query_selector("label.phone > input")
+        await input_ele.fill(self.login_phone)
+        await asyncio.sleep(0.5)
+
+        # Click to send verification code and fill it from redis server.
+        send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
+        await send_btn_ele.click()
+        sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
+        submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
+        redis_obj = aioredis.from_url(url=config.redis_db_host, password=config.redis_db_pwd, decode_responses=True)
+        max_get_sms_code_time = 60 * 2
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        no_logged_in_session = cookie_dict.get("web_session")
+        while max_get_sms_code_time > 0:
+            print(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
+            await asyncio.sleep(1)
+            sms_code_key = f"xhs_{self.login_phone}"
+            sms_code_value = await redis_obj.get(sms_code_key)
+            if not sms_code_value:
+                max_get_sms_code_time -= 1
+                continue
+
+            await sms_code_input_ele.fill(value=sms_code_value)  # Enter SMS verification code.
+            await asyncio.sleep(0.5)
+            agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
+            await agree_privacy_ele.click()  # Click "Agree" to the privacy policy.
+            await asyncio.sleep(0.5)
+
+            await submit_btn_ele.click()  # Click login button
+            # todo ... It is necessary to check the correctness of the verification code,
+            #  as it is possible that the entered verification code is incorrect.
+            break
+
+        login_flag: bool = await self.check_login_state(no_logged_in_session)
+        if not login_flag:
+            print("login failed please confirm sms code ...")
+            sys.exit()
+
+        wait_redirect_seconds = 5
+        print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        await asyncio.sleep(wait_redirect_seconds)
+
+    async def login_by_qrcode(self):
+        """login xiaohongshu website and keep webdriver login state"""
+        print("Start scanning QR code to log in to Xiaohongshu. ...")

        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector="div.login-container > div.left > div.qrcode > img"
        )
-        current_cookie = await self.browser_context.cookies()
-        _, cookie_dict = utils.convert_cookies(current_cookie)
-        no_logged_in_session = cookie_dict.get("web_session")
        if not base64_qrcode_img:
-
-            if await self.check_login_state(no_logged_in_session):
-                return
            # todo ...if this website does not automatically popup login dialog box, we will manual click login button
            print("login failed , have not found qrcode please check ....")
            sys.exit()

+        # Record the web_session cookie value from before login; it is passed to check_login_state below.
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        no_logged_in_session = cookie_dict.get("web_session")
+
        # show login qrcode
        utils.show_qrcode(base64_qrcode_img)
-
-        while self.scan_qrcode_time > 0:
-            await asyncio.sleep(1)
-            self.scan_qrcode_time -= 1
-            print(f"waiting for scan code login, remaining time is {self.scan_qrcode_time} seconds")
-            # get login state from browser
-            if await self.check_login_state(no_logged_in_session):
-                # If the QR code login is successful, you need to wait for a moment.
-                # Because there will be a second redirection after successful login
-                # executing JS during this period may be performed in a Page that has already been destroyed.
-                wait_for_seconds = 5
-                print(f"Login successful then wait for {wait_for_seconds} seconds redirect ...")
-                while wait_for_seconds > 0:
-                    await asyncio.sleep(1)
-                    print(f"remaining wait {wait_for_seconds} seconds ...")
-                    wait_for_seconds -= 1
-                break
-        else:
+        print(f"waiting for scan code login, remaining time is 20s")
+        login_flag: bool = await self.check_login_state(no_logged_in_session)
+        if not login_flag:
+            print("login failed please confirm ...")
            sys.exit()

+        wait_redirect_seconds = 5
+        print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        await asyncio.sleep(wait_redirect_seconds)
+
+    @retry(stop=stop_after_attempt(30), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self, no_logged_in_session: str) -> bool:
        """Check if the current login status is successful and return True otherwise return False"""
+        # Retried every second while the result is False; after 30 failed attempts tenacity raises RetryError instead of returning False.
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        current_web_session = cookie_dict.get("web_session")
@@ -128,26 +188,37 @@ class XiaoHongShuCrawler(Crawler):
        return False

    async def search_posts(self):
-        # This function only retrieves the first 10 note
-        # And you can continue to make requests to obtain more by checking the boolean status of "has_more".
-        print("Begin search xiaohongshu keywords: ", self.keywords)
-        posts_res = await self.xhs_client.get_note_by_keyword(keyword=self.keywords)
-        for post_item in posts_res.get("items"):
-            note_id = post_item.get("id")
-            title = post_item.get("note_card", {}).get("display_title")
-            print(f"Note ID:{note_id}; Title:{title}")
-            # todo record note or save to db or csv
-        return posts_res
+        print("Begin search xiaohongshu keywords")
+        # Only a single keyword is handled for now; it is wrapped in a list so that a batch of keywords can be supported later.
+        for keyword in [self.keywords]:
+            note_list: List[str] = []
+            max_note_len = 10
+            page = 1
+            while max_note_len > 0:
+                posts_res = await self.xhs_client.get_note_by_keyword(
+                    keyword=keyword,
+                    page=page,
+                )
+                page += 1
+                for post_item in posts_res.get("items"):
+                    max_note_len -= 1
+                    note_id = post_item.get("id")
+                    note_detail = await self.xhs_client.get_note_by_id(note_id)
+                    await xhs_model.update_xhs_note(note_detail)
+                    await asyncio.sleep(0.05)
+                    note_list.append(note_id)
+            print(f"keyword:{keyword}, note_list:{note_list}")
+            await self.batch_get_note_comments(note_list)
+
+    async def batch_get_note_comments(self, note_list: List[str]):
+        task_list: List[Task] = []
+        for note_id in note_list:
+            task = asyncio.create_task(self.get_comments(note_id), name=note_id)
+            task_list.append(task)
+        await asyncio.wait(task_list)

    async def get_comments(self, note_id: str):
-        # This function only retrieves the first 10 comments
-        # And you can continue to make requests to obtain more by checking the boolean status of "has_more".
        print("Begin get note id comments ", note_id)
-        res = await self.xhs_client.get_note_comments(note_id=note_id)
-        # res = await self.xhs_client.get_note_all_comments(note_id=note_id)
-        for comment in res.get("comments"):
-            nick_name = comment.get("user_info").get("nickname")
-            comment_content = comment.get("content")
-            print(f"Nickname：{nick_name}; Comment content：{comment_content}")
-            # todo save to db or csv
-        return res
+        all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
+        for comment in all_comments:
+            await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)