feat: 百度贴吧架子 & 登录done

2026-06-07 18:37:30 +08:00 · 2024-08-05 18:51:51 +08:00
parent 1c2237a66f
commit a87094f2fd
11 changed files with 1058 additions and 4 deletions
--- a/media_platform/tieba/init.py
+++ b/media_platform/tieba/init.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+from .core import TieBaCrawler
--- a/media_platform/tieba/client.py
+++ b/media_platform/tieba/client.py
@@ -0,0 +1,169 @@
+import asyncio
+import json
+import re
+from typing import Any, Callable, Dict, List, Optional, Union
+from urllib.parse import urlencode
+
+import httpx
+from playwright.async_api import BrowserContext, Page
+
+import config
+from base.base_crawler import AbstractApiClient
+from tools import utils
+
+from .field import SearchNoteType, SearchSortType
+
+
+class BaiduTieBaClient(AbstractApiClient):
+    def __init__(
+            self,
+            timeout=10,
+            proxies=None,
+            *,
+            headers: Dict[str, str],
+            playwright_page: Page,
+            cookie_dict: Dict[str, str],
+    ):
+        """
+        Keep the settings that later requests are built from.
+        Args:
+            timeout: timeout handed to every httpx request
+            proxies: proxy configuration handed to httpx.AsyncClient
+            headers: headers sent with every get()/post() call
+            playwright_page: Playwright page stored on the client
+            cookie_dict: cookies as a name -> value mapping
+        """
+        # Every URI given to get()/post() is appended to this base URL.
+        self._host = "https://tieba.baidu.com"
+        self.headers = headers
+        self.cookie_dict = cookie_dict
+        self.playwright_page = playwright_page
+        self.timeout, self.proxies = timeout, proxies
+
+    async def request(self, method, url, **kwargs) -> Union[str, Any]:
+        """
+        Shared httpx request helper used by get() and post().
+        Args:
+            method: HTTP method name
+            url: absolute URL to request
+            **kwargs: forwarded to httpx (headers, body, ...). The extra key
+                ``return_response`` is consumed here and never reaches httpx.
+
+        Returns:
+            The raw response text when ``return_response`` is truthy,
+            otherwise the response body decoded as JSON.
+        """
+        want_text = kwargs.pop('return_response', False)
+
+        # A fresh client is opened (and closed) for every single request.
+        async with httpx.AsyncClient(proxies=self.proxies) as http_client:
+            resp = await http_client.request(
+                method, url, timeout=self.timeout,
+                **kwargs
+            )
+
+        return resp.text if want_text else resp.json()
+
+    async def get(self, uri: str, params=None) -> Dict:
+        """
+        Send a GET request to the Tieba host using the client's headers.
+        Args:
+            uri: route, appended to the host
+            params: query parameters; only url-encoded and appended when
+                a dict is given, anything else is ignored
+
+        Returns:
+            Whatever request() returns for the call.
+        """
+        target_url = f"{self._host}{uri}"
+        if isinstance(params, dict):
+            target_url = f"{target_url}?{urlencode(params)}"
+        return await self.request(method="GET", url=target_url, headers=self.headers)
+
+    async def post(self, uri: str, data: dict) -> Dict:
+        """
+        Send a POST request to the Tieba host using the client's headers.
+        Args:
+            uri: route, appended to the host
+            data: body parameters, serialized to a compact JSON string
+
+        Returns:
+            Whatever request() returns for the call.
+        """
+        # Compact separators and ensure_ascii=False: no extra whitespace,
+        # non-ASCII characters are sent as-is instead of \u escapes.
+        payload = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
+        endpoint = f"{self._host}{uri}"
+        return await self.request(method="POST", url=endpoint,
+                                  data=payload, headers=self.headers)
+
+    async def pong(self) -> bool:
+        """
+        Check whether the current login state is still valid.
+        Returns:
+            True when the sync endpoint answers with ``no == 0``; False when
+            it answers anything else or when the request raises.
+        """
+        utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
+        try:
+            sync_res: Dict = await self.get("/mo/q/sync")
+            # The check stays inside the try so a malformed response is
+            # reported through the same error path as a failed request.
+            if sync_res and sync_res.get("no") == 0:
+                return True
+            utils.logger.info("[BaiduTieBaClient.pong] user not login, will try to login again...")
+            return False
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
+            return False
+
+    async def update_cookies(self, browser_context: BrowserContext):
+        """
+        Refresh the client's cookies from the browser; the crawler calls
+        this after a login has completed.
+        Args:
+            browser_context: browser context whose cookies are copied
+
+        Returns:
+            None. Updates ``headers["Cookie"]`` and ``cookie_dict`` in place.
+        """
+        browser_cookies = await browser_context.cookies()
+        self.headers["Cookie"], self.cookie_dict = utils.convert_cookies(browser_cookies)
+
+    async def get_note_by_keyword(
+            self, keyword: str,
+            page: int = 1,
+            page_size: int = 10,
+            sort: SearchSortType = SearchSortType.TIME_DESC,
+            note_type: SearchNoteType = SearchNoteType.FIXED_THREAD
+    ) -> Dict:
+        """
+        Search Tieba posts by keyword (not implemented yet).
+        Args:
+            keyword: search keyword
+            page: page number to fetch
+            page_size: number of results per page
+            sort: ordering of the results
+            note_type: post type (main threads only, or threads mixed with replies)
+
+        Returns:
+            Currently always an empty dict.
+        """
+        # TODO: implement the search request
+        return {}
+
+    async def get_note_by_id(self, note_id: str) -> Dict:
+        """
+        Fetch the detail of one post by its ID (not implemented yet).
+        Args:
+            note_id: ID of the post
+
+        Returns:
+            Currently always an empty dict.
+        """
+        # TODO: implement the detail request
+        return {}
+
+    async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
+                                    callback: Optional[Callable] = None) -> List[Dict]:
+        """
+        Intended to collect every first-level comment under a post by paging
+        until none are left (not implemented yet).
+        Args:
+            note_id: ID of the post
+            crawl_interval: delay between two crawl rounds, in seconds
+            callback: hook meant to run after each crawl round
+
+        Returns:
+            Currently always an empty list.
+        """
+        # TODO: implement the comment paging
+        return []
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -0,0 +1,265 @@
+import asyncio
+import os
+import random
+from asyncio import Task
+from typing import Dict, List, Optional, Tuple
+
+from playwright.async_api import (BrowserContext, BrowserType, Page,
+                                  async_playwright)
+
+import config
+from base.base_crawler import AbstractCrawler
+from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import tieba as tieba_store
+from tools import utils
+from var import crawler_type_var
+
+from .client import BaiduTieBaClient
+from .field import SearchNoteType, SearchSortType
+from .login import BaiduTieBaLogin
+
+
+class TieBaCrawler(AbstractCrawler):
+    context_page: Page
+    tieba_client: BaiduTieBaClient
+    browser_context: BrowserContext
+
+    def __init__(self) -> None:
+        """Set the user agent and the landing URL used by the crawler."""
+        self.user_agent = utils.get_user_agent()
+        self.index_url = "https://tieba.baidu.com"
+
+    async def start(self) -> None:
+        """
+        Run one crawl: open the browser, make sure the session is logged in,
+        then dispatch on config.CRAWLER_TYPE ("search" or "detail"; any other
+        value does nothing).
+        """
+        playwright_proxy_format, httpx_proxy_format = None, None
+        if config.ENABLE_IP_PROXY:
+            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
+            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
+            playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info)
+
+        async with async_playwright() as playwright:
+            # Launch a browser context.
+            # NOTE(review): None is passed as the browser proxy, so
+            # playwright_proxy_format is computed but never used; only the
+            # httpx client below goes through the proxy. Confirm this is intended.
+            chromium = playwright.chromium
+            self.browser_context = await self.launch_browser(
+                chromium,
+                None,
+                self.user_agent,
+                headless=config.HEADLESS
+            )
+            # stealth.min.js is a js script to prevent the website from detecting the crawler.
+            await self.browser_context.add_init_script(path="libs/stealth.min.js")
+            self.context_page = await self.browser_context.new_page()
+            await self.context_page.goto(self.index_url)
+
+            # Create a client to interact with the baidutieba website.
+            self.tieba_client = await self.create_tieba_client(httpx_proxy_format)
+            if not await self.tieba_client.pong():
+                # The API client is not logged in: log in through the browser,
+                # then copy the fresh browser cookies into the client.
+                login_obj = BaiduTieBaLogin(
+                    login_type=config.LOGIN_TYPE,
+                    login_phone="",  # input your phone number
+                    browser_context=self.browser_context,
+                    context_page=self.context_page,
+                    cookie_str=config.COOKIES
+                )
+                await login_obj.begin()
+                await self.tieba_client.update_cookies(browser_context=self.browser_context)
+
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
+                # Search for notes and retrieve their comment information.
+                await self.search()
+            elif config.CRAWLER_TYPE == "detail":
+                # Get the information and comments of the specified post
+                await self.get_specified_notes()
+
+            # The message used to say "Xhs Crawler", a leftover from another platform.
+            utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
+
+    async def search(self) -> None:
+        """
+        Search every keyword in config.KEYWORDS page by page, store the
+        detail of each hit and then fetch the comments of the stored posts.
+        """
+        utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidutieba keywords")
+        page_size = 10  # tieba limit page fixed value
+        # The configured maximum is raised to at least one full page.
+        if config.CRAWLER_MAX_NOTES_COUNT < page_size:
+            config.CRAWLER_MAX_NOTES_COUNT = page_size
+        first_page = config.START_PAGE
+        # Result entries of these model types are skipped.
+        skipped_model_types = ('rec_query', 'hot_query')
+        for keyword in config.KEYWORDS.split(","):
+            utils.logger.info(f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}")
+            page_no = 1
+            while (page_no - first_page + 1) * page_size <= config.CRAWLER_MAX_NOTES_COUNT:
+                # Pages before the configured start page are only counted, never requested.
+                if page_no < first_page:
+                    utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {page_no}")
+                    page_no += 1
+                    continue
+                try:
+                    utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page_no}")
+                    search_res = await self.tieba_client.get_note_by_keyword(
+                        keyword=keyword,
+                        page=page_no,
+                        page_size=page_size,
+                        sort=SearchSortType.TIME_DESC,
+                        note_type=SearchNoteType.FIXED_THREAD
+                    )
+                    utils.logger.info(f"[BaiduTieBaCrawler.search] Search notes res:{search_res}")
+                    # NOTE(review): a page whose 'has_more' is falsy is dropped
+                    # here before its items are processed - confirm intended.
+                    if not (search_res and search_res.get('has_more', False)):
+                        utils.logger.info("No more content!")
+                        break
+
+                    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    detail_jobs = []
+                    for post_item in search_res.get("items", {}):
+                        if post_item.get('model_type') in skipped_model_types:
+                            continue
+                        detail_jobs.append(
+                            self.get_note_detail(note_id=post_item.get("id"), semaphore=limiter)
+                        )
+                    note_details = await asyncio.gather(*detail_jobs)
+
+                    stored_ids: List[str] = []
+                    for detail in note_details:
+                        if not detail:
+                            continue
+                        await tieba_store.update_tieba_note(detail)
+                        stored_ids.append(detail.get("note_id"))
+                    page_no += 1
+                    utils.logger.info(f"[BaiduTieBaCrawler.search] Note details: {note_details}")
+                    await self.batch_get_note_comments(stored_ids)
+                except Exception as ex:
+                    # Any failure ends the paging for the current keyword.
+                    utils.logger.error(f"[BaiduTieBaCrawler.search] Get note detail error, err: {ex}")
+                    break
+
+    async def fetch_creator_notes_detail(self, note_list: List[Dict]):
+        """
+        Fetch the detail of every post in note_list concurrently and store
+        each non-empty result.
+        """
+        limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        note_details = await asyncio.gather(*(
+            self.get_note_detail(note_id=post_item.get("note_id"), semaphore=limiter)
+            for post_item in note_list
+        ))
+        # Failed lookups come back as None and are dropped here.
+        for detail in filter(None, note_details):
+            await tieba_store.update_tieba_note(detail)
+
+    async def get_specified_notes(self):
+        """
+        Fetch and store the posts listed in config.TIEBA_SPECIFIED_ID_LIST,
+        then fetch the comments for that same ID list.
+        """
+        wanted_ids = config.TIEBA_SPECIFIED_ID_LIST
+        limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        note_details = await asyncio.gather(*(
+            self.get_note_detail(note_id=note_id, semaphore=limiter)
+            for note_id in wanted_ids
+        ))
+        for detail in note_details:
+            if detail is None:
+                continue
+            await tieba_store.update_tieba_note(detail)
+        # Comments are requested for every configured ID, including those
+        # whose detail lookup returned None.
+        await self.batch_get_note_comments(config.TIEBA_SPECIFIED_ID_LIST)
+
+    async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
+        """
+        Fetch the detail of one post while holding the concurrency semaphore.
+
+        Returns the detail dict, or None when the client returns an empty
+        result or raises; every failure is logged and never propagated.
+        """
+        async with semaphore:
+            try:
+                note_detail: Dict = await self.tieba_client.get_note_by_id(note_id)
+                if not note_detail:
+                    utils.logger.error(
+                        f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}")
+                    return None
+                return note_detail
+            # KeyError must be listed before Exception: the first matching
+            # clause wins, so in the old order this handler could never run.
+            except KeyError as ex:
+                utils.logger.error(
+                    f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}")
+                return None
+            except Exception as ex:
+                utils.logger.error(f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}")
+                return None
+
+    async def batch_get_note_comments(self, note_list: List[str]):
+        """
+        Fetch the comments of every post ID in note_list concurrently.
+        Does nothing when config.ENABLE_GET_COMMENTS is off.
+        """
+        if not config.ENABLE_GET_COMMENTS:
+            utils.logger.info("[BaiduTieBaCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
+            return
+
+        utils.logger.info(
+            f"[BaiduTieBaCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
+        limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        # One task per post, named after the post ID.
+        comment_tasks: List[Task] = [
+            asyncio.create_task(self.get_comments(note_id, limiter), name=note_id)
+            for note_id in note_list
+        ]
+        await asyncio.gather(*comment_tasks)
+
+    async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
+        """
+        Fetch all comments of one post while holding the semaphore; fetched
+        comments are handed to the store through the client's callback.
+        """
+        async with semaphore:
+            utils.logger.info(f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_id}")
+            # A random interval in [0, 1) is passed as the crawl delay.
+            interval = random.random()
+            await self.tieba_client.get_note_all_comments(
+                note_id=note_id,
+                crawl_interval=interval,
+                callback=tieba_store.batch_update_tieba_note_comments
+            )
+
+    @staticmethod
+    def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
+        """
+        Build the two proxy descriptions from one proxy record.
+
+        Returns a tuple ``(playwright_proxy, httpx_proxy)``: the first is a
+        dict with server/username/password keys, the second maps the proxy
+        protocol to an ``http://user:password@ip:port`` URL.
+        """
+        protocol = ip_proxy_info.protocol
+        address = f"{ip_proxy_info.ip}:{ip_proxy_info.port}"
+        user, password = ip_proxy_info.user, ip_proxy_info.password
+
+        playwright_proxy = {
+            "server": f"{protocol}{address}",
+            "username": user,
+            "password": password,
+        }
+        httpx_proxy = {f"{protocol}": f"http://{user}:{password}@{address}"}
+        return playwright_proxy, httpx_proxy
+
+    async def create_tieba_client(self, httpx_proxy: Optional[Dict]) -> BaiduTieBaClient:
+        """
+        Build the API client from the browser's current cookies.
+
+        Args:
+            httpx_proxy: proxy mapping as produced by format_proxy_info(),
+                or None to connect directly
+
+        Returns:
+            A BaiduTieBaClient sharing this crawler's page and user agent.
+        """
+        utils.logger.info("[BaiduTieBaCrawler.create_tieba_client] Begin create baidutieba API client ...")
+        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
+        tieba_client_obj = BaiduTieBaClient(
+            proxies=httpx_proxy,
+            headers={
+                "User-Agent": self.user_agent,
+                "Cookie": cookie_str,
+                # Origin/Referer must name the site actually being requested;
+                # the previous value "https://www.baidutieba.com" is not the
+                # Tieba host used by this crawler and its client.
+                "Origin": self.index_url,
+                "Referer": self.index_url,
+                "Content-Type": "application/json;charset=UTF-8"
+            },
+            playwright_page=self.context_page,
+            cookie_dict=cookie_dict,
+        )
+        return tieba_client_obj
+
+    async def launch_browser(
+            self,
+            chromium: BrowserType,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True
+    ) -> BrowserContext:
+        """
+        Launch Chromium and return a browser context.
+
+        With config.SAVE_LOGIN_STATE a persistent context backed by a
+        directory under ./browser_data is used, so the login survives
+        between runs; otherwise a fresh browser and context are created.
+        """
+        utils.logger.info("[BaiduTieBaCrawler.launch_browser] Begin create browser context ...")
+        viewport = {"width": 1920, "height": 1080}
+
+        if not config.SAVE_LOGIN_STATE:
+            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
+            return await browser.new_context(
+                viewport=viewport,
+                user_agent=user_agent
+            )
+
+        # feat issue #14
+        # The login state is saved so that a login is not needed on every run.
+        profile_dir = os.path.join(os.getcwd(), "browser_data",
+                                   config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
+        return await chromium.launch_persistent_context(
+            user_data_dir=profile_dir,
+            accept_downloads=True,
+            headless=headless,
+            proxy=playwright_proxy,  # type: ignore
+            viewport=viewport,
+            user_agent=user_agent
+        )
+
+    async def close(self):
+        """Close the browser context and log that it was closed."""
+        context = self.browser_context
+        await context.close()
+        utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
--- a/media_platform/tieba/field.py
+++ b/media_platform/tieba/field.py
@@ -0,0 +1,18 @@
+from enum import Enum
+
+
+class SearchSortType(Enum):
+    """Sort order options for a search."""
+    # Newest first (time descending)
+    TIME_DESC = "1"
+    # Oldest first (time ascending)
+    TIME_ASC = "0"
+    # Ordered by relevance
+    RELEVANCE_ORDER = "2"
+
+
+class SearchNoteType(Enum):
+    """Which kinds of posts a search should return."""
+    # Main threads only
+    MAIN_THREAD = "1"
+    # Mixed mode (threads + replies)
+    # NOTE(review): the name may be a typo for MIXED_THREAD - confirm before renaming.
+    FIXED_THREAD = "0"
--- a/media_platform/tieba/login.py
+++ b/media_platform/tieba/login.py
@@ -0,0 +1,112 @@
+import asyncio
+import functools
+import sys
+from typing import Optional
+
+from playwright.async_api import BrowserContext, Page
+from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
+                      wait_fixed)
+
+import config
+from base.base_crawler import AbstractLogin
+from tools import utils
+
+
+class BaiduTieBaLogin(AbstractLogin):
+
+    def __init__(self,
+                 login_type: str,
+                 browser_context: BrowserContext,
+                 context_page: Page,
+                 login_phone: Optional[str] = "",
+                 cookie_str: str = ""
+                 ):
+        """
+        Store what the login flows need.
+
+        Note that login_type is written to the global config.LOGIN_TYPE
+        rather than kept on the instance; begin() reads it back from there.
+        """
+        config.LOGIN_TYPE = login_type
+        self.cookie_str = cookie_str
+        self.login_phone = login_phone
+        self.context_page = context_page
+        self.browser_context = browser_context
+
+    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
+    async def check_login_state(self) -> bool:
+        """
+        Poll the browser cookies to see whether the login has succeeded.
+
+        Returns:
+            True as soon as a STOKEN or PTOKEN cookie has a non-empty value,
+            otherwise False, which makes the retry decorator poll again.
+        """
+        _, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
+        # bool() keeps the result a real True/False; the retry predicate
+        # compares with "is False".
+        return bool(cookie_dict.get("STOKEN") or cookie_dict.get("PTOKEN"))
+
+    async def begin(self):
+        """
+        Run the login flow selected by config.LOGIN_TYPE.
+
+        Raises:
+            ValueError: when the login type is not qrcode, phone or cookie.
+        """
+        utils.logger.info("[BaiduTieBaLogin.begin] Begin login baidutieba ...")
+        login_flows = {
+            "qrcode": self.login_by_qrcode,
+            "phone": self.login_by_mobile,
+            "cookie": self.login_by_cookies,
+        }
+        flow = login_flows.get(config.LOGIN_TYPE)
+        if flow is None:
+            raise ValueError("[BaiduTieBaLogin.begin]Invalid Login Type Currently only supported qrcode or phone or cookies ...")
+        await flow()
+
+    async def login_by_mobile(self):
+        """Login baidutieba by mobile (not implemented yet; does nothing)."""
+        return None
+
+    async def login_by_qrcode(self):
+        """
+        Log in by showing the page's QR code and waiting for it to be scanned.
+
+        The QR code is looked up on the page; when it is missing the login
+        button is clicked once and the lookup repeated. The process exits via
+        sys.exit() when no QR code can be found or when the login is not
+        detected before check_login_state() runs out of retries.
+        """
+        utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Begin login baidutieba by qrcode ...")
+        qrcode_img_selector = "xpath=//img[@class='tang-pass-qrcode-img']"
+        # find login qrcode
+        base64_qrcode_img = await utils.find_login_qrcode(
+            self.context_page,
+            selector=qrcode_img_selector
+        )
+        if not base64_qrcode_img:
+            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
+            # if this website does not automatically popup login dialog box, we will manual click login button
+            await asyncio.sleep(0.5)
+            login_button_ele = self.context_page.locator("xpath=//li[@class='u_login']")
+            await login_button_ele.click()
+            base64_qrcode_img = await utils.find_login_qrcode(
+                self.context_page,
+                selector=qrcode_img_selector
+            )
+            if not base64_qrcode_img:
+                utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
+                sys.exit()
+
+        # show login qrcode
+        # fix issue #12
+        # we need to use partial function to call show_qrcode function and run in executor
+        # then current asyncio event loop will not be blocked
+        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
+        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
+
+        # check_login_state() polls up to 600 times at 1 second intervals, so
+        # the user has about 600s; the message previously claimed 120s.
+        utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] waiting for scan code login, remaining time is 600s")
+        try:
+            await self.check_login_state()
+        except RetryError:
+            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Login baidutieba failed by qrcode login method ...")
+            sys.exit()
+
+        # Give the site a moment to finish its post-login redirect.
+        wait_redirect_seconds = 5
+        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        await asyncio.sleep(wait_redirect_seconds)
+
+    async def login_by_cookies(self):
+        """
+        Log in by installing the configured cookie string into the browser
+        context, one cookie at a time, scoped to ".baidu.com" and path "/".
+        """
+        utils.logger.info("[BaiduTieBaLogin.login_by_cookies] Begin login baidutieba by cookie ...")
+        parsed_cookies = utils.convert_str_cookie_to_dict(self.cookie_str)
+        for name, val in parsed_cookies.items():
+            cookie = {
+                'name': name,
+                'value': val,
+                'domain': ".baidu.com",
+                'path': "/"
+            }
+            await self.browser_context.add_cookies([cookie])