Merge branch 'NanmiCoder:main' into main

2026-06-09 03:17:25 +08:00 · 2025-11-07 17:44:09 +08:00
parent 7888f4c6bd 60cbb3e37d
commit 58eb89f073
18 changed files with 168 additions and 79 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/config/xhs_config.py
+++ b/config/xhs_config.py
@@ -17,7 +17,7 @@ SORT_TYPE = "popularity_descending"
 # 指定笔记URL列表, 必须要携带xsec_token参数
 XHS_SPECIFIED_NOTE_URL_LIST = [
-    "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
+    "https://www.xiaohongshu.com/explore/68f99f6d0000000007033fcf?xsec_token=ABZEzjuN2fPjKF9EcMsCCxfbt3IBRsFZldGFoCJbdDmXI=&xsec_source=pc_feed"
    # ........................
 ]
--- a/docs/.vitepress/config.mjs
+++ b/docs/.vitepress/config.mjs
@@ -59,7 +59,6 @@ export default defineConfig({
                        text: 'MediaCrawler源码剖析课',
                        link: 'https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh'
                    },
                    {text: '知识星球文章专栏', link: '/知识星球介绍'},
                    {text: '开发者咨询服务', link: '/开发者咨询'},
                ]
            },
--- a/docs/作者介绍.md
+++ b/docs/作者介绍.md
@@ -1,12 +1,12 @@
 # 关于作者
-> 大家都叫我阿江，网名：程序员阿江-Relakkes，目前裸辞正探索自由职业，希望能靠自己的技术能力和努力，实现自己理想的生活方式。
+> 大家都叫我阿江，网名：程序员阿江-Relakkes，目前是一名独立开发者，专注于 AI Agent 和爬虫相关的开发工作，All in AI。
 > 
 > 我身边有大量的技术人脉资源，如果大家有一些爬虫咨询或者编程单子可以向我丢过来
 - [Github万星开源自媒体爬虫仓库MediaCrawler作者](https://github.com/NanmiCoder/MediaCrawler)
 - 全栈程序员，熟悉Python、Golang、JavaScript，工作中主要用Golang。
 - 曾经主导并参与过百万级爬虫采集系统架构设计与编码
 - 爬虫是一种技术兴趣爱好，参与爬虫有一种对抗的感觉，越难越兴奋。
 - 目前专注于 AI Agent 领域，积极探索 AI 技术的应用与创新
 - 如果你有 AI Agent 相关的项目需要合作，欢迎联系我，我有很多时间可以投入
 ## 微信联系方式
 ![relakkes_weichat.JPG](static/images/relakkes_weichat.jpg)
--- a/docs/知识付费介绍.md
+++ b/docs/知识付费介绍.md
@@ -15,5 +15,3 @@
 ## MediaCrawler源码剖析视频课程
 [mediacrawler源码课程介绍](https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh)
 ## 知识星球爬虫逆向、编程专栏
 [知识星球专栏介绍](知识星球介绍.md)
--- a/docs/知识星球介绍.md
+++ b/docs/知识星球介绍.md
@@ -1,31 +0,0 @@
 # 知识星球专栏
 ## 基本介绍
 文章：
 - 1.爬虫JS逆向案例分享
 - 2.MediaCrawler技术实现分享。
 - 3.沉淀python开发经验和技巧
 - ......................
 提问：
 - 4.在星球内向我提问关于MediaCrawler、爬虫、编程任何问题
 ## 章节内容
  - [逆向案例 - 某16x8平台商品列表接口逆向参数分析](https://articles.zsxq.com/id_x1qmtg8pzld9.html)
  - [逆向案例 - Product Hunt月度最佳产品榜单接口加密参数分析](https://articles.zsxq.com/id_au4eich3x2sg.html)
  - [逆向案例 - 某zhi乎x-zse-96参数分析过程](https://articles.zsxq.com/id_dui2vil0ag1l.html)
  - [逆向案例 - 某x识星球X-Signature加密参数分析过程](https://articles.zsxq.com/id_pp4madwcwcg8.html)
  - [【独创】使用Playwright获取某音a_bogus参数流程（包含加密参数分析）](https://articles.zsxq.com/id_u89al50jk9x0.html)
  - [【独创】使用Playwright低成本获取某书X-s参数流程分析（当年的回忆录）](https://articles.zsxq.com/id_u4lcrvqakuc7.html)
  - [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
  - [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html) 
  - [一次Mysql数据库中混用collation排序规则带来的bug](https://articles.zsxq.com/id_pibwr1wnst2p.html)
  - [错误使用 Python 可变类型带来的隐藏 Bug](https://articles.zsxq.com/id_f7vn89l1d303.html)
  - [【MediaCrawler】微博帖子评论爬虫教程](https://articles.zsxq.com/id_vrmuhw0ovj3t.html)
  - [Python协程在并发场景下的幂等性问题](https://articles.zsxq.com/id_wocdwsfmfcmp.html)
  - ........................................
 ## 加入星球
 ![星球qrcode.JPG](static/images/星球qrcode.jpg)
--- a/main.py
+++ b/main.py
@@ -24,6 +24,8 @@ from media_platform.tieba import TieBaCrawler
 from media_platform.weibo import WeiboCrawler
 from media_platform.xhs import XiaoHongShuCrawler
 from media_platform.zhihu import ZhihuCrawler
 from tools.async_file_writer import AsyncFileWriter
 from var import crawler_type_var
 class CrawlerFactory:
@@ -72,6 +74,18 @@ async def main():
    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
    await crawler.start()
    # Generate wordcloud after crawling is complete
    # Only for JSON save mode
    if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
        try:
            file_writer = AsyncFileWriter(
                platform=config.PLATFORM,
                crawler_type=crawler_type_var.get()
            )
            await file_writer.generate_wordcloud_from_comments()
        except Exception as e:
            print(f"Error generating wordcloud: {e}")
 def cleanup():
    if crawler:
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -91,8 +91,10 @@ class DouYinClient(AbstractApiClient):
        post_data = {}
        if request_method == "POST":
            post_data = params
-        a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
+
-        params["a_bogus"] = a_bogus
+        if "/v1/web/general/search" not in uri:
            a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
            params["a_bogus"] = a_bogus
    async def request(self, method, url, **kwargs):
        async with httpx.AsyncClient(proxy=self.proxy) as client:
--- a/media_platform/weibo/client.py
+++ b/media_platform/weibo/client.py
@@ -288,27 +288,14 @@ class WeiboClient:
        """
        uri = "/api/container/getIndex"
-        container_info = await self.get_creator_container_info(creator_id)
+        containerid = f"100505{creator_id}"
        if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "":
            utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
            raise DataFetchError("get containerid failed")
        params = {
            "jumpfrom": "weibocom",
            "type": "uid",
            "value": creator_id,
-            "containerid": container_info["fid_container_id"],
+            "containerid":containerid,
        }
        user_res = await self.get(uri, params)
        if user_res.get("tabsInfo"):
            tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
            for tab in tabs:
                if tab.get("tabKey") == "weibo":
                    container_info["lfid_container_id"] = tab.get("containerid")
                    break
        user_res.update(container_info)
        return user_res
    async def get_notes_by_creator(
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -293,7 +293,7 @@ class WeiboCrawler(AbstractCrawler):
                # Get all note information of the creator
                all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
                    creator_id=user_id,
-                    container_id=createor_info_res.get("lfid_container_id"),
+                    container_id=f"107603{user_id}",
                    crawl_interval=0,
                    callback=weibo_store.batch_update_weibo_notes,
                )
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -10,23 +10,24 @@
 import asyncio
 import json
-import re
+import time
 from typing import Any, Callable, Dict, List, Optional, Union
 from urllib.parse import urlencode
 import httpx
 from playwright.async_api import BrowserContext, Page
-from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
+from tenacity import retry, stop_after_attempt, wait_fixed
 import config
 from base.base_crawler import AbstractApiClient
 from tools import utils
-from html import unescape
+
 from .exception import DataFetchError, IPBlockError
 from .field import SearchNoteType, SearchSortType
 from .help import get_search_id, sign
 from .extractor import XiaoHongShuExtractor
 from .secsign import seccore_signv2_playwright
 class XiaoHongShuClient(AbstractApiClient):
@@ -63,15 +64,13 @@ class XiaoHongShuClient(AbstractApiClient):
        Returns:
        """
-        encrypt_params = await self.playwright_page.evaluate(
+        x_s = await seccore_signv2_playwright(self.playwright_page, url, data)
            "([url, data]) => window._webmsxyw(url,data)", [url, data]
        )
        local_storage = await self.playwright_page.evaluate("() => window.localStorage")
        signs = sign(
            a1=self.cookie_dict.get("a1", ""),
            b1=local_storage.get("b1", ""),
-            x_s=encrypt_params.get("X-s", ""),
+            x_s=x_s,
-            x_t=str(encrypt_params.get("X-t", "")),
+            x_t=str(int(time.time())),
        )
        headers = {
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -282,16 +282,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
        async with semaphore:
            try:
                utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
-
+                note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
                try:
                    note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
                except RetryError:
                    pass
                if not note_detail:
-                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
+                    raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
                    if not note_detail:
                        raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
                note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
--- a/media_platform/xhs/help.py
+++ b/media_platform/xhs/help.py
@@ -27,16 +27,17 @@ def sign(a1="", b1="", x_s="", x_t=""):
        "s0": 3,  # getPlatformCode
        "s1": "",
        "x0": "1",  # localStorage.getItem("b1b1")
-        "x1": "3.7.8-2",  # version
+        "x1": "4.2.2",  # version
        "x2": "Mac OS",
        "x3": "xhs-pc-web",
-        "x4": "4.27.2",
+        "x4": "4.74.0",
        "x5": a1,  # cookie of a1
        "x6": x_t,
        "x7": x_s,
        "x8": b1,  # localStorage.getItem("b1")
        "x9": mrc(x_t + x_s + b1),
        "x10": 154,  # getSigCount
        "x11": "normal"
    }
    encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
    x_s_common = b64Encode(encode_str)
--- a/media_platform/xhs/secsign.py
+++ b/media_platform/xhs/secsign.py
@@ -0,0 +1,66 @@
 # -*- coding: utf-8 -*-
 # 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
 #
 # 详细许可条款请参阅项目根目录下的LICENSE文件。
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 import hashlib
 import base64
 import json
 from typing import Any
 def _build_c(e: Any, a: Any) -> str:
    """Return ``str(e)`` followed by a textual form of ``a``.

    A ``str`` payload is appended verbatim. A ``dict`` or ``list`` payload is
    appended as compact JSON (no whitespace after separators, non-ASCII
    characters kept unescaped). Any other payload type contributes nothing.
    """
    prefix = str(e)
    if isinstance(a, str):
        return prefix + a
    if isinstance(a, (dict, list)):
        return prefix + json.dumps(a, separators=(",", ":"), ensure_ascii=False)
    # Any other payload type is not appended.
    return prefix
 # ---------------------------
 # p.Pu = MD5(c) => hex 小写
 # ---------------------------
 def _md5_hex(s: str) -> str:
    """Return the lowercase hexadecimal MD5 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()
 # ============================================================
 # Playwright 版本（异步）：传入 page（Page 对象）
 #    内部用 page.evaluate('window.mnsv2(...)')
 # ============================================================
 async def seccore_signv2_playwright(
    page,  # Playwright Page
    e: Any,
    a: Any,
 ) -> str:
    """
    Build an ``XYS_``-prefixed signature token by calling ``window.mnsv2``
    inside the given Playwright page.

    The string to sign is ``_build_c(e, a)``; it is passed to ``window.mnsv2``
    together with its MD5 hex digest. ``window.mnsv2`` must already exist in
    the page context (for example because the target site's scripts have been
    loaded).

    Args:
        page: Playwright page object whose ``evaluate`` method is awaited.
        e: Value stringified as the leading part of the string to sign.
        a: Payload; appended to the string to sign (see ``_build_c``) and
            embedded unchanged as field ``x4`` of the token.

    Returns:
        ``"XYS_"`` followed by the Base64 encoding of a compact JSON object
        with the fields ``x0`` .. ``x4``.
    """
    c = _build_c(e, a)
    d = _md5_hex(c)

    # Playwright hands exactly ONE argument to the evaluated function, so the
    # pair is sent as a list and unpacked on the JavaScript side with array
    # destructuring. With a plain ``(c, d) =>`` signature the first parameter
    # would receive the whole list and the second would be ``undefined``.
    s = await page.evaluate("([c, d]) => window.mnsv2(c, d)", [c, d])
    f = {
        "x0": "4.2.6",
        "x1": "xhs-pc-web",
        "x2": "Mac OS",
        "x3": s,
        "x4": a,
    }
    payload = json.dumps(f, separators=(",", ":"), ensure_ascii=False).encode("utf-8")
    # The token is returned only; it is request-signing material and is
    # deliberately not written to stdout.
    return "XYS_" + base64.b64encode(payload).decode("ascii")
--- a/store/bilibili/_store_impl.py
+++ b/store/bilibili/_store_impl.py
@@ -38,7 +38,7 @@ class BiliCsvStoreImplement(AbstractStore):
    def __init__(self):
        self.file_writer = AsyncFileWriter(
            crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
        )
    async def store_content(self, content_item: Dict):
@@ -221,7 +221,7 @@ class BiliJsonStoreImplement(AbstractStore):
    def __init__(self):
        self.file_writer = AsyncFileWriter(
            crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
        )
    async def store_content(self, content_item: Dict):
--- a/store/bilibili/bilibilli_store_media.py
+++ b/store/bilibili/bilibilli_store_media.py
@@ -22,7 +22,7 @@ from tools import utils
 class BilibiliVideo(AbstractStoreVideo):
-    video_store_path: str = "data/bilibili/videos"
+    video_store_path: str = "data/bili/videos"
    async def store_video(self, video_content_item: Dict):
        """
--- a/tools/async_file_writer.py
+++ b/tools/async_file_writer.py
@@ -5,13 +5,16 @@ import os
 import pathlib
 from typing import Dict, List
 import aiofiles
 import config
 from tools.utils import utils
 from tools.words import AsyncWordCloudGenerator
 class AsyncFileWriter:
    def __init__(self, platform: str, crawler_type: str):
        """
        Remember the platform and crawler type and set up the writer state.

        A word-cloud generator is created only when
        ``config.ENABLE_GET_WORDCLOUD`` is truthy; otherwise the attribute is
        set to ``None``.
        """
        self.platform = platform
        self.crawler_type = crawler_type
        self.lock = asyncio.Lock()
        if config.ENABLE_GET_WORDCLOUD:
            self.wordcloud_generator = AsyncWordCloudGenerator()
        else:
            self.wordcloud_generator = None
    def _get_file_path(self, file_type: str, item_type: str) -> str:
        base_path = f"data/{self.platform}/{file_type}"
@@ -48,3 +51,57 @@ class AsyncFileWriter:
            async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
    async def generate_wordcloud_from_comments(self):
        """
        Build word-frequency / word-cloud output from the stored JSON comments.

        Returns immediately unless both ``config.ENABLE_GET_WORDCLOUD`` and
        ``config.ENABLE_GET_COMMENTS`` are enabled and a word-cloud generator
        exists on this instance. Any exception raised while reading the
        comments file or generating the output is logged and not propagated.
        """
        if not (config.ENABLE_GET_WORDCLOUD and config.ENABLE_GET_COMMENTS):
            return
        if not self.wordcloud_generator:
            return

        log_tag = "[AsyncFileWriter.generate_wordcloud_from_comments]"
        try:
            # Locate the JSON comments file; a missing or zero-byte file means
            # there is nothing to process.
            source_path = self._get_file_path('json', 'comments')
            file_missing = not os.path.exists(source_path)
            if file_missing or os.path.getsize(source_path) == 0:
                utils.logger.info(f"{log_tag} No comments file found at {source_path}")
                return

            async with aiofiles.open(source_path, 'r', encoding='utf-8') as reader:
                raw_text = await reader.read()
            if not raw_text:
                utils.logger.info(f"{log_tag} Comments file is empty")
                return

            # A single JSON object is treated as a one-element list.
            parsed = json.loads(raw_text)
            records = parsed if isinstance(parsed, list) else [parsed]

            # The text may be stored under different keys; take the first
            # non-empty one and keep only the 'content' field.
            texts = (
                record.get('content') or record.get('comment_text') or record.get('text') or ''
                for record in records
                if isinstance(record, dict)
            )
            filtered_data = [{'content': text} for text in texts if text]
            if not filtered_data:
                utils.logger.info(f"{log_tag} No valid comment content found")
                return

            # Output goes to data/<platform>/words/<crawler_type>_comments_<date>*
            words_base_path = f"data/{self.platform}/words"
            pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
            words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"

            utils.logger.info(f"{log_tag} Generating wordcloud from {len(filtered_data)} comments")
            await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
            utils.logger.info(f"{log_tag} Wordcloud generated successfully at {words_file_prefix}")
        except Exception as e:
            utils.logger.error(f"{log_tag} Error generating wordcloud: {e}")
--- a/tools/utils.py
+++ b/tools/utils.py
@@ -26,6 +26,10 @@ def init_loging_config():
    )
    _logger = logging.getLogger("MediaCrawler")
    _logger.setLevel(level)
    # 关闭 httpx 的 INFO 日志
    logging.getLogger("httpx").setLevel(logging.WARNING)
    return _logger