Merge branch 'NanmiCoder:main' into main

This commit is contained in:
yangtao210
2025-11-07 17:44:09 +08:00
committed by GitHub
18 changed files with 168 additions and 79 deletions

BIN
.DS_Store vendored Normal file
View File

Binary file not shown.

View File

@@ -17,7 +17,7 @@ SORT_TYPE = "popularity_descending"
# 指定笔记URL列表, 必须要携带xsec_token参数
XHS_SPECIFIED_NOTE_URL_LIST = [
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
"https://www.xiaohongshu.com/explore/68f99f6d0000000007033fcf?xsec_token=ABZEzjuN2fPjKF9EcMsCCxfbt3IBRsFZldGFoCJbdDmXI=&xsec_source=pc_feed"
# ........................
]

View File

@@ -59,7 +59,6 @@ export default defineConfig({
text: 'MediaCrawler源码剖析课',
link: 'https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh'
},
{text: '知识星球文章专栏', link: '/知识星球介绍'},
{text: '开发者咨询服务', link: '/开发者咨询'},
]
},

View File

@@ -1,12 +1,12 @@
# 关于作者
> 大家都叫我阿江,网名:程序员阿江-Relakkes目前裸辞正探索自由职业,希望能靠自己的技术能力和努力,实现自己理想的生活方式
>
> 我身边有大量的技术人脉资源,如果大家有一些爬虫咨询或者编程单子可以向我丢过来
> 大家都叫我阿江,网名:程序员阿江-Relakkes目前是一名独立开发者,专注于 AI Agent 和爬虫相关的开发工作All in AI
- [Github万星开源自媒体爬虫仓库MediaCrawler作者](https://github.com/NanmiCoder/MediaCrawler)
- 全栈程序员熟悉Python、Golang、JavaScript工作中主要用Golang。
- 曾经主导并参与过百万级爬虫采集系统架构设计与编码
- 爬虫是一种技术兴趣爱好,参与爬虫有一种对抗的感觉,越难越兴奋。
- 目前专注于 AI Agent 领域,积极探索 AI 技术的应用与创新
- 如果你有 AI Agent 相关的项目需要合作,欢迎联系我,我有很多时间可以投入
## 微信联系方式
![relakkes_weichat.JPG](static/images/relakkes_weichat.jpg)

View File

@@ -15,5 +15,3 @@
## MediaCrawler源码剖析视频课程
[mediacrawler源码课程介绍](https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh)
## 知识星球爬虫逆向、编程专栏
[知识星球专栏介绍](知识星球介绍.md)

View File

@@ -1,31 +0,0 @@
# 知识星球专栏
## 基本介绍
文章:
- 1.爬虫JS逆向案例分享
- 2.MediaCrawler技术实现分享。
- 3.沉淀python开发经验和技巧
- ......................
提问:
- 4.在星球内向我提问关于MediaCrawler、爬虫、编程任何问题
## 章节内容
- [逆向案例 - 某16x8平台商品列表接口逆向参数分析](https://articles.zsxq.com/id_x1qmtg8pzld9.html)
- [逆向案例 - Product Hunt月度最佳产品榜单接口加密参数分析](https://articles.zsxq.com/id_au4eich3x2sg.html)
- [逆向案例 - 某zhi乎x-zse-96参数分析过程](https://articles.zsxq.com/id_dui2vil0ag1l.html)
- [逆向案例 - 某x识星球X-Signature加密参数分析过程](https://articles.zsxq.com/id_pp4madwcwcg8.html)
- [【独创】使用Playwright获取某音a_bogus参数流程包含加密参数分析](https://articles.zsxq.com/id_u89al50jk9x0.html)
- [【独创】使用Playwright低成本获取某书X-s参数流程分析当年的回忆录](https://articles.zsxq.com/id_u4lcrvqakuc7.html)
- [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
- [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
- [一次Mysql数据库中混用collation排序规则带来的bug](https://articles.zsxq.com/id_pibwr1wnst2p.html)
- [错误使用 Python 可变类型带来的隐藏 Bug](https://articles.zsxq.com/id_f7vn89l1d303.html)
- [【MediaCrawler】微博帖子评论爬虫教程](https://articles.zsxq.com/id_vrmuhw0ovj3t.html)
- [Python协程在并发场景下的幂等性问题](https://articles.zsxq.com/id_wocdwsfmfcmp.html)
- ........................................
## 加入星球
![星球qrcode.JPG](static/images/星球qrcode.jpg)

14
main.py
View File

@@ -24,6 +24,8 @@ from media_platform.tieba import TieBaCrawler
from media_platform.weibo import WeiboCrawler
from media_platform.xhs import XiaoHongShuCrawler
from media_platform.zhihu import ZhihuCrawler
from tools.async_file_writer import AsyncFileWriter
from var import crawler_type_var
class CrawlerFactory:
@@ -72,6 +74,18 @@ async def main():
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
await crawler.start()
# Generate wordcloud after crawling is complete
# Only for JSON save mode
if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
try:
file_writer = AsyncFileWriter(
platform=config.PLATFORM,
crawler_type=crawler_type_var.get()
)
await file_writer.generate_wordcloud_from_comments()
except Exception as e:
print(f"Error generating wordcloud: {e}")
def cleanup():
if crawler:

View File

@@ -91,8 +91,10 @@ class DouYinClient(AbstractApiClient):
post_data = {}
if request_method == "POST":
post_data = params
a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
params["a_bogus"] = a_bogus
if "/v1/web/general/search" not in uri:
a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
params["a_bogus"] = a_bogus
async def request(self, method, url, **kwargs):
async with httpx.AsyncClient(proxy=self.proxy) as client:

View File

@@ -288,27 +288,14 @@ class WeiboClient:
"""
uri = "/api/container/getIndex"
container_info = await self.get_creator_container_info(creator_id)
if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "":
utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
raise DataFetchError("get containerid failed")
containerid = f"100505{creator_id}"
params = {
"jumpfrom": "weibocom",
"type": "uid",
"value": creator_id,
"containerid": container_info["fid_container_id"],
"containerid":containerid,
}
user_res = await self.get(uri, params)
if user_res.get("tabsInfo"):
tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
for tab in tabs:
if tab.get("tabKey") == "weibo":
container_info["lfid_container_id"] = tab.get("containerid")
break
user_res.update(container_info)
return user_res
async def get_notes_by_creator(

View File

@@ -293,7 +293,7 @@ class WeiboCrawler(AbstractCrawler):
# Get all note information of the creator
all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
creator_id=user_id,
container_id=createor_info_res.get("lfid_container_id"),
container_id=f"107603{user_id}",
crawl_interval=0,
callback=weibo_store.batch_update_weibo_notes,
)

View File

@@ -10,23 +10,24 @@
import asyncio
import json
import re
import time
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext, Page
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
from tenacity import retry, stop_after_attempt, wait_fixed
import config
from base.base_crawler import AbstractApiClient
from tools import utils
from html import unescape
from .exception import DataFetchError, IPBlockError
from .field import SearchNoteType, SearchSortType
from .help import get_search_id, sign
from .extractor import XiaoHongShuExtractor
from .secsign import seccore_signv2_playwright
class XiaoHongShuClient(AbstractApiClient):
@@ -63,15 +64,13 @@ class XiaoHongShuClient(AbstractApiClient):
Returns:
"""
encrypt_params = await self.playwright_page.evaluate(
"([url, data]) => window._webmsxyw(url,data)", [url, data]
)
x_s = await seccore_signv2_playwright(self.playwright_page, url, data)
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
signs = sign(
a1=self.cookie_dict.get("a1", ""),
b1=local_storage.get("b1", ""),
x_s=encrypt_params.get("X-s", ""),
x_t=str(encrypt_params.get("X-t", "")),
x_s=x_s,
x_t=str(int(time.time())),
)
headers = {

View File

@@ -282,16 +282,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
try:
note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
except RetryError:
pass
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
if not note_detail:
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
if not note_detail:
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})

View File

@@ -27,16 +27,17 @@ def sign(a1="", b1="", x_s="", x_t=""):
"s0": 3, # getPlatformCode
"s1": "",
"x0": "1", # localStorage.getItem("b1b1")
"x1": "3.7.8-2", # version
"x1": "4.2.2", # version
"x2": "Mac OS",
"x3": "xhs-pc-web",
"x4": "4.27.2",
"x4": "4.74.0",
"x5": a1, # cookie of a1
"x6": x_t,
"x7": x_s,
"x8": b1, # localStorage.getItem("b1")
"x9": mrc(x_t + x_s + b1),
"x10": 154, # getSigCount
"x11": "normal"
}
encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
x_s_common = b64Encode(encode_str)

View File

@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import hashlib
import base64
import json
from typing import Any
def _build_c(e: Any, a: Any) -> str:
c = str(e)
if isinstance(a, (dict, list)):
c += json.dumps(a, separators=(",", ":"), ensure_ascii=False)
elif isinstance(a, str):
c += a
# 其它类型不拼
return c
# ---------------------------
# p.Pu = MD5(c) => hex 小写
# ---------------------------
def _md5_hex(s: str) -> str:
return hashlib.md5(s.encode("utf-8")).hexdigest()
# ============================================================
# Playwright 版本(异步):传入 page(Playwright Page 对象)
# 内部用 page.evaluate('window.mnsv2(...)')
# ============================================================
async def seccore_signv2_playwright(
    page,  # Playwright Page; its context must expose window.mnsv2
    e: Any,
    a: Any,
) -> str:
    """Generate an XHS "XYS_" signature token via the in-page ``window.mnsv2``.

    Builds the canonical string from *e* and *a*, MD5-hashes it, asks the
    browser context to sign the pair, then wraps the signature in the
    base64 JSON envelope the server expects.

    Usage:
        token = await seccore_signv2_playwright(page, url, payload)

    Args:
        page: Playwright Page whose context already has ``window.mnsv2``
            injected (i.e. the target site's scripts are loaded).
        e: request URI (anything convertible with ``str``).
        a: request payload; dict/list is serialized as compact JSON,
            str is used verbatim, other types are ignored.

    Returns:
        The signature token: ``"XYS_" + base64(envelope JSON)``.
    """
    c = _build_c(e, a)
    d = _md5_hex(c)
    # page.evaluate passes a single argument, so the JS side must
    # destructure the [c, d] pair (same pattern as the _webmsxyw call
    # elsewhere in this project). A two-parameter arrow would receive
    # the whole list as its first parameter and undefined as the second.
    s = await page.evaluate("([c, d]) => window.mnsv2(c, d)", [c, d])
    envelope = {
        "x0": "4.2.6",       # web client version string
        "x1": "xhs-pc-web",  # platform code
        "x2": "Mac OS",
        "x3": s,             # mnsv2 signature from the browser context
        "x4": a,             # original payload echoed back
    }
    payload = json.dumps(envelope, separators=(",", ":"), ensure_ascii=False).encode("utf-8")
    return "XYS_" + base64.b64encode(payload).decode("ascii")

View File

@@ -38,7 +38,7 @@ class BiliCsvStoreImplement(AbstractStore):
def __init__(self):
self.file_writer = AsyncFileWriter(
crawler_type=crawler_type_var.get(),
platform="bilibili"
platform="bili"
)
async def store_content(self, content_item: Dict):
@@ -221,7 +221,7 @@ class BiliJsonStoreImplement(AbstractStore):
def __init__(self):
self.file_writer = AsyncFileWriter(
crawler_type=crawler_type_var.get(),
platform="bilibili"
platform="bili"
)
async def store_content(self, content_item: Dict):

View File

@@ -22,7 +22,7 @@ from tools import utils
class BilibiliVideo(AbstractStoreVideo):
video_store_path: str = "data/bilibili/videos"
video_store_path: str = "data/bili/videos"
async def store_video(self, video_content_item: Dict):
"""

View File

@@ -5,13 +5,16 @@ import os
import pathlib
from typing import Dict, List
import aiofiles
import config
from tools.utils import utils
from tools.words import AsyncWordCloudGenerator
class AsyncFileWriter:
def __init__(self, platform: str, crawler_type: str):
    """Bind this writer to one platform/crawler-type pair.

    Args:
        platform: platform slug used in output paths (e.g. data/{platform}/...).
        crawler_type: crawl mode label; used in generated file names.
    """
    # Async lock for coordinating writes — acquired by methods outside this view.
    self.lock = asyncio.Lock()
    self.platform = platform
    self.crawler_type = crawler_type
    # Wordcloud generation is optional; only instantiated when enabled in config.
    self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None
def _get_file_path(self, file_type: str, item_type: str) -> str:
base_path = f"data/{self.platform}/{file_type}"
@@ -47,4 +50,58 @@ class AsyncFileWriter:
existing_data.append(item)
async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
async def generate_wordcloud_from_comments(self):
    """Generate a wordcloud from the stored comments JSON file.

    No-op unless both ENABLE_GET_WORDCLOUD and ENABLE_GET_COMMENTS are
    enabled in config and a generator was constructed. All failures are
    logged rather than raised.
    """
    if not (config.ENABLE_GET_WORDCLOUD and config.ENABLE_GET_COMMENTS):
        return
    if self.wordcloud_generator is None:
        return
    try:
        # Locate the comments JSON produced by earlier store calls.
        comments_file_path = self._get_file_path('json', 'comments')
        missing = (not os.path.exists(comments_file_path)
                   or os.path.getsize(comments_file_path) == 0)
        if missing:
            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
            return
        async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
            raw = await f.read()
        if not raw:
            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
            return
        parsed = json.loads(raw)
        comments = parsed if isinstance(parsed, list) else [parsed]
        # Keep only the comment text; the field name differs per platform.
        filtered_data = []
        for entry in comments:
            if not isinstance(entry, dict):
                continue
            text = entry.get('content') or entry.get('comment_text') or entry.get('text') or ''
            if text:
                filtered_data.append({'content': text})
        if not filtered_data:
            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
            return
        # Ensure the output directory exists, then render the wordcloud.
        words_base_path = f"data/{self.platform}/words"
        pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
        words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"
        utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(filtered_data)} comments")
        await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
        utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")
    except Exception as e:
        utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")

View File

@@ -26,6 +26,10 @@ def init_loging_config():
)
_logger = logging.getLogger("MediaCrawler")
_logger.setLevel(level)
# 关闭 httpx 的 INFO 日志
logging.getLogger("httpx").setLevel(logging.WARNING)
return _logger