diff --git a/README.md b/README.md
index b33cd5b..e7b075f 100644
--- a/README.md
+++ b/README.md
@@ -51,8 +51,6 @@
| Zhihu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-
-🔗 🚀 MediaCrawlerPro officially released! More features, better architecture!
### 🚀 MediaCrawlerPro Officially Released!
@@ -77,7 +75,7 @@
- [ ] **An AI Agent for self-media platforms is under development 🚀🚀**
See the [MediaCrawlerPro project homepage](https://github.com/MediaCrawlerPro) for more details
-
+
## 🚀 Quick Start
@@ -216,40 +214,11 @@ uv run main.py --platform xhs --lt qrcode --type search --save_data_option sqlit
uv run main.py --platform xhs --lt qrcode --type search --save_data_option db
```
----
[🚀 MediaCrawlerPro officially released 🚀! More features, better architecture!](https://github.com/MediaCrawlerPro)
-## 🤝 Community & Support
-
-### 💬 Chat Groups
-- **WeChat group**: [Click to join](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
-
-### 📚 Docs & Tutorials
-- **Online docs**: [Complete MediaCrawler documentation](https://nanmicoder.github.io/MediaCrawler/)
-- **Crawler tutorial**: [CrawlerTutorial free course](https://github.com/NanmiCoder/CrawlerTutorial)
-
-
-# Other FAQs: see the online docs
->
-> The online docs cover usage, FAQs, how to join the project chat group, and more.
-> [MediaCrawler online docs](https://nanmicoder.github.io/MediaCrawler/)
->
-
-# Knowledge services from the author
-> If you want to get started quickly, learn how to use this project and its source-code architecture, improve your programming skills, or understand the MediaCrawlerPro source design, take a look at my paid knowledge column.
-
-[About the author's paid knowledge column](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
-
-
----
-## ⭐ Star History
-
-If this project helps you, please give it a ⭐ Star so more people can discover MediaCrawler!
-
-[![Star History Chart](https://api.star-history.com/svg?repos=NanmiCoder/MediaCrawler&type=Date)](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
-
### 💰 Sponsors
@@ -288,6 +257,38 @@ Thordata is a global proxy-IP solution provider that supports large-scale collection of public web
- Email: `relakkes@gmail.com`
+## 🤝 Community & Support
+
+### 💬 Chat Groups
+- **WeChat group**: [Click to join](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
+
+### 📚 Docs & Tutorials
+- **Online docs**: [Complete MediaCrawler documentation](https://nanmicoder.github.io/MediaCrawler/)
+- **Crawler tutorial**: [CrawlerTutorial free course](https://github.com/NanmiCoder/CrawlerTutorial)
+
+
+# Other FAQs: see the online docs
+>
+> The online docs cover usage, FAQs, how to join the project chat group, and more.
+> [MediaCrawler online docs](https://nanmicoder.github.io/MediaCrawler/)
+>
+
+# Knowledge services from the author
+> If you want to get started quickly, learn how to use this project and its source-code architecture, improve your programming skills, or understand the MediaCrawlerPro source design, take a look at my paid knowledge column.
+
+[About the author's paid knowledge column](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
+
+
+---
+
+## ⭐ Star History
+
+If this project helps you, please give it a ⭐ Star so more people can discover MediaCrawler!
+
+[![Star History Chart](https://api.star-history.com/svg?repos=NanmiCoder/MediaCrawler&type=Date)](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
+
+
+
## 📚 References
- **Xiaohongshu client**: [ReaJason's xhs repository](https://github.com/ReaJason/xhs)
diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py
index 982373a..f5bd02e 100644
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -26,6 +26,7 @@ from html import unescape
from .exception import DataFetchError, IPBlockError
from .field import SearchNoteType, SearchSortType
from .help import get_search_id, sign
+from .extractor import XiaoHongShuExtractor
class XiaoHongShuClient(AbstractApiClient):
@@ -50,6 +51,7 @@ class XiaoHongShuClient(AbstractApiClient):
self.NOTE_ABNORMAL_CODE = -510001
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
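+        # Delegate HTML parsing to the dedicated extractor in media_platform/xhs/extractor.py.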
+ self._extractor = XiaoHongShuExtractor()
async def _pre_headers(self, url: str, data=None) -> Dict:
"""
@@ -61,7 +63,9 @@ class XiaoHongShuClient(AbstractApiClient):
Returns:
"""
- encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
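+        # window._webmsxyw is the signing routine shipped in the page itself;
+        # evaluating it inside the browser context yields the encrypted params
+        # that feed the x-s / x-t request signature below.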
+ encrypt_params = await self.playwright_page.evaluate(
+ "([url, data]) => window._webmsxyw(url,data)", [url, data]
+ )
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
signs = sign(
a1=self.cookie_dict.get("a1", ""),
@@ -128,7 +132,9 @@ class XiaoHongShuClient(AbstractApiClient):
if isinstance(params, dict):
final_uri = f"{uri}?" f"{urlencode(params)}"
headers = await self._pre_headers(final_uri)
- return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers)
+ return await self.request(
+ method="GET", url=f"{self._host}{final_uri}", headers=headers
+ )
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
"""
@@ -156,12 +162,18 @@ class XiaoHongShuClient(AbstractApiClient):
response = await client.request("GET", url, timeout=self.timeout)
response.raise_for_status()
if not response.reason_phrase == "OK":
- utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
+ utils.logger.error(
+ f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}"
+ )
return None
else:
return response.content
-        except httpx.HTTPError as exc:  # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
-            utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}")  # keep the original exception type name for debugging
+        except httpx.HTTPError as exc:
+            # Something went wrong while calling httpx.request: a connection
+            # error, a client/server error, or a non-2xx response status code.
+            # Keep the original exception type name to help developers debug.
+            utils.logger.error(
+                f"[XiaoHongShuClient.get_note_media] {exc.__class__.__name__} for {exc.request.url} - {exc}"
+            )
return None
async def pong(self) -> bool:
@@ -178,7 +190,9 @@ class XiaoHongShuClient(AbstractApiClient):
if note_card.get("items"):
ping_flag = True
except Exception as e:
- utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...")
+ utils.logger.error(
+ f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again..."
+ )
ping_flag = False
return ping_flag
@@ -249,9 +263,7 @@ class XiaoHongShuClient(AbstractApiClient):
data = {
"source_note_id": note_id,
"image_formats": ["jpg", "webp", "avif"],
- "extra": {
- "need_body_topic": 1
- },
+ "extra": {"need_body_topic": 1},
"xsec_source": xsec_source,
"xsec_token": xsec_token,
}
@@ -261,7 +273,9 @@ class XiaoHongShuClient(AbstractApiClient):
res_dict: Dict = res["items"][0]["note_card"]
return res_dict
        # When crawling too frequently, some notes may return results while others may not
- utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}")
+ utils.logger.error(
+ f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}"
+ )
return dict()
async def get_note_comments(
@@ -345,15 +359,19 @@ class XiaoHongShuClient(AbstractApiClient):
comments_has_more = True
comments_cursor = ""
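+        # Page through comments with the server-side cursor until it is exhausted or max_count is reached.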
while comments_has_more and len(result) < max_count:
- comments_res = await self.get_note_comments(note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor)
+ comments_res = await self.get_note_comments(
+ note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor
+ )
comments_has_more = comments_res.get("has_more", False)
comments_cursor = comments_res.get("cursor", "")
if "comments" not in comments_res:
- utils.logger.info(f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
+ utils.logger.info(
+ f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}"
+ )
break
comments = comments_res["comments"]
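+            # Trim the final page so the accumulated total never exceeds max_count.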
if len(result) + len(comments) > max_count:
- comments = comments[:max_count - len(result)]
+ comments = comments[: max_count - len(result)]
if callback:
await callback(note_id, comments)
await asyncio.sleep(crawl_interval)
@@ -386,7 +404,9 @@ class XiaoHongShuClient(AbstractApiClient):
"""
if not config.ENABLE_GET_SUB_COMMENTS:
- utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
+            utils.logger.info(
+                "[XiaoHongShuClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
+            )
return []
result = []
@@ -413,12 +433,16 @@ class XiaoHongShuClient(AbstractApiClient):
)
if comments_res is None:
- utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}")
+ utils.logger.info(
+ f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}"
+ )
continue
sub_comment_has_more = comments_res.get("has_more", False)
sub_comment_cursor = comments_res.get("cursor", "")
if "comments" not in comments_res:
- utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
+ utils.logger.info(
+ f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}"
+ )
break
comments = comments_res["comments"]
if callback:
@@ -434,16 +458,10 @@ class XiaoHongShuClient(AbstractApiClient):
eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
"""
uri = f"/user/profile/{user_id}"
-        html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
-        match = re.search(r"<script>window.__INITIAL_STATE__=(.+)</script>", html_content, re.M)
-
-        if match is None:
-            return {}
-
-        info = json.loads(match.group(1).replace(":undefined", ":null"), strict=False)
-        if info is None:
-            return {}
-        return info.get("user").get("userPageData")
+        html_content = await self.request(
+            "GET", self._domain + uri, return_response=True, headers=self.headers
+        )
+        return self._extractor.extract_creator_info_from_html(html_content)
@@ ... @@ class XiaoHongShuClient(AbstractApiClient):
-        def get_note_dict(html):
-            state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
-
-            if state != "{}":
-                note_dict = transform_json_keys(state)
-                return note_dict["note"]["note_detail_map"][note_id]["note"]
-            return {}
-
-        try:
-            return get_note_dict(html)
-        except:
-            return None
+        return self._extractor.extract_note_detail_from_html(note_id, html)
diff --git a/media_platform/xhs/extractor.py b/media_platform/xhs/extractor.py
new file mode 100644
index 0000000..b8d7540
--- /dev/null
+++ b/media_platform/xhs/extractor.py
@@ -0,0 +1,60 @@
+# Disclaimer: this code is for learning and research purposes only. Users must:
+# 1. Not use it for any commercial purpose.
+# 2. Comply with the target platform's terms of service and robots.txt rules.
+# 3. Not crawl at large scale or disrupt the platform's normal operation.
+# 4. Keep request rates reasonable to avoid placing undue load on the platform.
+# 5. Not use it for any illegal or improper purpose.
+#
+# See the LICENSE file in the project root for the full license terms.
+# Using this code constitutes agreement to the above principles and all terms in LICENSE.
+
+import json
+import re
+from typing import Dict, Optional
+
+import humps
+
+
+class XiaoHongShuExtractor:
+ def __init__(self):
+ pass
+
+ def extract_note_detail_from_html(self, note_id: str, html: str) -> Optional[Dict]:
+        """Extract the note detail from the page HTML.
+
+        Args:
+            note_id (str): the note ID
+            html (str): the HTML string
+
+        Returns:
+            Dict: note detail dict, or None if it cannot be extracted
+        """
+ if "noteDetailMap" not in html:
+            # Either a captcha page was returned or the note does not exist.
+ return None
+
+        state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[
+            0
+        ].replace("undefined", '""')
+ if state != "{}":
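+            # humps.decamelize recursively converts the camelCase keys of the
+            # page state (e.g. noteDetailMap) into snake_case (note_detail_map).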
+ note_dict = humps.decamelize(json.loads(state))
+ return note_dict["note"]["note_detail_map"][note_id]["note"]
+ return None
+
+ def extract_creator_info_from_html(self, html: str) -> Optional[Dict]:
+        """Extract the creator's profile info from the page HTML.
+
+        Args:
+            html (str): the HTML string
+
+        Returns:
+            Dict: creator info dict, or None if it cannot be extracted
+        """
+        match = re.search(
+            r"<script>window.__INITIAL_STATE__=(.+)</script>", html, re.M
+        )
+        if match is None:
+            return None
+        # ":undefined" is not valid JSON; normalize it before parsing.
+        info = json.loads(match.group(1).replace(":undefined", ":null"), strict=False)
+        if info is None:
+            return None
+        return info.get("user", {}).get("userPageData")
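+
+
+if __name__ == "__main__":
+    # Minimal smoke test with a synthetic page: real pages embed the full app
+    # state in window.__INITIAL_STATE__; the note id "abc123" here is made up.
+    sample_html = (
+        '<script>window.__INITIAL_STATE__='
+        '{"note":{"noteDetailMap":{"abc123":{"note":{"title":"demo"}}}}}</script>'
+    )
+    extractor = XiaoHongShuExtractor()
+    print(extractor.extract_note_detail_from_html("abc123", sample_html))  # {'title': 'demo'}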