diff --git a/README.md b/README.md
index b33cd5b..e7b075f 100644
--- a/README.md
+++ b/README.md
@@ -51,8 +51,6 @@
 | 知乎 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 
-
-🔗 🚀 MediaCrawlerPro 重磅发布!更多的功能,更好的架构设计!
 
 ### 🚀 MediaCrawlerPro 重磅发布!
@@ -77,7 +75,7 @@
 - [ ] **基于自媒体平台的AI Agent正在开发中 🚀🚀**
 
 点击查看:[MediaCrawlerPro 项目主页](https://github.com/MediaCrawlerPro) 更多介绍
 
-
+
 
 ## 🚀 快速开始
@@ -216,40 +214,11 @@
 uv run main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
 
 uv run main.py --platform xhs --lt qrcode --type search --save_data_option db
 ```
 
----
 
 [🚀 MediaCrawlerPro 重磅发布 🚀!更多的功能,更好的架构设计!](https://github.com/MediaCrawlerPro)
 
-## 🤝 社区与支持
-
-### 💬 交流群组
-- **微信交流群**:[点击加入](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
-
-### 📚 文档与教程
-- **在线文档**:[MediaCrawler 完整文档](https://nanmicoder.github.io/MediaCrawler/)
-- **爬虫教程**:[CrawlerTutorial 免费教程](https://github.com/NanmiCoder/CrawlerTutorial)
-
-
-# 其他常见问题可以查看在线文档
->
-> 在线文档包含使用方法、常见问题、加入项目交流群等。
-> [MediaCrawler在线文档](https://nanmicoder.github.io/MediaCrawler/)
->
-
-# 作者提供的知识服务
-> 如果想快速入门和学习该项目的使用、源码架构设计等、学习编程技术、亦或者想了解MediaCrawlerPro的源代码设计可以看下我的知识付费栏目。
-
-[作者的知识付费栏目介绍](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
-
-
----
-## ⭐ Star 趋势图
-
-如果这个项目对您有帮助,请给个 ⭐ Star 支持一下,让更多的人看到 MediaCrawler!
-
-[![Star History Chart](https://api.star-history.com/svg?repos=NanmiCoder/MediaCrawler&type=Date)](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
-
 
 ### 💰 赞助商展示
@@ -288,6 +257,38 @@ Thordata是全球代理IP解决方案提供商,支持大规模采集公共网
 - 邮箱:`relakkes@gmail.com`
 
 
+## 🤝 社区与支持
+
+### 💬 交流群组
+- **微信交流群**:[点击加入](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
+
+### 📚 文档与教程
+- **在线文档**:[MediaCrawler 完整文档](https://nanmicoder.github.io/MediaCrawler/)
+- **爬虫教程**:[CrawlerTutorial 免费教程](https://github.com/NanmiCoder/CrawlerTutorial)
+
+
+# 其他常见问题可以查看在线文档
+>
+> 在线文档包含使用方法、常见问题、加入项目交流群等。
+> [MediaCrawler在线文档](https://nanmicoder.github.io/MediaCrawler/)
+>
+
+# 作者提供的知识服务
+> 如果想快速入门和学习该项目的使用、源码架构设计等、学习编程技术、亦或者想了解MediaCrawlerPro的源代码设计可以看下我的知识付费栏目。
+
+[作者的知识付费栏目介绍](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
+
+
+---
+
+## ⭐ Star 趋势图
+
+如果这个项目对您有帮助,请给个 ⭐ Star 支持一下,让更多的人看到 MediaCrawler!
+
+[![Star History Chart](https://api.star-history.com/svg?repos=NanmiCoder/MediaCrawler&type=Date)](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
+
+
+
 ## 📚 参考
 
 - **小红书客户端**:[ReaJason 的 xhs 仓库](https://github.com/ReaJason/xhs)
diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py
index 982373a..f5bd02e 100644
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -26,6 +26,7 @@ from html import unescape
 from .exception import DataFetchError, IPBlockError
 from .field import SearchNoteType, SearchSortType
 from .help import get_search_id, sign
+from .extractor import XiaoHongShuExtractor
 
 
 class XiaoHongShuClient(AbstractApiClient):
@@ -50,6 +51,7 @@ class XiaoHongShuClient(AbstractApiClient):
         self.NOTE_ABNORMAL_CODE = -510001
         self.playwright_page = playwright_page
         self.cookie_dict = cookie_dict
+        self._extractor = XiaoHongShuExtractor()
 
     async def _pre_headers(self, url: str, data=None) -> Dict:
         """
@@ -61,7 +63,9 @@
         Returns:
 
         """
-        encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
+        encrypt_params = await self.playwright_page.evaluate(
+            "([url, data]) => window._webmsxyw(url,data)", [url, data]
+        )
         local_storage = await self.playwright_page.evaluate("() => window.localStorage")
         signs = sign(
             a1=self.cookie_dict.get("a1", ""),
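A note for reviewers on the hunk above: `_pre_headers` is the request-signing path. It evaluates the site's own `window._webmsxyw(url, data)` inside the logged-in Playwright page, reads `window.localStorage`, and feeds both into `sign()` along with the `a1` cookie. Below is a minimal standalone sketch of that flow, assuming `sign()` also accepts `b1`/`x_s`/`x_t` keyword arguments in the part of the call the hunk does not show:

```python
from typing import Dict, Optional

from playwright.async_api import Page

from media_platform.xhs.help import sign  # helper shown in the imports above


async def sign_request(
    page: Page, cookie_dict: Dict[str, str], uri: str, data: Optional[dict] = None
) -> Dict:
    # The site's own JS signer runs in the logged-in browser context, so the
    # signature matches whatever anti-bot script version the page serves.
    encrypt_params = await page.evaluate(
        "([url, data]) => window._webmsxyw(url,data)", [uri, data]
    )
    local_storage = await page.evaluate("() => window.localStorage")
    return sign(
        a1=cookie_dict.get("a1", ""),  # login cookie, shown in the hunk above
        b1=local_storage.get("b1", ""),  # assumed: fingerprint kept in localStorage
        x_s=encrypt_params.get("X-s", ""),  # assumed: signature from _webmsxyw
        x_t=str(encrypt_params.get("X-t", "")),  # assumed: timestamp from _webmsxyw
    )
```

Evaluating the signer in the page, instead of reimplementing it in Python, keeps the client working when the anti-bot script changes.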
f"{urlencode(params)}" headers = await self._pre_headers(final_uri) - return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers) + return await self.request( + method="GET", url=f"{self._host}{final_uri}", headers=headers + ) async def post(self, uri: str, data: dict, **kwargs) -> Dict: """ @@ -156,12 +162,18 @@ class XiaoHongShuClient(AbstractApiClient): response = await client.request("GET", url, timeout=self.timeout) response.raise_for_status() if not response.reason_phrase == "OK": - utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}") + utils.logger.error( + f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}" + ) return None else: return response.content - except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx - utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试 + except ( + httpx.HTTPError + ) as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx + utils.logger.error( + f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}" + ) # 保留原始异常类型名称,以便开发者调试 return None async def pong(self) -> bool: @@ -178,7 +190,9 @@ class XiaoHongShuClient(AbstractApiClient): if note_card.get("items"): ping_flag = True except Exception as e: - utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...") + utils.logger.error( + f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again..." + ) ping_flag = False return ping_flag @@ -249,9 +263,7 @@ class XiaoHongShuClient(AbstractApiClient): data = { "source_note_id": note_id, "image_formats": ["jpg", "webp", "avif"], - "extra": { - "need_body_topic": 1 - }, + "extra": {"need_body_topic": 1}, "xsec_source": xsec_source, "xsec_token": xsec_token, } @@ -261,7 +273,9 @@ class XiaoHongShuClient(AbstractApiClient): res_dict: Dict = res["items"][0]["note_card"] return res_dict # 爬取频繁了可能会出现有的笔记能有结果有的没有 - utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}") + utils.logger.error( + f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}" + ) return dict() async def get_note_comments( @@ -345,15 +359,19 @@ class XiaoHongShuClient(AbstractApiClient): comments_has_more = True comments_cursor = "" while comments_has_more and len(result) < max_count: - comments_res = await self.get_note_comments(note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor) + comments_res = await self.get_note_comments( + note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor + ) comments_has_more = comments_res.get("has_more", False) comments_cursor = comments_res.get("cursor", "") if "comments" not in comments_res: - utils.logger.info(f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}") + utils.logger.info( + f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}" + ) break comments = comments_res["comments"] if len(result) + len(comments) > max_count: - comments = comments[:max_count - len(result)] + comments = comments[: max_count - len(result)] if callback: await callback(note_id, comments) await asyncio.sleep(crawl_interval) @@ -386,7 +404,9 @@ class 
@@ -386,7 +404,9 @@ class XiaoHongShuClient(AbstractApiClient):
         """
         if not config.ENABLE_GET_SUB_COMMENTS:
-            utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
+            utils.logger.info(
+                "[XiaoHongShuClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
+            )
             return []
 
         result = []
@@ -413,12 +433,16 @@
                 )
                 if comments_res is None:
-                    utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}")
+                    utils.logger.info(
+                        f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}"
+                    )
                     continue
                 sub_comment_has_more = comments_res.get("has_more", False)
                 sub_comment_cursor = comments_res.get("cursor", "")
                 if "comments" not in comments_res:
-                    utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
+                    utils.logger.info(
+                        f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}"
+                    )
                     break
                 comments = comments_res["comments"]
                 if callback:
@@ -434,16 +458,10 @@
         eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
         """
         uri = f"/user/profile/{user_id}"
-        html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
-        match = re.search(r"
-            state = re.findall(r"window.__INITIAL_STATE__=({.*})", html)[0].replace("undefined", '""')
-
-            if state != "{}":
-                note_dict = transform_json_keys(state)
-                return note_dict["note"]["note_detail_map"][note_id]["note"]
-            return {}
-
-        try:
-            return get_note_dict(html)
-        except:
-            return None
+        return self._extractor.extract_note_detail_from_html(note_id, html)
diff --git a/media_platform/xhs/extractor.py b/media_platform/xhs/extractor.py
new file mode 100644
index 0000000..b8d7540
--- /dev/null
+++ b/media_platform/xhs/extractor.py
@@ -0,0 +1,60 @@
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+import json
+import re
+from typing import Dict, Optional
+
+import humps
+
+
+class XiaoHongShuExtractor:
+    def __init__(self):
+        pass
+
+    def extract_note_detail_from_html(self, note_id: str, html: str) -> Optional[Dict]:
+        """从html中提取笔记详情
+
+        Args:
+            note_id (str): 笔记ID
+            html (str): html字符串
+
+        Returns:
+            Dict: 笔记详情字典
+        """
+        if "noteDetailMap" not in html:
+            # 这种情况要么是出了验证码了,要么是笔记不存在
+            return None
+
+        state = re.findall(r"window.__INITIAL_STATE__=({.*})", html)[
+            0
+        ].replace("undefined", '""')
+        if state != "{}":
+            note_dict = humps.decamelize(json.loads(state))
+            return note_dict["note"]["note_detail_map"][note_id]["note"]
+        return None
+
+    def extract_creator_info_from_html(self, html: str) -> Optional[Dict]:
+        """从html中提取用户信息
+
+        Args:
+            html (str): html字符串
+
+        Returns:
+            Dict: 用户信息字典
+        """
+        match = re.search(
+            r"
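`extract_creator_info_from_html` is cut off above, but the note-detail path is complete, so the new extractor can already be exercised in isolation. A quick usage sketch with made-up HTML (the note id and payload are illustrative only); `humps.decamelize` is what turns the page state's camelCase keys such as `noteDetailMap` into the snake_case `note_detail_map` that the code indexes:

```python
from media_platform.xhs.extractor import XiaoHongShuExtractor

# Fabricated page state for illustration; a real page embeds a much larger
# window.__INITIAL_STATE__ object in a <script> tag.
sample_html = (
    '<script>window.__INITIAL_STATE__={"note":{"noteDetailMap":'
    '{"demo123":{"note":{"noteId":"demo123","title":"hello"}}}}}</script>'
)

extractor = XiaoHongShuExtractor()
note = extractor.extract_note_detail_from_html("demo123", sample_html)
print(note)  # {'note_id': 'demo123', 'title': 'hello'}
```

Moving this parsing out of `XiaoHongShuClient` into a dependency-free class is what makes a test like this possible without a browser or network.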