fix: xhs帖子详情问题更新

This commit is contained in:
Relakkes
2024-10-20 00:59:08 +08:00
parent 9fe3e47b0f
commit 03e393949a
6 changed files with 85 additions and 36 deletions

View File

@@ -99,6 +99,13 @@ class XiaoHongShuClient(AbstractApiClient):
**kwargs
)
if response.status_code == 471 or response.status_code == 461:
# someday someone maybe will bypass captcha
verify_type = response.headers['Verifytype']
verify_uuid = response.headers['Verifyuuid']
raise Exception(
f"出现验证码请求失败Verifytype: {verify_type}Verifyuuid: {verify_uuid}, Response: {response}")
if return_response:
return response.text
data: Dict = response.json()
@@ -228,8 +235,8 @@ class XiaoHongShuClient(AbstractApiClient):
"source_note_id": note_id,
"image_formats": ["jpg", "webp", "avif"],
"extra": {"need_body_topic": 1},
# "xsec_source": xsec_source,
# "xsec_token": xsec_token
"xsec_source": xsec_source,
"xsec_token": xsec_token
}
uri = "/api/sns/web/v1/feed"
res = await self.post(uri, data)
@@ -454,13 +461,15 @@ class XiaoHongShuClient(AbstractApiClient):
return await self.post(uri, data=data, return_response=True)
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def get_note_by_id_from_html(self, note_id: str):
async def get_note_by_id_from_html(self, note_id: str, xsec_source: str, xsec_token: str) -> Dict:
"""
通过解析网页版的笔记详情页HTML获取笔记详情, 该接口可能会出现失败的情况这里尝试重试3次
copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
thanks for ReaJason
Args:
note_id:
xsec_source:
xsec_token:
Returns:
@@ -488,7 +497,7 @@ class XiaoHongShuClient(AbstractApiClient):
dict_new[new_key] = value
return dict_new
url = "https://www.xiaohongshu.com/explore/" + note_id
url = "https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}"
html = await self.request(method="GET", url=url, return_response=True, headers=self.headers)
state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
if state != "{}":

View File

@@ -21,6 +21,7 @@ from tenacity import RetryError
import config
from base.base_crawler import AbstractCrawler
from model.m_xiaohongshu import NoteUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
from tools import utils
@@ -29,6 +30,7 @@ from var import crawler_type_var, source_keyword_var
from .client import XiaoHongShuClient
from .exception import DataFetchError
from .field import SearchSortType
from .help import parse_note_info_from_note_url
from .login import XiaoHongShuLogin
@@ -191,48 +193,40 @@ class XiaoHongShuCrawler(AbstractCrawler):
await xhs_store.update_xhs_note(note_detail)
async def get_specified_notes(self):
    """
    Fetch the details and comments of the notes listed in config.XHS_SPECIFIED_NOTE_URL_LIST.

    Each configured entry is a full note URL; note_id, xsec_source and
    xsec_token are parsed out of it and passed on to the detail request.

    Returns:
        None. Every non-empty note detail is saved via xhs_store.update_xhs_note
        and its note id is handed to batch_get_note_comments.
    """
    # A single semaphore shared by all tasks, so that MAX_CONCURRENCY_NUM really
    # bounds the number of concurrent detail requests. Creating a new semaphore
    # per task would give every task its own free permit and limit nothing.
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)

    get_note_detail_task_list = []
    for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
        note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
        utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}")
        crawler_task = self.get_note_detail_async_task(
            note_id=note_url_info.note_id,
            xsec_source=note_url_info.xsec_source,
            xsec_token=note_url_info.xsec_token,
            semaphore=semaphore,
        )
        get_note_detail_task_list.append(crawler_task)

    need_get_comment_note_ids = []
    note_details = await asyncio.gather(*get_note_detail_task_list)
    for note_detail in note_details:
        # Tasks yield a falsy value when the detail could not be fetched; skip those.
        if note_detail:
            need_get_comment_note_ids.append(note_detail.get("note_id", ""))
            await xhs_store.update_xhs_note(note_detail)
    await self.batch_get_note_comments(need_get_comment_note_ids)
async def get_note_detail_async_task(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \
Optional[Dict]:
"""Get note detail"""
async with semaphore:
try:
# note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token)
# note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
if not note_detail:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")

View File

@@ -15,6 +15,9 @@ import random
import time
import urllib.parse
from model.m_xiaohongshu import NoteUrlInfo
from tools.crawler_util import extract_url_params_to_dict
def sign(a1="", b1="", x_s="", x_t=""):
"""
@@ -288,6 +291,21 @@ def get_trace_id(img_url: str):
return f"spectrum/{img_url.split('/')[-1]}" if img_url.find("spectrum") != -1 else img_url.split("/")[-1]
def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
    """
    Parse the note information out of a Xiaohongshu note URL.

    Args:
        url: full note URL, e.g.
            "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"

    Returns:
        NoteUrlInfo whose note_id is the last segment of the URL path and whose
        xsec_token / xsec_source come from the query parameters of the same
        name ("" when a parameter is absent).
    """
    # Take the id from the URL *path* only. Splitting the raw URL on "/" would
    # pick up query-string or fragment text whenever those contain a "/", and a
    # trailing "/" on the path would produce an empty id.
    path = urllib.parse.urlsplit(url).path
    note_id = path.rstrip("/").split("/")[-1]
    params = extract_url_params_to_dict(url)
    xsec_token = params.get("xsec_token", "")
    xsec_source = params.get("xsec_source", "")
    return NoteUrlInfo(note_id=note_id, xsec_token=xsec_token, xsec_source=xsec_source)
if __name__ == '__main__':
_img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
# 获取一个图片地址在多个cdn下的url地址