fix: xhs指定笔记ID获取方式增加解析html方式,原来的由于xsec_token导致失效

This commit is contained in:
Relakkes
2024-08-11 22:37:10 +08:00
parent ec47c230a9
commit f371675d47
3 changed files with 100 additions and 24 deletions

View File

@@ -116,7 +116,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
break
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail(
self.get_note_detail_async_task(
note_id=post_item.get("id"),
xsec_source=post_item.get("xsec_source"),
xsec_token=post_item.get("xsec_token"),
@@ -163,7 +163,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail(
self.get_note_detail_async_task(
note_id=post_item.get("note_id"),
xsec_source=post_item.get("xsec_source"),
xsec_token=post_item.get("xsec_token"),
@@ -179,20 +179,41 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def get_specified_notes(self):
"""Get the information and comments of the specified post"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
fixed_xsec_token = "ABtXiOIX98byLlu-ju5dDq3tIc6uikcJrd3t7OYyqUbE4"
task_list = [
self.get_note_detail(note_id=note_id, xsec_source="pc_search", xsec_token=fixed_xsec_token,
semaphore=semaphore) for note_id in config.XHS_SPECIFIED_ID_LIST
]
note_details = await asyncio.gather(*task_list)
for note_detail in note_details:
if note_detail is not None:
await xhs_store.update_xhs_note(note_detail)
await self.get_notice_media(note_detail)
await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
async def get_note_detail(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \
async def get_note_detail_from_html_task(note_id: str, semaphore: asyncio.Semaphore) -> Dict:
async with semaphore:
try:
_note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
print("------------------------")
print(_note_detail)
print("------------------------")
if not _note_detail:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error, note_id: {note_id}")
return {}
return _note_detail
except DataFetchError as ex:
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error: {ex}")
return {}
except KeyError as ex:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_from_html] have not fund note detail note_id:{note_id}, err: {ex}")
return {}
get_note_detail_task_list = [
get_note_detail_from_html_task(note_id=note_id, semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)) for
note_id in config.XHS_SPECIFIED_ID_LIST
]
need_get_comment_note_ids = []
note_details = await asyncio.gather(*get_note_detail_task_list)
for note_detail in note_details:
if note_detail:
need_get_comment_note_ids.append(note_detail.get("note_id"))
await xhs_store.update_xhs_note(note_detail)
await self.batch_get_note_comments(need_get_comment_note_ids)
async def get_note_detail_async_task(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \
Optional[Dict]:
"""Get note detail"""
async with semaphore:
@@ -200,16 +221,16 @@ class XiaoHongShuCrawler(AbstractCrawler):
note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
if not note_detail:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail] Get note detail error, note_id: {note_id}")
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")
return None
note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
return note_detail
except DataFetchError as ex:
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}")
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}")
f"[XiaoHongShuCrawler.get_note_detail_async_task] have not fund note detail note_id:{note_id}, err: {ex}")
return None
async def batch_get_note_comments(self, note_list: List[str]):