fix: xhs帖子详情问题更新

This commit is contained in:
Relakkes
2024-10-20 00:59:08 +08:00
parent 9fe3e47b0f
commit 03e393949a
6 changed files with 85 additions and 36 deletions

View File

@@ -21,6 +21,7 @@ from tenacity import RetryError
import config
from base.base_crawler import AbstractCrawler
from model.m_xiaohongshu import NoteUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
from tools import utils
@@ -29,6 +30,7 @@ from var import crawler_type_var, source_keyword_var
from .client import XiaoHongShuClient
from .exception import DataFetchError
from .field import SearchSortType
from .help import parse_note_info_from_note_url
from .login import XiaoHongShuLogin
@@ -191,48 +193,40 @@ class XiaoHongShuCrawler(AbstractCrawler):
await xhs_store.update_xhs_note(note_detail)
async def get_specified_notes(self):
"""Get the information and comments of the specified post"""
"""
Get the information and comments of the specified post
must be specified note_id, xsec_source, xsec_token⚠
Returns:
async def get_note_detail_from_html_task(note_id: str, semaphore: asyncio.Semaphore) -> Dict:
async with semaphore:
try:
_note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
if not _note_detail:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error, note_id: {note_id}")
return {}
return _note_detail
except DataFetchError as ex:
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error: {ex}")
return {}
except KeyError as ex:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_from_html] have not fund note detail note_id:{note_id}, err: {ex}")
return {}
except RetryError as ex:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_from_html] Retry error, note_id:{note_id}, err: {ex}")
get_note_detail_task_list = [
get_note_detail_from_html_task(note_id=note_id, semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)) for
note_id in config.XHS_SPECIFIED_ID_LIST
]
"""
get_note_detail_task_list = []
for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}")
crawler_task = self.get_note_detail_async_task(
note_id=note_url_info.note_id,
xsec_source=note_url_info.xsec_source,
xsec_token=note_url_info.xsec_token,
semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
)
get_note_detail_task_list.append(crawler_task)
need_get_comment_note_ids = []
note_details = await asyncio.gather(*get_note_detail_task_list)
for note_detail in note_details:
if note_detail:
need_get_comment_note_ids.append(note_detail.get("note_id"))
need_get_comment_note_ids.append(note_detail.get("note_id", ""))
await xhs_store.update_xhs_note(note_detail)
await self.batch_get_note_comments(need_get_comment_note_ids)
async def get_note_detail_async_task(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \
Optional[Dict]:
"""Get note detail"""
async with semaphore:
try:
# note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token)
# note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
if not note_detail:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")