mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-02-06 23:21:33 +08:00
feat: xhs support creator url link
This commit is contained in:
@@ -26,7 +26,7 @@ from tenacity import RetryError
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||
from model.m_xiaohongshu import NoteUrlInfo
|
||||
from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import xhs as xhs_store
|
||||
from tools import utils
|
||||
@@ -36,7 +36,7 @@ from var import crawler_type_var, source_keyword_var
|
||||
from .client import XiaoHongShuClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchSortType
|
||||
from .help import parse_note_info_from_note_url, get_search_id
|
||||
from .help import parse_note_info_from_note_url, parse_creator_info_from_url, get_search_id
|
||||
from .login import XiaoHongShuLogin
|
||||
|
||||
|
||||
@@ -174,11 +174,24 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
async def get_creators_and_notes(self) -> None:
|
||||
"""Get creator's notes and retrieve their comment information."""
|
||||
utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
|
||||
for user_id in config.XHS_CREATOR_ID_LIST:
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
|
||||
if createor_info:
|
||||
await xhs_store.save_creator(user_id, creator=createor_info)
|
||||
for creator_url in config.XHS_CREATOR_ID_LIST:
|
||||
try:
|
||||
# Parse creator URL to get user_id and security tokens
|
||||
creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Parse creator URL info: {creator_info}")
|
||||
user_id = creator_info.user_id
|
||||
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.xhs_client.get_creator_info(
|
||||
user_id=user_id,
|
||||
xsec_token=creator_info.xsec_token,
|
||||
xsec_source=creator_info.xsec_source
|
||||
)
|
||||
if createor_info:
|
||||
await xhs_store.save_creator(user_id, creator=createor_info)
|
||||
except ValueError as e:
|
||||
utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] Failed to parse creator URL: {e}")
|
||||
continue
|
||||
|
||||
# Use fixed crawling interval
|
||||
crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
|
||||
@@ -271,7 +284,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
|
||||
try:
|
||||
note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
|
||||
except RetryError as e:
|
||||
except RetryError:
|
||||
pass
|
||||
|
||||
if not note_detail:
|
||||
|
||||
Reference in New Issue
Block a user