diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index f1df0de..c510ee8 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -24,7 +24,7 @@ from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page -from tenacity import retry, stop_after_attempt, wait_fixed +from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_not_exception_type import config from base.base_crawler import AbstractApiClient @@ -34,7 +34,7 @@ from tools import utils if TYPE_CHECKING: from proxy.proxy_ip_pool import ProxyIpPool -from .exception import DataFetchError, IPBlockError +from .exception import DataFetchError, IPBlockError, NoteNotFoundError from .field import SearchNoteType, SearchSortType from .help import get_search_id from .extractor import XiaoHongShuExtractor @@ -60,6 +60,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): self._domain = "https://www.xiaohongshu.com" self.IP_ERROR_STR = "Network connection error, please check network settings or restart" self.IP_ERROR_CODE = 300012 + self.NOTE_NOT_FOUND_CODE = -510000 self.NOTE_ABNORMAL_STR = "Note status abnormal, please check later" self.NOTE_ABNORMAL_CODE = -510001 self.playwright_page = playwright_page @@ -109,7 +110,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): self.headers.update(headers) return self.headers - @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) + @retry(stop=stop_after_attempt(3), wait=wait_fixed(1), retry=retry_if_not_exception_type(NoteNotFoundError)) async def request(self, method, url, **kwargs) -> Union[str, Any]: """ Wrapper for httpx common request method, processes request response @@ -144,6 +145,8 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): return data.get("data", data.get("success", {})) elif data["code"] == self.IP_ERROR_CODE: raise IPBlockError(self.IP_ERROR_STR) + elif data["code"] in (self.NOTE_NOT_FOUND_CODE, self.NOTE_ABNORMAL_CODE): + raise NoteNotFoundError(f"Note not found or abnormal, code: {data['code']}") else: err_msg = data.get("msg", None) or f"{response.text}" raise DataFetchError(err_msg) diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 7047468..f797e79 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -42,7 +42,7 @@ from tools.cdp_browser import CDPBrowserManager from var import crawler_type_var, source_keyword_var from .client import XiaoHongShuClient -from .exception import DataFetchError +from .exception import DataFetchError, NoteNotFoundError from .field import SearchSortType from .help import parse_note_info_from_note_url, parse_creator_info_from_url, get_search_id from .login import XiaoHongShuLogin @@ -308,6 +308,9 @@ class XiaoHongShuCrawler(AbstractCrawler): return note_detail + except NoteNotFoundError as ex: + utils.logger.warning(f"[XiaoHongShuCrawler.get_note_detail_async_task] Note not found: {note_id}, {ex}") + return None except DataFetchError as ex: utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}") return None diff --git a/media_platform/xhs/exception.py b/media_platform/xhs/exception.py index 6f954d6..a956d93 100644 --- a/media_platform/xhs/exception.py +++ b/media_platform/xhs/exception.py @@ -27,3 +27,7 @@ class DataFetchError(RequestError): class IPBlockError(RequestError): """fetch so fast that the server block us ip""" + + +class NoteNotFoundError(RequestError): + """Note does not exist or is abnormal"""