mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-03-04 21:20:47 +08:00
feat: #823
This commit is contained in:
@@ -24,7 +24,7 @@ from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_not_exception_type
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
@@ -34,7 +34,7 @@ from tools import utils
|
||||
if TYPE_CHECKING:
|
||||
from proxy.proxy_ip_pool import ProxyIpPool
|
||||
|
||||
from .exception import DataFetchError, IPBlockError
|
||||
from .exception import DataFetchError, IPBlockError, NoteNotFoundError
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import get_search_id
|
||||
from .extractor import XiaoHongShuExtractor
|
||||
@@ -60,6 +60,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self._domain = "https://www.xiaohongshu.com"
|
||||
self.IP_ERROR_STR = "Network connection error, please check network settings or restart"
|
||||
self.IP_ERROR_CODE = 300012
|
||||
self.NOTE_NOT_FOUND_CODE = -510000
|
||||
self.NOTE_ABNORMAL_STR = "Note status abnormal, please check later"
|
||||
self.NOTE_ABNORMAL_CODE = -510001
|
||||
self.playwright_page = playwright_page
|
||||
@@ -109,7 +110,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.headers.update(headers)
|
||||
return self.headers
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1), retry=retry_if_not_exception_type(NoteNotFoundError))
|
||||
async def request(self, method, url, **kwargs) -> Union[str, Any]:
|
||||
"""
|
||||
Wrapper for httpx common request method, processes request response
|
||||
@@ -144,6 +145,8 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
return data.get("data", data.get("success", {}))
|
||||
elif data["code"] == self.IP_ERROR_CODE:
|
||||
raise IPBlockError(self.IP_ERROR_STR)
|
||||
elif data["code"] in (self.NOTE_NOT_FOUND_CODE, self.NOTE_ABNORMAL_CODE):
|
||||
raise NoteNotFoundError(f"Note not found or abnormal, code: {data['code']}")
|
||||
else:
|
||||
err_msg = data.get("msg", None) or f"{response.text}"
|
||||
raise DataFetchError(err_msg)
|
||||
|
||||
@@ -42,7 +42,7 @@ from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import XiaoHongShuClient
|
||||
from .exception import DataFetchError
|
||||
from .exception import DataFetchError, NoteNotFoundError
|
||||
from .field import SearchSortType
|
||||
from .help import parse_note_info_from_note_url, parse_creator_info_from_url, get_search_id
|
||||
from .login import XiaoHongShuLogin
|
||||
@@ -308,6 +308,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
|
||||
return note_detail
|
||||
|
||||
except NoteNotFoundError as ex:
|
||||
utils.logger.warning(f"[XiaoHongShuCrawler.get_note_detail_async_task] Note not found: {note_id}, {ex}")
|
||||
return None
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}")
|
||||
return None
|
||||
|
||||
@@ -27,3 +27,7 @@ class DataFetchError(RequestError):
|
||||
|
||||
class IPBlockError(RequestError):
|
||||
"""Requests were sent so fast that the server blocked our IP"""
|
||||
|
||||
|
||||
class NoteNotFoundError(RequestError):
|
||||
"""Note does not exist or is abnormal"""
|
||||
|
||||
Reference in New Issue
Block a user