diff --git a/config/base_config.py b/config/base_config.py
index c97a3c5..7302238 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -39,7 +39,7 @@ SAVE_LOGIN_STATE = True
 # Whether to enable CDP mode - crawl with the user's existing Chrome/Edge browser for better anti-detection
 # When enabled, the user's Chrome/Edge browser is detected and launched automatically and controlled via the CDP protocol
 # This uses a real browser environment, including the user's extensions, cookies and settings, greatly reducing the risk of detection
-ENABLE_CDP_MODE = False
+ENABLE_CDP_MODE = True
 
 # CDP debugging port, used to communicate with the browser
 # If the port is occupied, the system will automatically try the next available port
diff --git a/config/xhs_config.py b/config/xhs_config.py
index bc2d79b..485277a 100644
--- a/config/xhs_config.py
+++ b/config/xhs_config.py
@@ -15,9 +15,6 @@
 # Sort order; the concrete enum values are in media_platform/xhs/field.py
 SORT_TYPE = "popularity_descending"
 
-# User agent, custom User-Agent for xhs
-UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
-
 # List of specified note URLs; each URL must carry the xsec_token parameter
 XHS_SPECIFIED_NOTE_URL_LIST = [
     "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
diff --git a/main.py b/main.py
index ec95103..c074c7d 100644
--- a/main.py
+++ b/main.py
@@ -67,7 +67,8 @@ async def main():
 
 def cleanup():
     if crawler:
-        asyncio.run(crawler.close())
+        # asyncio.run(crawler.close())
+        pass
     if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
         asyncio.run(db.close())
diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py
index 01a833e..322a7f3 100644
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -95,7 +95,6 @@ class XiaoHongShuClient(AbstractApiClient):
         """
         # return response.text
         return_response = kwargs.pop("return_response", False)
-
         async with httpx.AsyncClient(proxies=self.proxies) as client:
             response = await client.request(method, url, timeout=self.timeout, **kwargs)
 
@@ -103,9 +102,9 @@ class XiaoHongShuClient(AbstractApiClient):
             # someday someone maybe will bypass captcha
             verify_type = response.headers["Verifytype"]
             verify_uuid = response.headers["Verifyuuid"]
-            raise Exception(
-                f"Captcha triggered, request failed, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}, Response: {response}"
-            )
+            msg = f"Captcha triggered, request failed, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}, Response: {response}"
+            utils.logger.error(msg)
+            raise Exception(msg)
 
         if return_response:
             return response.text
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index c7ee556..ed64591 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -51,11 +51,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
     def __init__(self) -> None:
         self.index_url = "https://www.xiaohongshu.com"
         # self.user_agent = utils.get_user_agent()
-        self.user_agent = (
-            config.UA
-            if config.UA
-            else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
-        )
+        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
         self.cdp_manager = None
 
     async def start(self) -> None:
@@ -91,17 +87,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
             )
             # stealth.min.js is a js script to prevent the website from detecting the crawler.
             await self.browser_context.add_init_script(path="libs/stealth.min.js")
-            # add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
-            await self.browser_context.add_cookies(
-                [
-                    {
-                        "name": "webId",
-                        "value": "xxx123",  # any value
-                        "domain": ".xiaohongshu.com",
-                        "path": "/",
-                    }
-                ]
-            )
             self.context_page = await self.browser_context.new_page()
             await self.context_page.goto(self.index_url)
 
@@ -152,7 +137,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
             page = 1
             search_id = get_search_id()
             while (
-                page - start_page + 1
+                    page - start_page + 1
             ) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 if page < start_page:
                     utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
@@ -294,11 +279,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
         await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
 
     async def get_note_detail_async_task(
-        self,
-        note_id: str,
-        xsec_source: str,
-        xsec_token: str,
-        semaphore: asyncio.Semaphore,
+            self,
+            note_id: str,
+            xsec_source: str,
+            xsec_token: str,
+            semaphore: asyncio.Semaphore,
     ) -> Optional[Dict]:
         """Get note detail
 
@@ -311,47 +296,31 @@ class XiaoHongShuCrawler(AbstractCrawler):
         Returns:
             Dict: note detail
         """
-        note_detail_from_html, note_detail_from_api = None, None
+        note_detail = None
         async with semaphore:
-            # When proxy is not enabled, increase the crawling interval
-            if config.ENABLE_IP_PROXY:
-                crawl_interval = random.random()
-            else:
-                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
             try:
                 utils.logger.info(
                     f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
                 )
-                # Try to fetch the web-page note detail directly, with cookies
-                note_detail_from_html: Optional[Dict] = (
-                    await self.xhs_client.get_note_by_id_from_html(
-                        note_id, xsec_source, xsec_token, enable_cookie=True
+
+                try:
+                    note_detail = await self.xhs_client.get_note_by_id(
+                        note_id, xsec_source, xsec_token
                     )
+                except RetryError as e:
+                    pass
+
+                if not note_detail:
+                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
+                                                                                 enable_cookie=False)
+                if not note_detail:
+                    raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
+
+                note_detail.update(
+                    {"xsec_token": xsec_token, "xsec_source": xsec_source}
                 )
-                time.sleep(crawl_interval)
-                if not note_detail_from_html:
-                    # If fetching the web-page note detail with cookies failed, retry without cookies
-                    note_detail_from_html = (
-                        await self.xhs_client.get_note_by_id_from_html(
-                            note_id, xsec_source, xsec_token, enable_cookie=False
-                        )
-                    )
-                    utils.logger.error(
-                        f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}"
-                    )
-                if not note_detail_from_html:
-                    # If the web-page note detail still could not be fetched, fall back to the API
-                    note_detail_from_api: Optional[Dict] = (
-                        await self.xhs_client.get_note_by_id(
-                            note_id, xsec_source, xsec_token
-                        )
-                    )
-                note_detail = note_detail_from_html or note_detail_from_api
-                if note_detail:
-                    note_detail.update(
-                        {"xsec_token": xsec_token, "xsec_source": xsec_source}
-                    )
-                return note_detail
+                return note_detail
+
             except DataFetchError as ex:
                 utils.logger.error(
                     f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
                 )
@@ -364,7 +333,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
                 return None
 
     async def batch_get_note_comments(
-        self, note_list: List[str], xsec_tokens: List[str]
+            self, note_list: List[str], xsec_tokens: List[str]
     ):
         """Batch get note comments"""
         if not config.ENABLE_GET_COMMENTS:
@@ -389,7 +358,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
         await asyncio.gather(*task_list)
 
     async def get_comments(
-        self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
+            self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
    ):
        """Get note comments with keyword filtering and quantity limitation"""
        async with semaphore:
@@ -411,7 +380,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
 
     @staticmethod
     def format_proxy_info(
-        ip_proxy_info: IpInfoModel,
+            ip_proxy_info: IpInfoModel,
     ) -> Tuple[Optional[Dict], Optional[Dict]]:
         """format proxy info for playwright and httpx"""
         playwright_proxy = {
@@ -447,11 +416,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
         return xhs_client_obj
 
     async def launch_browser(
-        self,
-        chromium: BrowserType,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
+            self,
+            chromium: BrowserType,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True,
     ) -> BrowserContext:
         """Launch browser and create browser context"""
         utils.logger.info(
@@ -480,11 +449,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
         return browser_context
 
     async def launch_browser_with_cdp(
-        self,
-        playwright: Playwright,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
+            self,
+            playwright: Playwright,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True,
     ) -> BrowserContext:
         """
         Launch the browser in CDP mode
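
For reference, the control flow introduced by the get_note_detail_async_task hunk above is: call the signed API first, swallow a RetryError once every retry is exhausted, fall back to parsing the note's HTML page without cookies, and raise if both paths return nothing. The minimal standalone sketch below shows that pattern; the wrapper name fetch_note_detail and the client parameter are illustrative only, while get_note_by_id, get_note_by_id_from_html and RetryError mirror the names used in the diff (RetryError is assumed to come from tenacity).

# Sketch only: mirrors the API-first / HTML-fallback logic of the hunk above.
from typing import Any, Dict, Optional

from tenacity import RetryError  # assumed origin of the RetryError caught in the diff


async def fetch_note_detail(client: Any, note_id: str, xsec_source: str, xsec_token: str) -> Dict:
    """Hypothetical wrapper, not part of the repository."""
    note_detail: Optional[Dict] = None
    try:
        # First attempt: the signed notes API (the real client retries internally).
        note_detail = await client.get_note_by_id(note_id, xsec_source, xsec_token)
    except RetryError:
        pass  # all retries exhausted; fall through to the HTML page
    if not note_detail:
        # Second attempt: parse the note detail out of the web page, without cookies.
        note_detail = await client.get_note_by_id_from_html(
            note_id, xsec_source, xsec_token, enable_cookie=False
        )
    if not note_detail:
        raise Exception(f"Failed to get note detail, Id: {note_id}")
    # Carry the security token/source along with the parsed detail for later requests.
    note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
    return note_detail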