refactor: xhs update

This commit is contained in:
程序员阿江(Relakkes)
2025-07-21 21:26:16 +08:00
parent 26a43358cb
commit a4d9aaa34a
5 changed files with 44 additions and 78 deletions

View File

@@ -39,7 +39,7 @@ SAVE_LOGIN_STATE = True
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
ENABLE_CDP_MODE = False
ENABLE_CDP_MODE = True
# CDP调试端口,用于与浏览器通信
# 如果端口被占用,系统会自动尝试下一个可用端口

View File

@@ -15,9 +15,6 @@
# 排序方式,具体的枚举值在media_platform/xhs/field.py中
SORT_TYPE = "popularity_descending"
# 用户代理(xhs自定义User-Agent)
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
# 指定笔记URL列表, 必须要携带xsec_token参数
XHS_SPECIFIED_NOTE_URL_LIST = [
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"

View File

@@ -67,7 +67,8 @@ async def main():
def cleanup():
if crawler:
asyncio.run(crawler.close())
# asyncio.run(crawler.close())
pass
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
asyncio.run(db.close())

View File

@@ -95,7 +95,6 @@ class XiaoHongShuClient(AbstractApiClient):
"""
# return response.text
return_response = kwargs.pop("return_response", False)
async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request(method, url, timeout=self.timeout, **kwargs)
@@ -103,9 +102,9 @@ class XiaoHongShuClient(AbstractApiClient):
# someday someone maybe will bypass captcha
verify_type = response.headers["Verifytype"]
verify_uuid = response.headers["Verifyuuid"]
raise Exception(
f"出现验证码请求失败Verifytype: {verify_type}Verifyuuid: {verify_uuid}, Response: {response}"
)
msg = f"出现验证码请求失败Verifytype: {verify_type}Verifyuuid: {verify_uuid}, Response: {response}"
utils.logger.error(msg)
raise Exception(msg)
if return_response:
return response.text

View File

@@ -51,11 +51,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
def __init__(self) -> None:
self.index_url = "https://www.xiaohongshu.com"
# self.user_agent = utils.get_user_agent()
self.user_agent = (
config.UA
if config.UA
else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
)
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
self.cdp_manager = None
async def start(self) -> None:
@@ -91,17 +87,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
# add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
await self.browser_context.add_cookies(
[
{
"name": "webId",
"value": "xxx123", # any value
"domain": ".xiaohongshu.com",
"path": "/",
}
]
)
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
@@ -152,7 +137,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
page = 1
search_id = get_search_id()
while (
page - start_page + 1
page - start_page + 1
) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
@@ -294,11 +279,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
async def get_note_detail_async_task(
self,
note_id: str,
xsec_source: str,
xsec_token: str,
semaphore: asyncio.Semaphore,
self,
note_id: str,
xsec_source: str,
xsec_token: str,
semaphore: asyncio.Semaphore,
) -> Optional[Dict]:
"""Get note detail
@@ -311,47 +296,31 @@ class XiaoHongShuCrawler(AbstractCrawler):
Returns:
Dict: note detail
"""
note_detail_from_html, note_detail_from_api = None, None
note_detail = None
async with semaphore:
# When proxy is not enabled, increase the crawling interval
if config.ENABLE_IP_PROXY:
crawl_interval = random.random()
else:
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
try:
utils.logger.info(
f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
)
# 尝试直接获取网页版笔记详情携带cookie
note_detail_from_html: Optional[Dict] = (
await self.xhs_client.get_note_by_id_from_html(
note_id, xsec_source, xsec_token, enable_cookie=True
try:
note_detail = await self.xhs_client.get_note_by_id(
note_id, xsec_source, xsec_token
)
except RetryError as e:
pass
if not note_detail:
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
enable_cookie=False)
if not note_detail:
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
note_detail.update(
{"xsec_token": xsec_token, "xsec_source": xsec_source}
)
time.sleep(crawl_interval)
if not note_detail_from_html:
# 如果网页版笔记详情获取失败则尝试不使用cookie获取
note_detail_from_html = (
await self.xhs_client.get_note_by_id_from_html(
note_id, xsec_source, xsec_token, enable_cookie=False
)
)
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}"
)
if not note_detail_from_html:
# 如果网页版笔记详情获取失败则尝试API获取
note_detail_from_api: Optional[Dict] = (
await self.xhs_client.get_note_by_id(
note_id, xsec_source, xsec_token
)
)
note_detail = note_detail_from_html or note_detail_from_api
if note_detail:
note_detail.update(
{"xsec_token": xsec_token, "xsec_source": xsec_source}
)
return note_detail
return note_detail
except DataFetchError as ex:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
@@ -364,7 +333,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
return None
async def batch_get_note_comments(
self, note_list: List[str], xsec_tokens: List[str]
self, note_list: List[str], xsec_tokens: List[str]
):
"""Batch get note comments"""
if not config.ENABLE_GET_COMMENTS:
@@ -389,7 +358,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
await asyncio.gather(*task_list)
async def get_comments(
self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
):
"""Get note comments with keyword filtering and quantity limitation"""
async with semaphore:
@@ -411,7 +380,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
@staticmethod
def format_proxy_info(
ip_proxy_info: IpInfoModel,
ip_proxy_info: IpInfoModel,
) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx"""
playwright_proxy = {
@@ -447,11 +416,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
return xhs_client_obj
async def launch_browser(
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""Launch browser and create browser context"""
utils.logger.info(
@@ -480,11 +449,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
return browser_context
async def launch_browser_with_cdp(
self,
playwright: Playwright,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
self,
playwright: Playwright,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""
使用CDP模式启动浏览器