mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-02-24 09:00:50 +08:00
refactor: xhs update
This commit is contained in:
@@ -39,7 +39,7 @@ SAVE_LOGIN_STATE = True
|
||||
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
|
||||
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
|
||||
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
|
||||
ENABLE_CDP_MODE = False
|
||||
ENABLE_CDP_MODE = True
|
||||
|
||||
# CDP调试端口,用于与浏览器通信
|
||||
# 如果端口被占用,系统会自动尝试下一个可用端口
|
||||
|
||||
@@ -15,9 +15,6 @@
|
||||
# 排序方式,具体的枚举值在media_platform/xhs/field.py中
|
||||
SORT_TYPE = "popularity_descending"
|
||||
|
||||
# 用户代理,xhs自定义User-Agent
|
||||
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
|
||||
|
||||
# 指定笔记URL列表, 必须要携带xsec_token参数
|
||||
XHS_SPECIFIED_NOTE_URL_LIST = [
|
||||
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
|
||||
|
||||
3
main.py
3
main.py
@@ -67,7 +67,8 @@ async def main():
|
||||
|
||||
def cleanup():
|
||||
if crawler:
|
||||
asyncio.run(crawler.close())
|
||||
# asyncio.run(crawler.close())
|
||||
pass
|
||||
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
|
||||
asyncio.run(db.close())
|
||||
|
||||
|
||||
@@ -95,7 +95,6 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||
"""
|
||||
# return response.text
|
||||
return_response = kwargs.pop("return_response", False)
|
||||
|
||||
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
|
||||
@@ -103,9 +102,9 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||
# someday someone may be able to bypass the captcha
|
||||
verify_type = response.headers["Verifytype"]
|
||||
verify_uuid = response.headers["Verifyuuid"]
|
||||
raise Exception(
|
||||
f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}"
|
||||
)
|
||||
msg = f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}"
|
||||
utils.logger.error(msg)
|
||||
raise Exception(msg)
|
||||
|
||||
if return_response:
|
||||
return response.text
|
||||
|
||||
@@ -51,11 +51,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://www.xiaohongshu.com"
|
||||
# self.user_agent = utils.get_user_agent()
|
||||
self.user_agent = (
|
||||
config.UA
|
||||
if config.UA
|
||||
else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
||||
)
|
||||
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self) -> None:
|
||||
@@ -91,17 +87,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
# add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
|
||||
await self.browser_context.add_cookies(
|
||||
[
|
||||
{
|
||||
"name": "webId",
|
||||
"value": "xxx123", # any value
|
||||
"domain": ".xiaohongshu.com",
|
||||
"path": "/",
|
||||
}
|
||||
]
|
||||
)
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url)
|
||||
|
||||
@@ -152,7 +137,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
page = 1
|
||||
search_id = get_search_id()
|
||||
while (
|
||||
page - start_page + 1
|
||||
page - start_page + 1
|
||||
) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
|
||||
@@ -294,11 +279,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
|
||||
|
||||
async def get_note_detail_async_task(
|
||||
self,
|
||||
note_id: str,
|
||||
xsec_source: str,
|
||||
xsec_token: str,
|
||||
semaphore: asyncio.Semaphore,
|
||||
self,
|
||||
note_id: str,
|
||||
xsec_source: str,
|
||||
xsec_token: str,
|
||||
semaphore: asyncio.Semaphore,
|
||||
) -> Optional[Dict]:
|
||||
"""Get note detail
|
||||
|
||||
@@ -311,47 +296,31 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
Returns:
|
||||
Dict: note detail
|
||||
"""
|
||||
note_detail_from_html, note_detail_from_api = None, None
|
||||
note_detail = None
|
||||
async with semaphore:
|
||||
# When proxy is not enabled, increase the crawling interval
|
||||
if config.ENABLE_IP_PROXY:
|
||||
crawl_interval = random.random()
|
||||
else:
|
||||
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
|
||||
try:
|
||||
utils.logger.info(
|
||||
f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
|
||||
)
|
||||
# 尝试直接获取网页版笔记详情,携带cookie
|
||||
note_detail_from_html: Optional[Dict] = (
|
||||
await self.xhs_client.get_note_by_id_from_html(
|
||||
note_id, xsec_source, xsec_token, enable_cookie=True
|
||||
|
||||
try:
|
||||
note_detail = await self.xhs_client.get_note_by_id(
|
||||
note_id, xsec_source, xsec_token
|
||||
)
|
||||
except RetryError as e:
|
||||
pass
|
||||
|
||||
if not note_detail:
|
||||
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
|
||||
enable_cookie=False)
|
||||
if not note_detail:
|
||||
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
|
||||
|
||||
note_detail.update(
|
||||
{"xsec_token": xsec_token, "xsec_source": xsec_source}
|
||||
)
|
||||
time.sleep(crawl_interval)
|
||||
if not note_detail_from_html:
|
||||
# 如果网页版笔记详情获取失败,则尝试不使用cookie获取
|
||||
note_detail_from_html = (
|
||||
await self.xhs_client.get_note_by_id_from_html(
|
||||
note_id, xsec_source, xsec_token, enable_cookie=False
|
||||
)
|
||||
)
|
||||
utils.logger.error(
|
||||
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}"
|
||||
)
|
||||
if not note_detail_from_html:
|
||||
# 如果网页版笔记详情获取失败,则尝试API获取
|
||||
note_detail_from_api: Optional[Dict] = (
|
||||
await self.xhs_client.get_note_by_id(
|
||||
note_id, xsec_source, xsec_token
|
||||
)
|
||||
)
|
||||
note_detail = note_detail_from_html or note_detail_from_api
|
||||
if note_detail:
|
||||
note_detail.update(
|
||||
{"xsec_token": xsec_token, "xsec_source": xsec_source}
|
||||
)
|
||||
return note_detail
|
||||
return note_detail
|
||||
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(
|
||||
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
|
||||
@@ -364,7 +333,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
return None
|
||||
|
||||
async def batch_get_note_comments(
|
||||
self, note_list: List[str], xsec_tokens: List[str]
|
||||
self, note_list: List[str], xsec_tokens: List[str]
|
||||
):
|
||||
"""Batch get note comments"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
@@ -389,7 +358,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
await asyncio.gather(*task_list)
|
||||
|
||||
async def get_comments(
|
||||
self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
|
||||
self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
|
||||
):
|
||||
"""Get note comments with keyword filtering and quantity limitation"""
|
||||
async with semaphore:
|
||||
@@ -411,7 +380,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
|
||||
@staticmethod
|
||||
def format_proxy_info(
|
||||
ip_proxy_info: IpInfoModel,
|
||||
ip_proxy_info: IpInfoModel,
|
||||
) -> Tuple[Optional[Dict], Optional[Dict]]:
|
||||
"""format proxy info for playwright and httpx"""
|
||||
playwright_proxy = {
|
||||
@@ -447,11 +416,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
return xhs_client_obj
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""Launch browser and create browser context"""
|
||||
utils.logger.info(
|
||||
@@ -480,11 +449,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
return browser_context
|
||||
|
||||
async def launch_browser_with_cdp(
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
|
||||
Reference in New Issue
Block a user