mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-07 10:27:25 +08:00
refactor: xhs update
This commit is contained in:
@@ -39,7 +39,7 @@ SAVE_LOGIN_STATE = True
|
|||||||
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
|
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
|
||||||
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
|
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
|
||||||
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
|
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
|
||||||
ENABLE_CDP_MODE = False
|
ENABLE_CDP_MODE = True
|
||||||
|
|
||||||
# CDP调试端口,用于与浏览器通信
|
# CDP调试端口,用于与浏览器通信
|
||||||
# 如果端口被占用,系统会自动尝试下一个可用端口
|
# 如果端口被占用,系统会自动尝试下一个可用端口
|
||||||
|
|||||||
@@ -15,9 +15,6 @@
|
|||||||
# 排序方式,具体的枚举值在media_platform/xhs/field.py中
|
# 排序方式,具体的枚举值在media_platform/xhs/field.py中
|
||||||
SORT_TYPE = "popularity_descending"
|
SORT_TYPE = "popularity_descending"
|
||||||
|
|
||||||
# 用户代理,xhs自定义User-Agent
|
|
||||||
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
|
|
||||||
|
|
||||||
# 指定笔记URL列表, 必须要携带xsec_token参数
|
# 指定笔记URL列表, 必须要携带xsec_token参数
|
||||||
XHS_SPECIFIED_NOTE_URL_LIST = [
|
XHS_SPECIFIED_NOTE_URL_LIST = [
|
||||||
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
|
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
|
||||||
|
|||||||
3
main.py
3
main.py
@@ -67,7 +67,8 @@ async def main():
|
|||||||
|
|
||||||
def cleanup():
|
def cleanup():
|
||||||
if crawler:
|
if crawler:
|
||||||
asyncio.run(crawler.close())
|
# asyncio.run(crawler.close())
|
||||||
|
pass
|
||||||
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
|
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
|
||||||
asyncio.run(db.close())
|
asyncio.run(db.close())
|
||||||
|
|
||||||
|
|||||||
@@ -95,7 +95,6 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
"""
|
"""
|
||||||
# return response.text
|
# return response.text
|
||||||
return_response = kwargs.pop("return_response", False)
|
return_response = kwargs.pop("return_response", False)
|
||||||
|
|
||||||
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
||||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||||
|
|
||||||
@@ -103,9 +102,9 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
# someday someone maybe will bypass captcha
|
# someday someone maybe will bypass captcha
|
||||||
verify_type = response.headers["Verifytype"]
|
verify_type = response.headers["Verifytype"]
|
||||||
verify_uuid = response.headers["Verifyuuid"]
|
verify_uuid = response.headers["Verifyuuid"]
|
||||||
raise Exception(
|
msg = f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}"
|
||||||
f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}"
|
utils.logger.error(msg)
|
||||||
)
|
raise Exception(msg)
|
||||||
|
|
||||||
if return_response:
|
if return_response:
|
||||||
return response.text
|
return response.text
|
||||||
|
|||||||
@@ -51,11 +51,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.index_url = "https://www.xiaohongshu.com"
|
self.index_url = "https://www.xiaohongshu.com"
|
||||||
# self.user_agent = utils.get_user_agent()
|
# self.user_agent = utils.get_user_agent()
|
||||||
self.user_agent = (
|
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
||||||
config.UA
|
|
||||||
if config.UA
|
|
||||||
else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
|
||||||
)
|
|
||||||
self.cdp_manager = None
|
self.cdp_manager = None
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
@@ -91,17 +87,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
)
|
)
|
||||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||||
# add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
|
|
||||||
await self.browser_context.add_cookies(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"name": "webId",
|
|
||||||
"value": "xxx123", # any value
|
|
||||||
"domain": ".xiaohongshu.com",
|
|
||||||
"path": "/",
|
|
||||||
}
|
|
||||||
]
|
|
||||||
)
|
|
||||||
self.context_page = await self.browser_context.new_page()
|
self.context_page = await self.browser_context.new_page()
|
||||||
await self.context_page.goto(self.index_url)
|
await self.context_page.goto(self.index_url)
|
||||||
|
|
||||||
@@ -152,7 +137,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
page = 1
|
page = 1
|
||||||
search_id = get_search_id()
|
search_id = get_search_id()
|
||||||
while (
|
while (
|
||||||
page - start_page + 1
|
page - start_page + 1
|
||||||
) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
if page < start_page:
|
if page < start_page:
|
||||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
|
utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
|
||||||
@@ -294,11 +279,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
|
await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
|
||||||
|
|
||||||
async def get_note_detail_async_task(
|
async def get_note_detail_async_task(
|
||||||
self,
|
self,
|
||||||
note_id: str,
|
note_id: str,
|
||||||
xsec_source: str,
|
xsec_source: str,
|
||||||
xsec_token: str,
|
xsec_token: str,
|
||||||
semaphore: asyncio.Semaphore,
|
semaphore: asyncio.Semaphore,
|
||||||
) -> Optional[Dict]:
|
) -> Optional[Dict]:
|
||||||
"""Get note detail
|
"""Get note detail
|
||||||
|
|
||||||
@@ -311,47 +296,31 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
Returns:
|
Returns:
|
||||||
Dict: note detail
|
Dict: note detail
|
||||||
"""
|
"""
|
||||||
note_detail_from_html, note_detail_from_api = None, None
|
note_detail = None
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
# When proxy is not enabled, increase the crawling interval
|
|
||||||
if config.ENABLE_IP_PROXY:
|
|
||||||
crawl_interval = random.random()
|
|
||||||
else:
|
|
||||||
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
|
|
||||||
try:
|
try:
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
|
f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
|
||||||
)
|
)
|
||||||
# 尝试直接获取网页版笔记详情,携带cookie
|
|
||||||
note_detail_from_html: Optional[Dict] = (
|
try:
|
||||||
await self.xhs_client.get_note_by_id_from_html(
|
note_detail = await self.xhs_client.get_note_by_id(
|
||||||
note_id, xsec_source, xsec_token, enable_cookie=True
|
note_id, xsec_source, xsec_token
|
||||||
)
|
)
|
||||||
|
except RetryError as e:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if not note_detail:
|
||||||
|
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
|
||||||
|
enable_cookie=False)
|
||||||
|
if not note_detail:
|
||||||
|
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
|
||||||
|
|
||||||
|
note_detail.update(
|
||||||
|
{"xsec_token": xsec_token, "xsec_source": xsec_source}
|
||||||
)
|
)
|
||||||
time.sleep(crawl_interval)
|
return note_detail
|
||||||
if not note_detail_from_html:
|
|
||||||
# 如果网页版笔记详情获取失败,则尝试不使用cookie获取
|
|
||||||
note_detail_from_html = (
|
|
||||||
await self.xhs_client.get_note_by_id_from_html(
|
|
||||||
note_id, xsec_source, xsec_token, enable_cookie=False
|
|
||||||
)
|
|
||||||
)
|
|
||||||
utils.logger.error(
|
|
||||||
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}"
|
|
||||||
)
|
|
||||||
if not note_detail_from_html:
|
|
||||||
# 如果网页版笔记详情获取失败,则尝试API获取
|
|
||||||
note_detail_from_api: Optional[Dict] = (
|
|
||||||
await self.xhs_client.get_note_by_id(
|
|
||||||
note_id, xsec_source, xsec_token
|
|
||||||
)
|
|
||||||
)
|
|
||||||
note_detail = note_detail_from_html or note_detail_from_api
|
|
||||||
if note_detail:
|
|
||||||
note_detail.update(
|
|
||||||
{"xsec_token": xsec_token, "xsec_source": xsec_source}
|
|
||||||
)
|
|
||||||
return note_detail
|
|
||||||
except DataFetchError as ex:
|
except DataFetchError as ex:
|
||||||
utils.logger.error(
|
utils.logger.error(
|
||||||
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
|
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
|
||||||
@@ -364,7 +333,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
async def batch_get_note_comments(
|
async def batch_get_note_comments(
|
||||||
self, note_list: List[str], xsec_tokens: List[str]
|
self, note_list: List[str], xsec_tokens: List[str]
|
||||||
):
|
):
|
||||||
"""Batch get note comments"""
|
"""Batch get note comments"""
|
||||||
if not config.ENABLE_GET_COMMENTS:
|
if not config.ENABLE_GET_COMMENTS:
|
||||||
@@ -389,7 +358,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
await asyncio.gather(*task_list)
|
await asyncio.gather(*task_list)
|
||||||
|
|
||||||
async def get_comments(
|
async def get_comments(
|
||||||
self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
|
self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
|
||||||
):
|
):
|
||||||
"""Get note comments with keyword filtering and quantity limitation"""
|
"""Get note comments with keyword filtering and quantity limitation"""
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
@@ -411,7 +380,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def format_proxy_info(
|
def format_proxy_info(
|
||||||
ip_proxy_info: IpInfoModel,
|
ip_proxy_info: IpInfoModel,
|
||||||
) -> Tuple[Optional[Dict], Optional[Dict]]:
|
) -> Tuple[Optional[Dict], Optional[Dict]]:
|
||||||
"""format proxy info for playwright and httpx"""
|
"""format proxy info for playwright and httpx"""
|
||||||
playwright_proxy = {
|
playwright_proxy = {
|
||||||
@@ -447,11 +416,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
return xhs_client_obj
|
return xhs_client_obj
|
||||||
|
|
||||||
async def launch_browser(
|
async def launch_browser(
|
||||||
self,
|
self,
|
||||||
chromium: BrowserType,
|
chromium: BrowserType,
|
||||||
playwright_proxy: Optional[Dict],
|
playwright_proxy: Optional[Dict],
|
||||||
user_agent: Optional[str],
|
user_agent: Optional[str],
|
||||||
headless: bool = True,
|
headless: bool = True,
|
||||||
) -> BrowserContext:
|
) -> BrowserContext:
|
||||||
"""Launch browser and create browser context"""
|
"""Launch browser and create browser context"""
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
@@ -480,11 +449,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
return browser_context
|
return browser_context
|
||||||
|
|
||||||
async def launch_browser_with_cdp(
|
async def launch_browser_with_cdp(
|
||||||
self,
|
self,
|
||||||
playwright: Playwright,
|
playwright: Playwright,
|
||||||
playwright_proxy: Optional[Dict],
|
playwright_proxy: Optional[Dict],
|
||||||
user_agent: Optional[str],
|
user_agent: Optional[str],
|
||||||
headless: bool = True,
|
headless: bool = True,
|
||||||
) -> BrowserContext:
|
) -> BrowserContext:
|
||||||
"""
|
"""
|
||||||
使用CDP模式启动浏览器
|
使用CDP模式启动浏览器
|
||||||
|
|||||||
Reference in New Issue
Block a user