refactor: config update

This commit is contained in:
程序员阿江(Relakkes)
2025-07-18 23:26:52 +08:00
parent 122978b35c
commit 13b00f7a36
17 changed files with 964 additions and 485 deletions

View File

@@ -1,24 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from config import *
# 小红书平台配置
SORT_TYPE = "popularity_descending"
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
CRAWLER_MAX_SLEEP_SEC = 2
XHS_SPECIFIED_NOTE_URL_LIST = [
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
# ........................
]
XHS_CREATOR_ID_LIST = [
"63e36c9a000000002703502b",
# ........................
]

View File

@@ -16,10 +16,16 @@ import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from playwright.async_api import BrowserContext, BrowserType, Page, Playwright, async_playwright
from playwright.async_api import (
BrowserContext,
BrowserType,
Page,
Playwright,
async_playwright,
)
from tenacity import RetryError
from . import config
import config
from base.base_crawler import AbstractCrawler
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
from model.m_xiaohongshu import NoteUrlInfo
@@ -45,7 +51,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
def __init__(self) -> None:
self.index_url = "https://www.xiaohongshu.com"
# self.user_agent = utils.get_user_agent()
self.user_agent = config.UA if config.UA else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
self.user_agent = (
config.UA
if config.UA
else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
)
self.cdp_manager = None
async def start(self) -> None:
@@ -64,15 +74,20 @@ class XiaoHongShuCrawler(AbstractCrawler):
if config.ENABLE_CDP_MODE:
utils.logger.info("[XiaoHongShuCrawler] 使用CDP模式启动浏览器")
self.browser_context = await self.launch_browser_with_cdp(
playwright, playwright_proxy_format, self.user_agent,
headless=config.CDP_HEADLESS
playwright,
playwright_proxy_format,
self.user_agent,
headless=config.CDP_HEADLESS,
)
else:
utils.logger.info("[XiaoHongShuCrawler] 使用标准模式启动浏览器")
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(
chromium, playwright_proxy_format, self.user_agent, headless=config.HEADLESS
chromium,
playwright_proxy_format,
self.user_agent,
headless=config.HEADLESS,
)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
@@ -304,7 +319,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
else:
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
try:
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
utils.logger.info(
f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
)
# 尝试直接获取网页版笔记详情携带cookie
note_detail_from_html: Optional[Dict] = (
await self.xhs_client.get_note_by_id_from_html(
@@ -462,8 +479,13 @@ class XiaoHongShuCrawler(AbstractCrawler):
)
return browser_context
async def launch_browser_with_cdp(self, playwright: Playwright, playwright_proxy: Optional[Dict],
user_agent: Optional[str], headless: bool = True) -> BrowserContext:
async def launch_browser_with_cdp(
self,
playwright: Playwright,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""
使用CDP模式启动浏览器
"""
@@ -473,7 +495,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
playwright=playwright,
playwright_proxy=playwright_proxy,
user_agent=user_agent,
headless=headless
headless=headless,
)
# 显示浏览器信息
@@ -483,10 +505,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
return browser_context
except Exception as e:
utils.logger.error(f"[XiaoHongShuCrawler] CDP模式启动失败回退到标准模式: {e}")
utils.logger.error(
f"[XiaoHongShuCrawler] CDP模式启动失败回退到标准模式: {e}"
)
# 回退到标准模式
chromium = playwright.chromium
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
return await self.launch_browser(
chromium, playwright_proxy, user_agent, headless
)
async def close(self):
"""Close browser context"""