mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-09 03:17:25 +08:00
fix: make SSL verification opt-in via config, extend fix to all platforms
- Add DISABLE_SSL_VERIFY = False to base_config.py (default: verification on)
- Add tools/httpx_util.py with a make_async_client() factory that reads the config
- Replace all httpx.AsyncClient() call sites across all platforms (bilibili, weibo, zhihu, xhs, douyin, kuaishou) and crawler_util with make_async_client()
- Extend the SSL fix to previously missed platforms: xhs, douyin, kuaishou

Users running behind an intercepting proxy can set DISABLE_SSL_VERIFY = True in config/base_config.py. All other users retain certificate verification.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -120,6 +120,11 @@ FONT_PATH = "./docs/STZHONGS.TTF"
|
|||||||
# Crawl interval
|
# Crawl interval
|
||||||
CRAWLER_MAX_SLEEP_SEC = 2
|
CRAWLER_MAX_SLEEP_SEC = 2
|
||||||
|
|
||||||
|
# Disable SSL certificate verification. Set to True only when running behind an intercepting
|
||||||
|
# proxy (corporate gateway, Burp Suite, mitmproxy, etc.) that injects its own certificate.
|
||||||
|
# WARNING: disabling SSL verification exposes all traffic to MITM attacks.
|
||||||
|
DISABLE_SSL_VERIFY = False
|
||||||
|
|
||||||
from .bilibili_config import *
|
from .bilibili_config import *
|
||||||
from .xhs_config import *
|
from .xhs_config import *
|
||||||
from .dy_config import *
|
from .dy_config import *
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ from urllib.parse import urlencode
|
|||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
|
from tools.httpx_util import make_async_client
|
||||||
|
|
||||||
import config
|
import config
|
||||||
from base.base_crawler import AbstractApiClient
|
from base.base_crawler import AbstractApiClient
|
||||||
@@ -68,7 +69,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
# Check if proxy has expired before each request
|
# Check if proxy has expired before each request
|
||||||
await self._refresh_proxy_if_expired()
|
await self._refresh_proxy_if_expired()
|
||||||
|
|
||||||
async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||||
try:
|
try:
|
||||||
data: Dict = response.json()
|
data: Dict = response.json()
|
||||||
@@ -222,7 +223,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
|
|
||||||
async def get_video_media(self, url: str) -> Union[bytes, None]:
|
async def get_video_media(self, url: str) -> Union[bytes, None]:
|
||||||
# Follow CDN 302 redirects and treat any 2xx as success (some endpoints return 206)
|
# Follow CDN 302 redirects and treat any 2xx as success (some endpoints return 206)
|
||||||
async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=True, verify=False) as client:
|
async with make_async_client(proxy=self.proxy, follow_redirects=True) as client:
|
||||||
try:
|
try:
|
||||||
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
|
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ from playwright.async_api import BrowserContext
|
|||||||
from base.base_crawler import AbstractApiClient
|
from base.base_crawler import AbstractApiClient
|
||||||
from proxy.proxy_mixin import ProxyRefreshMixin
|
from proxy.proxy_mixin import ProxyRefreshMixin
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
from tools.httpx_util import make_async_client
|
||||||
from var import request_keyword_var
|
from var import request_keyword_var
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -116,7 +117,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
# Check whether the proxy has expired before each request
|
# Check whether the proxy has expired before each request
|
||||||
await self._refresh_proxy_if_expired()
|
await self._refresh_proxy_if_expired()
|
||||||
|
|
||||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||||
try:
|
try:
|
||||||
if response.text == "" or response.text == "blocked":
|
if response.text == "" or response.text == "blocked":
|
||||||
@@ -333,7 +334,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
async def get_aweme_media(self, url: str) -> Union[bytes, None]:
|
async def get_aweme_media(self, url: str) -> Union[bytes, None]:
|
||||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
try:
|
try:
|
||||||
response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
|
response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@@ -354,7 +355,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
Returns:
|
Returns:
|
||||||
重定向后的完整URL
|
重定向后的完整URL
|
||||||
"""
|
"""
|
||||||
async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=False) as client:
|
async with make_async_client(proxy=self.proxy, follow_redirects=False) as client:
|
||||||
try:
|
try:
|
||||||
utils.logger.info(f"[DouYinClient.resolve_short_url] Resolving short URL: {short_url}")
|
utils.logger.info(f"[DouYinClient.resolve_short_url] Resolving short URL: {short_url}")
|
||||||
response = await client.get(short_url, timeout=10)
|
response = await client.get(short_url, timeout=10)
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ import config
|
|||||||
from base.base_crawler import AbstractApiClient
|
from base.base_crawler import AbstractApiClient
|
||||||
from proxy.proxy_mixin import ProxyRefreshMixin
|
from proxy.proxy_mixin import ProxyRefreshMixin
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
from tools.httpx_util import make_async_client
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from proxy.proxy_ip_pool import ProxyIpPool
|
from proxy.proxy_ip_pool import ProxyIpPool
|
||||||
@@ -65,7 +66,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
# Check if proxy is expired before each request
|
# Check if proxy is expired before each request
|
||||||
await self._refresh_proxy_if_expired()
|
await self._refresh_proxy_if_expired()
|
||||||
|
|
||||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||||
data: Dict = response.json()
|
data: Dict = response.json()
|
||||||
if data.get("errors"):
|
if data.get("errors"):
|
||||||
@@ -97,7 +98,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
await self._refresh_proxy_if_expired()
|
await self._refresh_proxy_if_expired()
|
||||||
|
|
||||||
json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
|
json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
|
||||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
response = await client.request(
|
response = await client.request(
|
||||||
method="POST",
|
method="POST",
|
||||||
url=f"{self._rest_host}{uri}",
|
url=f"{self._rest_host}{uri}",
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ from urllib.parse import parse_qs, unquote, urlencode
|
|||||||
import httpx
|
import httpx
|
||||||
from httpx import Response
|
from httpx import Response
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
|
from tools.httpx_util import make_async_client
|
||||||
from tenacity import retry, stop_after_attempt, wait_fixed
|
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||||
|
|
||||||
import config
|
import config
|
||||||
@@ -73,7 +74,7 @@ class WeiboClient(ProxyRefreshMixin):
|
|||||||
await self._refresh_proxy_if_expired()
|
await self._refresh_proxy_if_expired()
|
||||||
|
|
||||||
enable_return_response = kwargs.pop("return_response", False)
|
enable_return_response = kwargs.pop("return_response", False)
|
||||||
async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||||
|
|
||||||
if enable_return_response:
|
if enable_return_response:
|
||||||
@@ -261,7 +262,7 @@ class WeiboClient(ProxyRefreshMixin):
|
|||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
url = f"{self._host}/detail/{note_id}"
|
url = f"{self._host}/detail/{note_id}"
|
||||||
async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
|
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
raise DataFetchError(f"get weibo detail err: {response.text}")
|
raise DataFetchError(f"get weibo detail err: {response.text}")
|
||||||
@@ -291,7 +292,7 @@ class WeiboClient(ProxyRefreshMixin):
|
|||||||
# Since Weibo images are accessed through i1.wp.com, we need to concatenate the URL
|
# Since Weibo images are accessed through i1.wp.com, we need to concatenate the URL
|
||||||
final_uri = (f"{self._image_agent_host}"
|
final_uri = (f"{self._image_agent_host}"
|
||||||
f"{image_url}")
|
f"{image_url}")
|
||||||
async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
try:
|
try:
|
||||||
response = await client.request("GET", final_uri, timeout=self.timeout)
|
response = await client.request("GET", final_uri, timeout=self.timeout)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ from urllib.parse import urlencode
|
|||||||
import httpx
|
import httpx
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_not_exception_type
|
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_not_exception_type
|
||||||
|
from tools.httpx_util import make_async_client
|
||||||
|
|
||||||
import config
|
import config
|
||||||
from base.base_crawler import AbstractApiClient
|
from base.base_crawler import AbstractApiClient
|
||||||
@@ -127,7 +128,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
|
|
||||||
# return response.text
|
# return response.text
|
||||||
return_response = kwargs.pop("return_response", False)
|
return_response = kwargs.pop("return_response", False)
|
||||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||||
|
|
||||||
if response.status_code == 471 or response.status_code == 461:
|
if response.status_code == 471 or response.status_code == 461:
|
||||||
@@ -192,7 +193,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
# Check if proxy is expired before request
|
# Check if proxy is expired before request
|
||||||
await self._refresh_proxy_if_expired()
|
await self._refresh_proxy_if_expired()
|
||||||
|
|
||||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
try:
|
try:
|
||||||
response = await client.request("GET", url, timeout=self.timeout)
|
response = await client.request("GET", url, timeout=self.timeout)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@@ -219,7 +220,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
"""
|
"""
|
||||||
uri = "/api/sns/web/v1/user/selfinfo"
|
uri = "/api/sns/web/v1/user/selfinfo"
|
||||||
headers = await self._pre_headers(uri, params={})
|
headers = await self._pre_headers(uri, params={})
|
||||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
response = await client.get(f"{self._host}{uri}", headers=headers)
|
response = await client.get(f"{self._host}{uri}", headers=headers)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ from urllib.parse import urlencode
|
|||||||
import httpx
|
import httpx
|
||||||
from httpx import Response
|
from httpx import Response
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
|
from tools.httpx_util import make_async_client
|
||||||
from tenacity import retry, stop_after_attempt, wait_fixed
|
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||||
|
|
||||||
import config
|
import config
|
||||||
@@ -98,7 +99,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
|
|||||||
# return response.text
|
# return response.text
|
||||||
return_response = kwargs.pop('return_response', False)
|
return_response = kwargs.pop('return_response', False)
|
||||||
|
|
||||||
async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client:
|
async with make_async_client(proxy=self.proxy) as client:
|
||||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||||
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ from PIL import Image, ImageDraw, ImageShow
|
|||||||
from playwright.async_api import Cookie, Page
|
from playwright.async_api import Cookie, Page
|
||||||
|
|
||||||
from . import utils
|
from . import utils
|
||||||
|
from .httpx_util import make_async_client
|
||||||
|
|
||||||
|
|
||||||
async def find_login_qrcode(page: Page, selector: str) -> str:
|
async def find_login_qrcode(page: Page, selector: str) -> str:
|
||||||
@@ -47,7 +48,7 @@ async def find_login_qrcode(page: Page, selector: str) -> str:
|
|||||||
)
|
)
|
||||||
login_qrcode_img = str(await elements.get_property("src")) # type: ignore
|
login_qrcode_img = str(await elements.get_property("src")) # type: ignore
|
||||||
if "http://" in login_qrcode_img or "https://" in login_qrcode_img:
|
if "http://" in login_qrcode_img or "https://" in login_qrcode_img:
|
||||||
async with httpx.AsyncClient(follow_redirects=True, verify=False) as client:
|
async with make_async_client(follow_redirects=True) as client:
|
||||||
utils.logger.info(f"[find_login_qrcode] get qrcode by url:{login_qrcode_img}")
|
utils.logger.info(f"[find_login_qrcode] get qrcode by url:{login_qrcode_img}")
|
||||||
resp = await client.get(login_qrcode_img, headers={"User-Agent": get_user_agent()})
|
resp = await client.get(login_qrcode_img, headers={"User-Agent": get_user_agent()})
|
||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
|
|||||||
14
tools/httpx_util.py
Normal file
14
tools/httpx_util.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import httpx
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
|
def make_async_client(**kwargs) -> httpx.AsyncClient:
    """Build an httpx.AsyncClient that honours the project's SSL setting.

    The ``verify`` option defaults to the inverse of
    ``config.DISABLE_SSL_VERIFY`` (treated as False when the attribute is
    missing, i.e. certificate verification stays on). A ``verify`` value
    passed explicitly by the caller always takes precedence.

    Only set ``DISABLE_SSL_VERIFY = True`` in config/base_config.py when
    running behind an intercepting proxy (corporate gateway, Burp,
    mitmproxy, etc.).

    Args:
        **kwargs: Keyword arguments forwarded unchanged to
            ``httpx.AsyncClient``.

    Returns:
        The newly constructed ``httpx.AsyncClient``.
    """
    verify_default = not getattr(config, "DISABLE_SSL_VERIFY", False)
    # Caller-supplied options are unpacked last so they override the default.
    client_options = {"verify": verify_default, **kwargs}
    return httpx.AsyncClient(**client_options)
|
||||||
Reference in New Issue
Block a user