From eb45a6367f86b5b13c9fa2c4fd701145b11a1efa Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 18 Mar 2026 12:21:27 +1300 Subject: [PATCH 1/4] fix: disable SSL verification for proxy/VPN environments Add verify=False to all httpx.AsyncClient calls across bilibili, weibo, zhihu clients and crawler_util. Fixes SSL certificate validation errors when running behind a corporate proxy or VPN. Co-Authored-By: Claude Sonnet 4.6 --- media_platform/bilibili/client.py | 4 ++-- media_platform/weibo/client.py | 6 +++--- media_platform/zhihu/client.py | 2 +- tools/crawler_util.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index 45c927c..cf0a170 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -68,7 +68,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): # Check if proxy has expired before each request await self._refresh_proxy_if_expired() - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: response = await client.request(method, url, timeout=self.timeout, **kwargs) try: data: Dict = response.json() @@ -222,7 +222,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): async def get_video_media(self, url: str) -> Union[bytes, None]: # Follow CDN 302 redirects and treat any 2xx as success (some endpoints return 206) - async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=True) as client: + async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=True, verify=False) as client: try: response = await client.request("GET", url, timeout=self.timeout, headers=self.headers) response.raise_for_status() diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py index a49a482..ce51d90 100644 --- a/media_platform/weibo/client.py +++ b/media_platform/weibo/client.py @@ -73,7 +73,7 @@ class 
WeiboClient(ProxyRefreshMixin): await self._refresh_proxy_if_expired() enable_return_response = kwargs.pop("return_response", False) - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: response = await client.request(method, url, timeout=self.timeout, **kwargs) if enable_return_response: @@ -261,7 +261,7 @@ class WeiboClient(ProxyRefreshMixin): :return: """ url = f"{self._host}/detail/{note_id}" - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: response = await client.request("GET", url, timeout=self.timeout, headers=self.headers) if response.status_code != 200: raise DataFetchError(f"get weibo detail err: {response.text}") @@ -291,7 +291,7 @@ class WeiboClient(ProxyRefreshMixin): # Since Weibo images are accessed through i1.wp.com, we need to concatenate the URL final_uri = (f"{self._image_agent_host}" f"{image_url}") - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: try: response = await client.request("GET", final_uri, timeout=self.timeout) response.raise_for_status() diff --git a/media_platform/zhihu/client.py b/media_platform/zhihu/client.py index 079d08d..84721c7 100644 --- a/media_platform/zhihu/client.py +++ b/media_platform/zhihu/client.py @@ -98,7 +98,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin): # return response.text return_response = kwargs.pop('return_response', False) - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: response = await client.request(method, url, timeout=self.timeout, **kwargs) if response.status_code != 200: diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 12cb73e..980ca35 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -47,7 +47,7 @@ async def 
find_login_qrcode(page: Page, selector: str) -> str: ) login_qrcode_img = str(await elements.get_property("src")) # type: ignore if "http://" in login_qrcode_img or "https://" in login_qrcode_img: - async with httpx.AsyncClient(follow_redirects=True) as client: + async with httpx.AsyncClient(follow_redirects=True, verify=False) as client: utils.logger.info(f"[find_login_qrcode] get qrcode by url:{login_qrcode_img}") resp = await client.get(login_qrcode_img, headers={"User-Agent": get_user_agent()}) if resp.status_code == 200: From 125e02a4b9f243b02aa7ec33c08b284ffe0ade71 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 18 Mar 2026 12:31:49 +1300 Subject: [PATCH 2/4] fix: make disabling SSL verification opt-in via config, extend fix to all platforms - Add DISABLE_SSL_VERIFY = False to base_config.py (default: verification on) - Add tools/httpx_util.py with make_async_client() factory that reads the config - Replace all httpx.AsyncClient() call sites across all platforms (bilibili, weibo, zhihu, xhs, douyin, kuaishou) and crawler_util with make_async_client() - Extends SSL fix to previously missed platforms: xhs, douyin, kuaishou Users running behind an intercepting proxy can set DISABLE_SSL_VERIFY = True in config/base_config.py. All other users retain certificate verification. 
Co-Authored-By: Claude Sonnet 4.6 --- config/base_config.py | 5 +++++ media_platform/bilibili/client.py | 5 +++-- media_platform/douyin/client.py | 7 ++++--- media_platform/kuaishou/client.py | 5 +++-- media_platform/weibo/client.py | 7 ++++--- media_platform/xhs/client.py | 7 ++++--- media_platform/zhihu/client.py | 3 ++- tools/crawler_util.py | 3 ++- tools/httpx_util.py | 14 ++++++++++++++ 9 files changed, 41 insertions(+), 15 deletions(-) create mode 100644 tools/httpx_util.py diff --git a/config/base_config.py b/config/base_config.py index 26ed383..7878ebf 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -120,6 +120,11 @@ FONT_PATH = "./docs/STZHONGS.TTF" # Crawl interval CRAWLER_MAX_SLEEP_SEC = 2 +# Disable SSL certificate verification. Set to True only when running behind an intercepting +# proxy (corporate gateway, Burp Suite, mitmproxy, etc.) that injects its own certificate. +# WARNING: disabling SSL verification exposes all traffic to MITM attacks. +DISABLE_SSL_VERIFY = False + from .bilibili_config import * from .xhs_config import * from .dy_config import * diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index cf0a170..ab87dfc 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -29,6 +29,7 @@ from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page +from tools.httpx_util import make_async_client import config from base.base_crawler import AbstractApiClient @@ -68,7 +69,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): # Check if proxy has expired before each request await self._refresh_proxy_if_expired() - async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: + async with make_async_client(proxy=self.proxy) as client: response = await client.request(method, url, timeout=self.timeout, **kwargs) try: data: Dict = response.json() @@ -222,7 +223,7 @@ class BilibiliClient(AbstractApiClient, 
ProxyRefreshMixin): async def get_video_media(self, url: str) -> Union[bytes, None]: # Follow CDN 302 redirects and treat any 2xx as success (some endpoints return 206) - async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=True, verify=False) as client: + async with make_async_client(proxy=self.proxy, follow_redirects=True) as client: try: response = await client.request("GET", url, timeout=self.timeout, headers=self.headers) response.raise_for_status() diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py index f361f6b..ce98ae7 100644 --- a/media_platform/douyin/client.py +++ b/media_platform/douyin/client.py @@ -29,6 +29,7 @@ from playwright.async_api import BrowserContext from base.base_crawler import AbstractApiClient from proxy.proxy_mixin import ProxyRefreshMixin from tools import utils +from tools.httpx_util import make_async_client from var import request_keyword_var if TYPE_CHECKING: @@ -116,7 +117,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin): # Check whether the proxy has expired before each request await self._refresh_proxy_if_expired() - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with make_async_client(proxy=self.proxy) as client: response = await client.request(method, url, timeout=self.timeout, **kwargs) try: if response.text == "" or response.text == "blocked": @@ -333,7 +334,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin): return result async def get_aweme_media(self, url: str) -> Union[bytes, None]: - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with make_async_client(proxy=self.proxy) as client: try: response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True) response.raise_for_status() @@ -354,7 +355,7 @@ class DouYinClient(AbstractApiClient, ProxyRefreshMixin): Returns: 重定向后的完整URL """ - async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=False) as client: + async with 
make_async_client(proxy=self.proxy, follow_redirects=False) as client: try: utils.logger.info(f"[DouYinClient.resolve_short_url] Resolving short URL: {short_url}") response = await client.get(short_url, timeout=10) diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index 785263c..6cd18cb 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -31,6 +31,7 @@ import config from base.base_crawler import AbstractApiClient from proxy.proxy_mixin import ProxyRefreshMixin from tools import utils +from tools.httpx_util import make_async_client if TYPE_CHECKING: from proxy.proxy_ip_pool import ProxyIpPool @@ -65,7 +66,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin): # Check if proxy is expired before each request await self._refresh_proxy_if_expired() - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with make_async_client(proxy=self.proxy) as client: response = await client.request(method, url, timeout=self.timeout, **kwargs) data: Dict = response.json() if data.get("errors"): @@ -97,7 +98,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin): await self._refresh_proxy_if_expired() json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False) - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with make_async_client(proxy=self.proxy) as client: response = await client.request( method="POST", url=f"{self._rest_host}{uri}", diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py index ce51d90..dac7de1 100644 --- a/media_platform/weibo/client.py +++ b/media_platform/weibo/client.py @@ -32,6 +32,7 @@ from urllib.parse import parse_qs, unquote, urlencode import httpx from httpx import Response from playwright.async_api import BrowserContext, Page +from tools.httpx_util import make_async_client from tenacity import retry, stop_after_attempt, wait_fixed import config @@ -73,7 +74,7 @@ class WeiboClient(ProxyRefreshMixin): 
await self._refresh_proxy_if_expired() enable_return_response = kwargs.pop("return_response", False) - async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: + async with make_async_client(proxy=self.proxy) as client: response = await client.request(method, url, timeout=self.timeout, **kwargs) if enable_return_response: @@ -261,7 +262,7 @@ class WeiboClient(ProxyRefreshMixin): :return: """ url = f"{self._host}/detail/{note_id}" - async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: + async with make_async_client(proxy=self.proxy) as client: response = await client.request("GET", url, timeout=self.timeout, headers=self.headers) if response.status_code != 200: raise DataFetchError(f"get weibo detail err: {response.text}") @@ -291,7 +292,7 @@ class WeiboClient(ProxyRefreshMixin): # Since Weibo images are accessed through i1.wp.com, we need to concatenate the URL final_uri = (f"{self._image_agent_host}" f"{image_url}") - async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: + async with make_async_client(proxy=self.proxy) as client: try: response = await client.request("GET", final_uri, timeout=self.timeout) response.raise_for_status() diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index aa7d8fb..104190d 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -25,6 +25,7 @@ from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_not_exception_type +from tools.httpx_util import make_async_client import config from base.base_crawler import AbstractApiClient @@ -127,7 +128,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): # return response.text return_response = kwargs.pop("return_response", False) - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with make_async_client(proxy=self.proxy) as client: response = await 
client.request(method, url, timeout=self.timeout, **kwargs) if response.status_code == 471 or response.status_code == 461: @@ -192,7 +193,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): # Check if proxy is expired before request await self._refresh_proxy_if_expired() - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with make_async_client(proxy=self.proxy) as client: try: response = await client.request("GET", url, timeout=self.timeout) response.raise_for_status() @@ -219,7 +220,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): """ uri = "/api/sns/web/v1/user/selfinfo" headers = await self._pre_headers(uri, params={}) - async with httpx.AsyncClient(proxy=self.proxy) as client: + async with make_async_client(proxy=self.proxy) as client: response = await client.get(f"{self._host}{uri}", headers=headers) if response.status_code == 200: return response.json() diff --git a/media_platform/zhihu/client.py b/media_platform/zhihu/client.py index 84721c7..4f62cf1 100644 --- a/media_platform/zhihu/client.py +++ b/media_platform/zhihu/client.py @@ -26,6 +26,7 @@ from urllib.parse import urlencode import httpx from httpx import Response from playwright.async_api import BrowserContext, Page +from tools.httpx_util import make_async_client from tenacity import retry, stop_after_attempt, wait_fixed import config @@ -98,7 +99,7 @@ class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin): # return response.text return_response = kwargs.pop('return_response', False) - async with httpx.AsyncClient(proxy=self.proxy, verify=False) as client: + async with make_async_client(proxy=self.proxy) as client: response = await client.request(method, url, timeout=self.timeout, **kwargs) if response.status_code != 200: diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 980ca35..aca38d5 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -37,6 +37,7 @@ from PIL import Image, ImageDraw, ImageShow from 
playwright.async_api import Cookie, Page from . import utils +from .httpx_util import make_async_client async def find_login_qrcode(page: Page, selector: str) -> str: @@ -47,7 +48,7 @@ async def find_login_qrcode(page: Page, selector: str) -> str: ) login_qrcode_img = str(await elements.get_property("src")) # type: ignore if "http://" in login_qrcode_img or "https://" in login_qrcode_img: - async with httpx.AsyncClient(follow_redirects=True, verify=False) as client: + async with make_async_client(follow_redirects=True) as client: utils.logger.info(f"[find_login_qrcode] get qrcode by url:{login_qrcode_img}") resp = await client.get(login_qrcode_img, headers={"User-Agent": get_user_agent()}) if resp.status_code == 200: diff --git a/tools/httpx_util.py b/tools/httpx_util.py new file mode 100644 index 0000000..7647d06 --- /dev/null +++ b/tools/httpx_util.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- +import httpx +import config + + +def make_async_client(**kwargs) -> httpx.AsyncClient: + """Create an httpx.AsyncClient with project-wide defaults. + + Reads DISABLE_SSL_VERIFY from config (default False). + Set DISABLE_SSL_VERIFY = True in config/base_config.py only when running + behind an intercepting proxy (corporate gateway, Burp, mitmproxy, etc.). + """ + kwargs.setdefault("verify", not getattr(config, "DISABLE_SSL_VERIFY", False)) + return httpx.AsyncClient(**kwargs) From dd327f068eef16e8aaf18140e2f625ec08df246d Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 18 Mar 2026 12:39:35 +1300 Subject: [PATCH 3/4] fix: extend make_async_client to proxy provider and IP pool Migrate remaining httpx.AsyncClient call sites in proxy/ package to use make_async_client(), completing the DISABLE_SSL_VERIFY coverage across all outbound HTTP requests in the project. 
Co-Authored-By: Claude Sonnet 4.6 --- proxy/providers/jishu_http_proxy.py | 3 ++- proxy/providers/kuaidl_proxy.py | 3 ++- proxy/providers/wandou_http_proxy.py | 3 ++- proxy/proxy_ip_pool.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/proxy/providers/jishu_http_proxy.py b/proxy/providers/jishu_http_proxy.py index 7e1ecf0..03c0f11 100644 --- a/proxy/providers/jishu_http_proxy.py +++ b/proxy/providers/jishu_http_proxy.py @@ -30,6 +30,7 @@ import httpx from proxy import IpCache, IpGetError, ProxyProvider from proxy.types import IpInfoModel from tools import utils +from tools.httpx_util import make_async_client class JiSuHttpProxy(ProxyProvider): @@ -68,7 +69,7 @@ class JiSuHttpProxy(ProxyProvider): need_get_count = num - len(ip_cache_list) self.params.update({"num": need_get_count}) ip_infos = [] - async with httpx.AsyncClient() as client: + async with make_async_client() as client: url = self.api_path + "/fetchips" + '?' + urlencode(self.params) utils.logger.info(f"[JiSuHttpProxy.get_proxy] get ip proxy url:{url}") response = await client.get(url, headers={ diff --git a/proxy/providers/kuaidl_proxy.py b/proxy/providers/kuaidl_proxy.py index 874e86a..36d89dd 100644 --- a/proxy/providers/kuaidl_proxy.py +++ b/proxy/providers/kuaidl_proxy.py @@ -28,6 +28,7 @@ from typing import Dict, List import httpx from pydantic import BaseModel, Field +from tools.httpx_util import make_async_client from proxy import IpCache, IpInfoModel, ProxyProvider from proxy.types import ProviderNameEnum @@ -113,7 +114,7 @@ class KuaiDaiLiProxy(ProxyProvider): self.params.update({"num": need_get_count}) ip_infos: List[IpInfoModel] = [] - async with httpx.AsyncClient() as client: + async with make_async_client() as client: response = await client.get(self.api_base + uri, params=self.params) if response.status_code != 200: diff --git a/proxy/providers/wandou_http_proxy.py b/proxy/providers/wandou_http_proxy.py index 7895dc6..c8aa342 100644 --- 
a/proxy/providers/wandou_http_proxy.py +++ b/proxy/providers/wandou_http_proxy.py @@ -30,6 +30,7 @@ import httpx from proxy import IpCache, IpGetError, ProxyProvider from proxy.types import IpInfoModel from tools import utils +from tools.httpx_util import make_async_client class WanDouHttpProxy(ProxyProvider): @@ -65,7 +66,7 @@ class WanDouHttpProxy(ProxyProvider): need_get_count = num - len(ip_cache_list) self.params.update({"num": min(need_get_count, 100)}) # Maximum 100 ip_infos = [] - async with httpx.AsyncClient() as client: + async with make_async_client() as client: url = self.api_path + "?" + urlencode(self.params) utils.logger.info(f"[WanDouHttpProxy.get_proxy] get ip proxy url:{url}") response = await client.get( diff --git a/proxy/proxy_ip_pool.py b/proxy/proxy_ip_pool.py index 942b0e0..8f4a129 100644 --- a/proxy/proxy_ip_pool.py +++ b/proxy/proxy_ip_pool.py @@ -26,6 +26,7 @@ from typing import Dict, List import httpx from tenacity import retry, stop_after_attempt, wait_fixed +from tools.httpx_util import make_async_client import config from proxy.providers import ( @@ -81,7 +82,7 @@ class ProxyIpPool: else: proxy_url = f"http://{proxy.ip}:{proxy.port}" - async with httpx.AsyncClient(proxy=proxy_url) as client: + async with make_async_client(proxy=proxy_url) as client: response = await client.get(self.valid_ip_url) if response.status_code == 200: return True From 2970488f404149896e4ca0befa89e32feae8a9ad Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 18 Mar 2026 12:44:37 +1300 Subject: [PATCH 4/4] =?UTF-8?q?docs:=20=E5=B0=86=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E6=B3=A8=E9=87=8A=E5=92=8C=E6=96=87=E6=A1=A3=E6=94=B9=E4=B8=BA?= =?UTF-8?q?=E4=B8=AD=E6=96=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- config/base_config.py | 5 ++--- tools/httpx_util.py | 7 +++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/config/base_config.py b/config/base_config.py index 
7878ebf..83571e0 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -120,9 +120,8 @@ FONT_PATH = "./docs/STZHONGS.TTF" # Crawl interval CRAWLER_MAX_SLEEP_SEC = 2 -# Disable SSL certificate verification. Set to True only when running behind an intercepting -# proxy (corporate gateway, Burp Suite, mitmproxy, etc.) that injects its own certificate. -# WARNING: disabling SSL verification exposes all traffic to MITM attacks. +# 是否禁用 SSL 证书验证。仅在使用企业代理、Burp Suite、mitmproxy 等会注入自签名证书的中间人代理时设为 True。 +# 警告:禁用 SSL 验证将使所有流量暴露于中间人攻击风险,请勿在生产环境中开启。 DISABLE_SSL_VERIFY = False from .bilibili_config import * diff --git a/tools/httpx_util.py b/tools/httpx_util.py index 7647d06..7d32ea6 100644 --- a/tools/httpx_util.py +++ b/tools/httpx_util.py @@ -4,11 +4,10 @@ import config def make_async_client(**kwargs) -> httpx.AsyncClient: - """Create an httpx.AsyncClient with project-wide defaults. + """创建统一配置的 httpx.AsyncClient。 - Reads DISABLE_SSL_VERIFY from config (default False). - Set DISABLE_SSL_VERIFY = True in config/base_config.py only when running - behind an intercepting proxy (corporate gateway, Burp, mitmproxy, etc.). + 从配置文件读取 DISABLE_SSL_VERIFY(默认 False,即开启 SSL 验证)。 + 仅在使用企业代理、Burp、mitmproxy 等中间人代理时才需将其设为 True。 """ kwargs.setdefault("verify", not getattr(config, "DISABLE_SSL_VERIFY", False)) return httpx.AsyncClient(**kwargs)