Mirror of https://github.com/NanmiCoder/MediaCrawler.git
Synced 2026-04-22 03:37:38 +08:00

Commit: feat: ip proxy expired check

This commit is contained in:
@@ -20,7 +20,7 @@
 # -*- coding: utf-8 -*-
 import asyncio
 import json
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 from urllib.parse import urlencode
 
 import httpx
@@ -32,14 +32,18 @@ import config
 from base.base_crawler import AbstractApiClient
 from constant import zhihu as zhihu_constant
 from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
+from proxy.proxy_mixin import ProxyRefreshMixin
 from tools import utils
 
+if TYPE_CHECKING:
+    from proxy.proxy_ip_pool import ProxyIpPool
+
 from .exception import DataFetchError, ForbiddenError
 from .field import SearchSort, SearchTime, SearchType
 from .help import ZhihuExtractor, sign
 
 
-class ZhiHuClient(AbstractApiClient):
+class ZhiHuClient(AbstractApiClient, ProxyRefreshMixin):
 
     def __init__(
         self,
@@ -49,12 +53,15 @@ class ZhiHuClient(AbstractApiClient):
         headers: Dict[str, str],
         playwright_page: Page,
         cookie_dict: Dict[str, str],
+        proxy_ip_pool: Optional["ProxyIpPool"] = None,
     ):
         self.proxy = proxy
         self.timeout = timeout
         self.default_headers = headers
         self.cookie_dict = cookie_dict
         self._extractor = ZhihuExtractor()
+        # 初始化代理池(来自 ProxyRefreshMixin)
+        self.init_proxy_pool(proxy_ip_pool)
 
     async def _pre_headers(self, url: str) -> Dict:
         """
@@ -85,6 +92,9 @@ class ZhiHuClient(AbstractApiClient):
         Returns:
 
         """
+        # 每次请求前检测代理是否过期
+        await self._refresh_proxy_if_expired()
+
         # return response.text
         return_response = kwargs.pop('return_response', False)
 
@@ -61,6 +61,7 @@ class ZhihuCrawler(AbstractCrawler):
         self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
         self._extractor = ZhihuExtractor()
         self.cdp_manager = None
+        self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
 
     async def start(self) -> None:
         """
@@ -70,10 +71,10 @@ class ZhihuCrawler(AbstractCrawler):
         """
         playwright_proxy_format, httpx_proxy_format = None, None
         if config.ENABLE_IP_PROXY:
-            ip_proxy_pool = await create_ip_pool(
+            self.ip_proxy_pool = await create_ip_pool(
                 config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
             )
-            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
+            ip_proxy_info: IpInfoModel = await self.ip_proxy_pool.get_proxy()
             playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
                 ip_proxy_info
            )
@@ -411,6 +412,7 @@ class ZhihuCrawler(AbstractCrawler):
             },
             playwright_page=self.context_page,
             cookie_dict=cookie_dict,
+            proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
         )
         return zhihu_client_obj
 
Reference in New Issue
Block a user