mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-09 11:27:26 +08:00
feat: ip proxy expired check
This commit is contained in:
@@ -26,7 +26,7 @@ import asyncio
|
||||
import copy
|
||||
import json
|
||||
import re
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import parse_qs, unquote, urlencode
|
||||
|
||||
import httpx
|
||||
@@ -35,13 +35,17 @@ from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from proxy.proxy_mixin import ProxyRefreshMixin
|
||||
from tools import utils
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from proxy.proxy_ip_pool import ProxyIpPool
|
||||
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchType
|
||||
|
||||
|
||||
class WeiboClient:
|
||||
class WeiboClient(ProxyRefreshMixin):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -51,6 +55,7 @@ class WeiboClient:
|
||||
headers: Dict[str, str],
|
||||
playwright_page: Page,
|
||||
cookie_dict: Dict[str, str],
|
||||
proxy_ip_pool: Optional["ProxyIpPool"] = None,
|
||||
):
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
@@ -59,9 +64,14 @@ class WeiboClient:
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
self._image_agent_host = "https://i1.wp.com/"
|
||||
# 初始化代理池(来自 ProxyRefreshMixin)
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
@retry(stop=stop_after_attempt(5), wait=wait_fixed(3))
|
||||
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
|
||||
# 每次请求前检测代理是否过期
|
||||
await self._refresh_proxy_if_expired()
|
||||
|
||||
enable_return_response = kwargs.pop("return_response", False)
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
|
||||
@@ -63,12 +63,13 @@ class WeiboCrawler(AbstractCrawler):
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.mobile_user_agent = utils.get_mobile_user_agent()
|
||||
self.cdp_manager = None
|
||||
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
|
||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||
self.ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
|
||||
ip_proxy_info: IpInfoModel = await self.ip_proxy_pool.get_proxy()
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
@@ -334,6 +335,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
|
||||
)
|
||||
return weibo_client_obj
|
||||
|
||||
|
||||
Reference in New Issue
Block a user