Merge pull request #680 from korruz/main

refactor: move format_proxy_info to utils and update crawler classes
This commit is contained in:
程序员阿江-Relakkes
2025-07-29 14:21:48 +08:00
committed by GitHub
8 changed files with 10 additions and 103 deletions

View File

@@ -63,7 +63,7 @@ class BilibiliCrawler(AbstractCrawler):
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
ip_proxy_info
)
@@ -547,25 +547,6 @@ class BilibiliCrawler(AbstractCrawler):
)
return bilibili_client_obj
@staticmethod
def format_proxy_info(
ip_proxy_info: IpInfoModel,
) -> Tuple[Optional[Dict], Optional[Dict]]:
"""
format proxy info for playwright and httpx
:param ip_proxy_info: ip proxy info
:return: playwright proxy, httpx proxy
"""
playwright_proxy = {
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
"username": ip_proxy_info.user,
"password": ip_proxy_info.password,
}
httpx_proxy = {
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
}
return playwright_proxy, httpx_proxy
async def launch_browser(
self,
chromium: BrowserType,

View File

@@ -54,7 +54,7 @@ class DouYinCrawler(AbstractCrawler):
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
ip_proxy_info
)
@@ -276,21 +276,6 @@ class DouYinCrawler(AbstractCrawler):
if aweme_item is not None:
await douyin_store.update_douyin_aweme(aweme_item)
@staticmethod
def format_proxy_info(
ip_proxy_info: IpInfoModel,
) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx"""
playwright_proxy = {
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
"username": ip_proxy_info.user,
"password": ip_proxy_info.password,
}
httpx_proxy = {
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
}
return playwright_proxy, httpx_proxy
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
"""Create douyin client"""
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore

View File

@@ -55,7 +55,7 @@ class KuaishouCrawler(AbstractCrawler):
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
ip_proxy_info
)
@@ -259,21 +259,6 @@ class KuaishouCrawler(AbstractCrawler):
browser_context=self.browser_context
)
@staticmethod
def format_proxy_info(
ip_proxy_info: IpInfoModel,
) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx"""
playwright_proxy = {
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
"username": ip_proxy_info.user,
"password": ip_proxy_info.password,
}
httpx_proxy = {
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
}
return playwright_proxy, httpx_proxy
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
"""Create ks client"""
utils.logger.info(

View File

@@ -30,7 +30,6 @@ from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import tieba as tieba_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from tools.crawler_util import format_proxy_info
from var import crawler_type_var, source_keyword_var
from .client import BaiduTieBaClient
@@ -66,7 +65,7 @@ class TieBaCrawler(AbstractCrawler):
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
_, httpx_proxy_format = format_proxy_info(ip_proxy_info)
_, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
utils.logger.info(
f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}"
)

View File

@@ -64,7 +64,7 @@ class WeiboCrawler(AbstractCrawler):
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
ip_proxy_info
)
@@ -367,21 +367,6 @@ class WeiboCrawler(AbstractCrawler):
)
return weibo_client_obj
@staticmethod
def format_proxy_info(
ip_proxy_info: IpInfoModel,
) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx"""
playwright_proxy = {
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
"username": ip_proxy_info.user,
"password": ip_proxy_info.password,
}
httpx_proxy = {
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
}
return playwright_proxy, httpx_proxy
async def launch_browser(
self,
chromium: BrowserType,

View File

@@ -61,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
ip_proxy_info
)
@@ -378,21 +378,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
@staticmethod
def format_proxy_info(
ip_proxy_info: IpInfoModel,
) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx"""
playwright_proxy = {
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
"username": ip_proxy_info.user,
"password": ip_proxy_info.password,
}
httpx_proxy = {
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
}
return playwright_proxy, httpx_proxy
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
"""Create xhs client"""
utils.logger.info(

View File

@@ -65,7 +65,7 @@ class ZhihuCrawler(AbstractCrawler):
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
ip_proxy_info
)
@@ -351,21 +351,6 @@ class ZhihuCrawler(AbstractCrawler):
await self.batch_get_content_comments(need_get_comment_notes)
@staticmethod
def format_proxy_info(
ip_proxy_info: IpInfoModel,
) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx"""
playwright_proxy = {
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
"username": ip_proxy_info.user,
"password": ip_proxy_info.password,
}
httpx_proxy = {
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
}
return playwright_proxy, httpx_proxy
async def create_zhihu_client(self, httpx_proxy: Optional[str]) -> ZhiHuClient:
"""Create zhihu client"""
utils.logger.info(

View File

@@ -27,6 +27,8 @@ import httpx
from PIL import Image, ImageDraw
from playwright.async_api import Cookie, Page
from proxy.proxy_ip_pool import IpInfoModel
from . import utils
@@ -171,7 +173,7 @@ def match_interact_info_count(count_str: str) -> int:
return 0
def format_proxy_info(ip_proxy_info) -> Tuple[Optional[Dict], Optional[Dict]]:
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx"""
playwright_proxy = {
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",