Mirror of https://github.com/NanmiCoder/MediaCrawler.git (last synced 2026-06-09 03:17:25 +08:00)
refactor: move format_proxy_info to utils and update crawler classes to use it
This commit is contained in:
@@ -63,7 +63,7 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||||
)
|
)
|
||||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||||
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
|
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
|
||||||
ip_proxy_info
|
ip_proxy_info
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -547,25 +547,6 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
)
|
)
|
||||||
return bilibili_client_obj
|
return bilibili_client_obj
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def format_proxy_info(
|
|
||||||
ip_proxy_info: IpInfoModel,
|
|
||||||
) -> Tuple[Optional[Dict], Optional[Dict]]:
|
|
||||||
"""
|
|
||||||
format proxy info for playwright and httpx
|
|
||||||
:param ip_proxy_info: ip proxy info
|
|
||||||
:return: playwright proxy, httpx proxy
|
|
||||||
"""
|
|
||||||
playwright_proxy = {
|
|
||||||
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
|
||||||
"username": ip_proxy_info.user,
|
|
||||||
"password": ip_proxy_info.password,
|
|
||||||
}
|
|
||||||
httpx_proxy = {
|
|
||||||
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
|
||||||
}
|
|
||||||
return playwright_proxy, httpx_proxy
|
|
||||||
|
|
||||||
async def launch_browser(
|
async def launch_browser(
|
||||||
self,
|
self,
|
||||||
chromium: BrowserType,
|
chromium: BrowserType,
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||||
)
|
)
|
||||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||||
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
|
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
|
||||||
ip_proxy_info
|
ip_proxy_info
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -276,21 +276,6 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
if aweme_item is not None:
|
if aweme_item is not None:
|
||||||
await douyin_store.update_douyin_aweme(aweme_item)
|
await douyin_store.update_douyin_aweme(aweme_item)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def format_proxy_info(
|
|
||||||
ip_proxy_info: IpInfoModel,
|
|
||||||
) -> Tuple[Optional[Dict], Optional[Dict]]:
|
|
||||||
"""format proxy info for playwright and httpx"""
|
|
||||||
playwright_proxy = {
|
|
||||||
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
|
||||||
"username": ip_proxy_info.user,
|
|
||||||
"password": ip_proxy_info.password,
|
|
||||||
}
|
|
||||||
httpx_proxy = {
|
|
||||||
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
|
||||||
}
|
|
||||||
return playwright_proxy, httpx_proxy
|
|
||||||
|
|
||||||
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
|
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
|
||||||
"""Create douyin client"""
|
"""Create douyin client"""
|
||||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
|
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class KuaishouCrawler(AbstractCrawler):
|
|||||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||||
)
|
)
|
||||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||||
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
|
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
|
||||||
ip_proxy_info
|
ip_proxy_info
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -259,21 +259,6 @@ class KuaishouCrawler(AbstractCrawler):
|
|||||||
browser_context=self.browser_context
|
browser_context=self.browser_context
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def format_proxy_info(
|
|
||||||
ip_proxy_info: IpInfoModel,
|
|
||||||
) -> Tuple[Optional[Dict], Optional[Dict]]:
|
|
||||||
"""format proxy info for playwright and httpx"""
|
|
||||||
playwright_proxy = {
|
|
||||||
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
|
||||||
"username": ip_proxy_info.user,
|
|
||||||
"password": ip_proxy_info.password,
|
|
||||||
}
|
|
||||||
httpx_proxy = {
|
|
||||||
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
|
||||||
}
|
|
||||||
return playwright_proxy, httpx_proxy
|
|
||||||
|
|
||||||
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
|
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
|
||||||
"""Create ks client"""
|
"""Create ks client"""
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
|
|||||||
@@ -30,7 +30,6 @@ from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
|||||||
from store import tieba as tieba_store
|
from store import tieba as tieba_store
|
||||||
from tools import utils
|
from tools import utils
|
||||||
from tools.cdp_browser import CDPBrowserManager
|
from tools.cdp_browser import CDPBrowserManager
|
||||||
from tools.crawler_util import format_proxy_info
|
|
||||||
from var import crawler_type_var, source_keyword_var
|
from var import crawler_type_var, source_keyword_var
|
||||||
|
|
||||||
from .client import BaiduTieBaClient
|
from .client import BaiduTieBaClient
|
||||||
@@ -66,7 +65,7 @@ class TieBaCrawler(AbstractCrawler):
|
|||||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||||
)
|
)
|
||||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||||
_, httpx_proxy_format = format_proxy_info(ip_proxy_info)
|
_, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}"
|
f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ class WeiboCrawler(AbstractCrawler):
|
|||||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||||
)
|
)
|
||||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||||
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
|
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
|
||||||
ip_proxy_info
|
ip_proxy_info
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -367,21 +367,6 @@ class WeiboCrawler(AbstractCrawler):
|
|||||||
)
|
)
|
||||||
return weibo_client_obj
|
return weibo_client_obj
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def format_proxy_info(
|
|
||||||
ip_proxy_info: IpInfoModel,
|
|
||||||
) -> Tuple[Optional[Dict], Optional[Dict]]:
|
|
||||||
"""format proxy info for playwright and httpx"""
|
|
||||||
playwright_proxy = {
|
|
||||||
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
|
||||||
"username": ip_proxy_info.user,
|
|
||||||
"password": ip_proxy_info.password,
|
|
||||||
}
|
|
||||||
httpx_proxy = {
|
|
||||||
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
|
||||||
}
|
|
||||||
return playwright_proxy, httpx_proxy
|
|
||||||
|
|
||||||
async def launch_browser(
|
async def launch_browser(
|
||||||
self,
|
self,
|
||||||
chromium: BrowserType,
|
chromium: BrowserType,
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||||
)
|
)
|
||||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||||
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
|
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
|
||||||
ip_proxy_info
|
ip_proxy_info
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -378,21 +378,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def format_proxy_info(
|
|
||||||
ip_proxy_info: IpInfoModel,
|
|
||||||
) -> Tuple[Optional[Dict], Optional[Dict]]:
|
|
||||||
"""format proxy info for playwright and httpx"""
|
|
||||||
playwright_proxy = {
|
|
||||||
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
|
||||||
"username": ip_proxy_info.user,
|
|
||||||
"password": ip_proxy_info.password,
|
|
||||||
}
|
|
||||||
httpx_proxy = {
|
|
||||||
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
|
||||||
}
|
|
||||||
return playwright_proxy, httpx_proxy
|
|
||||||
|
|
||||||
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
|
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
|
||||||
"""Create xhs client"""
|
"""Create xhs client"""
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ class ZhihuCrawler(AbstractCrawler):
|
|||||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||||
)
|
)
|
||||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||||
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
|
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
|
||||||
ip_proxy_info
|
ip_proxy_info
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -351,21 +351,6 @@ class ZhihuCrawler(AbstractCrawler):
|
|||||||
|
|
||||||
await self.batch_get_content_comments(need_get_comment_notes)
|
await self.batch_get_content_comments(need_get_comment_notes)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def format_proxy_info(
|
|
||||||
ip_proxy_info: IpInfoModel,
|
|
||||||
) -> Tuple[Optional[Dict], Optional[Dict]]:
|
|
||||||
"""format proxy info for playwright and httpx"""
|
|
||||||
playwright_proxy = {
|
|
||||||
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
|
||||||
"username": ip_proxy_info.user,
|
|
||||||
"password": ip_proxy_info.password,
|
|
||||||
}
|
|
||||||
httpx_proxy = {
|
|
||||||
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
|
||||||
}
|
|
||||||
return playwright_proxy, httpx_proxy
|
|
||||||
|
|
||||||
async def create_zhihu_client(self, httpx_proxy: Optional[str]) -> ZhiHuClient:
|
async def create_zhihu_client(self, httpx_proxy: Optional[str]) -> ZhiHuClient:
|
||||||
"""Create zhihu client"""
|
"""Create zhihu client"""
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
|
|||||||
@@ -27,6 +27,8 @@ import httpx
|
|||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from playwright.async_api import Cookie, Page
|
from playwright.async_api import Cookie, Page
|
||||||
|
|
||||||
|
from proxy.proxy_ip_pool import IpInfoModel
|
||||||
|
|
||||||
from . import utils
|
from . import utils
|
||||||
|
|
||||||
|
|
||||||
@@ -171,7 +173,7 @@ def match_interact_info_count(count_str: str) -> int:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def format_proxy_info(ip_proxy_info) -> Tuple[Optional[Dict], Optional[Dict]]:
|
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
|
||||||
"""format proxy info for playwright and httpx"""
|
"""format proxy info for playwright and httpx"""
|
||||||
playwright_proxy = {
|
playwright_proxy = {
|
||||||
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
||||||
|
|||||||
Reference in New Issue
Block a user