diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py
index 2b5d1bb..7ace00d 100644
--- a/media_platform/bilibili/client.py
+++ b/media_platform/bilibili/client.py
@@ -35,13 +35,13 @@ class BilibiliClient(AbstractApiClient):
     def __init__(
         self,
         timeout=60,  # 若开启爬取媒体选项,b 站的长视频需要更久的超时时间
-        proxies=None,
+        proxy=None,
         *,
         headers: Dict[str, str],
         playwright_page: Page,
         cookie_dict: Dict[str, str],
     ):
-        self.proxies = proxies
+        self.proxy = proxy
         self.timeout = timeout
         self.headers = headers
         self._host = "https://api.bilibili.com"
@@ -49,7 +49,7 @@ class BilibiliClient(AbstractApiClient):
         self.cookie_dict = cookie_dict
 
     async def request(self, method, url, **kwargs) -> Any:
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
             response = await client.request(method, url, timeout=self.timeout, **kwargs)
         try:
             data: Dict = response.json()
@@ -201,7 +201,7 @@ class BilibiliClient(AbstractApiClient):
         return await self.get(uri, params, enable_params_sign=True)
 
     async def get_video_media(self, url: str) -> Union[bytes, None]:
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
             response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
             if not response.reason_phrase == "OK":
                 utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index f025f61..2c5ed91 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -417,7 +417,7 @@ class BilibiliCrawler(AbstractCrawler):
         utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         bilibili_client_obj = BilibiliClient(
-            proxies=httpx_proxy,
+            proxy=httpx_proxy,
             headers={
                 "User-Agent": self.user_agent,
                 "Cookie": cookie_str,
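Background for the keyword rename above: httpx 0.28 removes the `proxies=` argument that 0.24 accepted, and `AsyncClient`/`Client` instead take a single `proxy=` URL (or an `httpx.Proxy`); per-scheme routing moves to `mounts=`. A minimal sketch of the two call shapes, with a made-up proxy URL for illustration:

    import httpx

    PROXY_URL = "http://user:pass@127.0.0.1:7890"  # hypothetical upstream proxy

    async def fetch(url: str) -> bytes:
        # httpx 0.28 style: one proxy URL for all outgoing traffic
        async with httpx.AsyncClient(proxy=PROXY_URL, timeout=60) as client:
            response = await client.get(url)
            return response.content

    # If different schemes ever need different proxies, 0.28 uses mounts
    # instead of the old {"http://": ..., "https://": ...} proxies mapping:
    #   mounts = {"all://": httpx.AsyncHTTPTransport(proxy=PROXY_URL)}
    #   async with httpx.AsyncClient(mounts=mounts) as client: ...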
diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py
index a788e97..4a3770a 100644
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -14,7 +14,7 @@ import json
 import urllib.parse
 from typing import Any, Callable, Dict, Union, Optional
 
-import requests
+import httpx
 from playwright.async_api import BrowserContext
 
 from base.base_crawler import AbstractApiClient
@@ -31,13 +31,13 @@ class DouYinClient(AbstractApiClient):
     def __init__(
         self,
         timeout=30,  # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间
-        proxies=None,
+        proxy=None,
         *,
         headers: Dict,
         playwright_page: Optional[Page],
         cookie_dict: Dict,
     ):
-        self.proxies = proxies
+        self.proxy = proxy
         self.timeout = timeout
         self.headers = headers
         self._host = "https://www.douyin.com"
@@ -95,7 +95,8 @@ class DouYinClient(AbstractApiClient):
             params["a_bogus"] = a_bogus
 
     async def request(self, method, url, **kwargs):
-        response = requests.request(method, url, timeout=self.timeout, **kwargs)
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            response = await client.request(method, url, timeout=self.timeout, **kwargs)
         try:
             if response.text == "" or response.text == "blocked":
                 utils.logger.error(f"request params incrr, response.text: {response.text}")
@@ -311,7 +312,7 @@ class DouYinClient(AbstractApiClient):
         return result
 
     async def get_aweme_media(self, url: str) -> Union[bytes, None]:
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
             response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
             if not response.reason_phrase == "OK":
                 utils.logger.error(f"[DouYinCrawler.get_aweme_media] request {url} err, res:{response.text}")
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 282c746..dc50aaf 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -237,7 +237,7 @@ class DouYinCrawler(AbstractCrawler):
         """Create douyin client"""
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())  # type: ignore
         douyin_client = DouYinClient(
-            proxies=httpx_proxy,
+            proxy=httpx_proxy,
             headers={
                 "User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
                 "Cookie": cookie_str,
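The douyin change is more than the keyword rename: the old `request()` issued a blocking `requests.request` call inside a coroutine, which stalls the event loop and ignores the configured proxy. A simplified sketch of the before/after shape (not the project's exact method):

    import httpx

    # before (blocking inside async code):
    #   response = requests.request(method, url, timeout=30)

    # after (awaitable and proxy-aware):
    async def request(method: str, url: str, proxy_url=None, **kwargs) -> httpx.Response:
        async with httpx.AsyncClient(proxy=proxy_url) as client:
            return await client.request(method, url, timeout=30, **kwargs)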
diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py
index a3fd0db..11401ed 100644
--- a/media_platform/kuaishou/client.py
+++ b/media_platform/kuaishou/client.py
@@ -30,13 +30,13 @@ class KuaiShouClient(AbstractApiClient):
     def __init__(
         self,
         timeout=10,
-        proxies=None,
+        proxy=None,
         *,
         headers: Dict[str, str],
         playwright_page: Page,
         cookie_dict: Dict[str, str],
     ):
-        self.proxies = proxies
+        self.proxy = proxy
         self.timeout = timeout
         self.headers = headers
         self._host = "https://www.kuaishou.com/graphql"
@@ -45,7 +45,7 @@ class KuaiShouClient(AbstractApiClient):
         self.graphql = KuaiShouGraphQL()
 
     async def request(self, method, url, **kwargs) -> Any:
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
             response = await client.request(method, url, timeout=self.timeout, **kwargs)
         data: Dict = response.json()
         if data.get("errors"):
diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py
index 0debef5..4ae1d63 100644
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@@ -268,7 +268,7 @@ class KuaishouCrawler(AbstractCrawler):
             await self.browser_context.cookies()
         )
         ks_client_obj = KuaiShouClient(
-            proxies=httpx_proxy,
+            proxy=httpx_proxy,
             headers={
                 "User-Agent": self.user_agent,
                 "Cookie": cookie_str,
"only_thread": note_type.value + "only_thread": note_type.value, } page_content = await self.get(uri, params=params, return_ori_content=True) return self._page_extractor.extract_search_note_list(page_content) @@ -203,10 +195,13 @@ class BaiduTieBaClient(AbstractApiClient): page_content = await self.get(uri, return_ori_content=True) return self._page_extractor.extract_note_detail(page_content) - async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0, - callback: Optional[Callable] = None, - max_count: int = 10, - ) -> List[TiebaComment]: + async def get_note_all_comments( + self, + note_detail: TiebaNote, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + max_count: int = 10, + ) -> List[TiebaComment]: """ 获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息 Args: @@ -222,11 +217,10 @@ class BaiduTieBaClient(AbstractApiClient): current_page = 1 while note_detail.total_replay_page >= current_page and len(result) < max_count: params = { - "pn": current_page + "pn": current_page, } page_content = await self.get(uri, params=params, return_ori_content=True) - comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, - note_id=note_detail.note_id) + comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id) if not comments: break if len(result) + len(comments) > max_count: @@ -240,8 +234,12 @@ class BaiduTieBaClient(AbstractApiClient): current_page += 1 return result - async def get_comments_all_sub_comments(self, comments: List[TiebaComment], crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[TiebaComment]: + async def get_comments_all_sub_comments( + self, + comments: List[TiebaComment], + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + ) -> List[TiebaComment]: """ 获取指定评论下的所有子评论 Args: @@ -275,8 +273,7 @@ class BaiduTieBaClient(AbstractApiClient): "pn": current_page # 页码 } page_content = await self.get(uri, params=params, return_ori_content=True) - sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, - parent_comment=parment_comment) + sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, parent_comment=parment_comment) if not sub_comments: break @@ -328,17 +325,18 @@ class BaiduTieBaClient(AbstractApiClient): "un": user_name, "pn": page_number, "id": "utf-8", - "_": utils.get_current_timestamp() + "_": utils.get_current_timestamp(), } return await self.get(uri, params=params) - async def get_all_notes_by_creator_user_name(self, - user_name: str, crawl_interval: float = 1.0, - callback: Optional[Callable] = None, - max_note_count: int = 0, - creator_page_html_content: str = None, - ) -> List[TiebaNote]: - + async def get_all_notes_by_creator_user_name( + self, + user_name: str, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + max_note_count: int = 0, + creator_page_html_content: str = None, + ) -> List[TiebaNote]: """ 根据创作者用户名获取创作者所有帖子 Args: @@ -354,17 +352,9 @@ class BaiduTieBaClient(AbstractApiClient): # 百度贴吧比较特殊一些,前10个帖子是直接展示在主页上的,要单独处理,通过API获取不到 result: List[TiebaNote] = [] if creator_page_html_content: - thread_id_list = ( - self._page_extractor.extract_tieba_thread_id_list_from_creator_page( - creator_page_html_content - ) - ) - utils.logger.info( - f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}" - ) - note_detail_task = [ - self.get_note_by_id(thread_id) for thread_id in 
diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py
index 92906b3..a1404d1 100644
--- a/media_platform/weibo/client.py
+++ b/media_platform/weibo/client.py
@@ -36,13 +36,13 @@ class WeiboClient:
     def __init__(
         self,
         timeout=30,  # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间
-        proxies=None,
+        proxy=None,
         *,
         headers: Dict[str, str],
         playwright_page: Page,
         cookie_dict: Dict[str, str],
     ):
-        self.proxies = proxies
+        self.proxy = proxy
         self.timeout = timeout
         self.headers = headers
         self._host = "https://m.weibo.cn"
@@ -52,7 +52,7 @@ class WeiboClient:
 
     async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
         enable_return_response = kwargs.pop("return_response", False)
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
             response = await client.request(method, url, timeout=self.timeout, **kwargs)
 
         if enable_return_response:
@@ -217,7 +217,7 @@ class WeiboClient:
         :return:
         """
         url = f"{self._host}/detail/{note_id}"
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
             response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
             if response.status_code != 200:
                 raise DataFetchError(f"get weibo detail err: {response.text}")
@@ -247,7 +247,7 @@ class WeiboClient:
         # 由于微博图片是通过 i1.wp.com 来访问的,所以需要拼接一下
         final_uri = (f"{self._image_agent_host}"
                      f"{image_url}")
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
             response = await client.request("GET", final_uri, timeout=self.timeout)
             if not response.reason_phrase == "OK":
                 utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
index ed305cc..f789d33 100644
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -289,7 +289,7 @@ class WeiboCrawler(AbstractCrawler):
         utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         weibo_client_obj = WeiboClient(
-            proxies=httpx_proxy,
+            proxy=httpx_proxy,
             headers={
                 "User-Agent": utils.get_mobile_user_agent(),
                 "Cookie": cookie_str,
diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py
index 400962d..830c1d6 100644
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -33,13 +33,13 @@ class XiaoHongShuClient(AbstractApiClient):
     def __init__(
         self,
         timeout=30,  # 若开启爬取媒体选项,xhs 的长视频需要更久的超时时间
-        proxies=None,
+        proxy=None,
         *,
         headers: Dict[str, str],
         playwright_page: Page,
         cookie_dict: Dict[str, str],
     ):
-        self.proxies = proxies
+        self.proxy = proxy
         self.timeout = timeout
         self.headers = headers
         self._host = "https://edith.xiaohongshu.com"
@@ -93,7 +93,7 @@ class XiaoHongShuClient(AbstractApiClient):
         """
         # return response.text
         return_response = kwargs.pop("return_response", False)
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
             response = await client.request(method, url, timeout=self.timeout, **kwargs)
 
         if response.status_code == 471 or response.status_code == 461:
@@ -151,7 +151,7 @@ class XiaoHongShuClient(AbstractApiClient):
         )
 
     async def get_note_media(self, url: str) -> Union[bytes, None]:
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
             response = await client.request("GET", url, timeout=self.timeout)
             if not response.reason_phrase == "OK":
                 utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 2168239..0e9b0f8 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -328,7 +328,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
         utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         xhs_client_obj = XiaoHongShuClient(
-            proxies=httpx_proxy,
+            proxy=httpx_proxy,
             headers={
                 "accept": "application/json, text/plain, */*",
                 "accept-language": "zh-CN,zh;q=0.9",
diff --git a/media_platform/zhihu/client.py b/media_platform/zhihu/client.py
index 5991163..ac74fc8 100644
--- a/media_platform/zhihu/client.py
+++ b/media_platform/zhihu/client.py
@@ -1,13 +1,12 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: 
-# 1. 不得用于任何商业用途。 
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 
-# 3. 不得进行大规模爬取或对平台造成运营干扰。 
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
-# 
-# 详细许可条款请参阅项目根目录下的LICENSE文件。 
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 
-
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 # -*- coding: utf-8 -*-
 
 import asyncio
@@ -32,16 +31,17 @@ from .help import ZhihuExtractor, sign
 
 
 class ZhiHuClient(AbstractApiClient):
+
     def __init__(
-            self,
-            timeout=10,
-            proxies=None,
-            *,
-            headers: Dict[str, str],
-            playwright_page: Page,
-            cookie_dict: Dict[str, str],
+        self,
+        timeout=10,
+        proxy=None,
+        *,
+        headers: Dict[str, str],
+        playwright_page: Page,
+        cookie_dict: Dict[str, str],
     ):
-        self.proxies = proxies
+        self.proxy = proxy
         self.timeout = timeout
         self.default_headers = headers
         self.cookie_dict = cookie_dict
@@ -79,17 +79,14 @@ class ZhiHuClient(AbstractApiClient):
         """
         # return response.text
         return_response = kwargs.pop('return_response', False)
-        async with httpx.AsyncClient(proxies=self.proxies, ) as client:
-            response = await client.request(
-                method, url, timeout=self.timeout,
-                **kwargs
-            )
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            response = await client.request(method, url, timeout=self.timeout, **kwargs)
 
         if response.status_code != 200:
             utils.logger.error(f"[ZhiHuClient.request] Requset Url: {url}, Request error: {response.text}")
             if response.status_code == 403:
                 raise ForbiddenError(response.text)
-            elif response.status_code == 404: # 如果一个content没有评论也是404
+            elif response.status_code == 404:  # 如果一个content没有评论也是404
                 return {}
 
             raise DataFetchError(response.text)
@@ -106,7 +103,6 @@ class ZhiHuClient(AbstractApiClient):
             utils.logger.error(f"[ZhiHuClient.request] Request error: {response.text}")
             raise DataFetchError(response.text)
 
-
     async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, str]:
         """
         GET请求,对请求头签名
@@ -121,11 +117,7 @@ class ZhiHuClient(AbstractApiClient):
         if isinstance(params, dict):
             final_uri += '?' + urlencode(params)
         headers = await self._pre_headers(final_uri)
-        base_url = (
-            zhihu_constant.ZHIHU_URL
-            if "/p/" not in uri
-            else zhihu_constant.ZHIHU_ZHUANLAN_URL
-        )
+        base_url = (zhihu_constant.ZHIHU_URL if "/p/" not in uri else zhihu_constant.ZHIHU_ZHUANLAN_URL)
         return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)
 
     async def pong(self) -> bool:
@@ -167,18 +159,17 @@ class ZhiHuClient(AbstractApiClient):
         Returns:
 
         """
-        params = {
-            "include": "email,is_active,is_bind_phone"
-        }
+        params = {"include": "email,is_active,is_bind_phone"}
         return await self.get("/api/v4/me", params)
 
     async def get_note_by_keyword(
-            self, keyword: str,
-            page: int = 1,
-            page_size: int = 20,
-            sort: SearchSort = SearchSort.DEFAULT,
-            note_type: SearchType = SearchType.DEFAULT,
-            search_time: SearchTime = SearchTime.DEFAULT
+        self,
+        keyword: str,
+        page: int = 1,
+        page_size: int = 20,
+        sort: SearchSort = SearchSort.DEFAULT,
+        note_type: SearchType = SearchType.DEFAULT,
+        search_time: SearchTime = SearchTime.DEFAULT,
     ) -> List[ZhihuContent]:
         """
         根据关键词搜索
@@ -213,8 +204,14 @@ class ZhiHuClient(AbstractApiClient):
         utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
         return self._extractor.extract_contents_from_search(search_res)
 
-    async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10,
-                                order_by: str = "score") -> Dict:
+    async def get_root_comments(
+        self,
+        content_id: str,
+        content_type: str,
+        offset: str = "",
+        limit: int = 10,
+        order_by: str = "score",
+    ) -> Dict:
         """
         获取内容的一级评论
         Args:
@@ -238,8 +235,13 @@ class ZhiHuClient(AbstractApiClient):
         # }
         # return await self.get(uri, params)
 
-    async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10,
-                                 order_by: str = "sort") -> Dict:
+    async def get_child_comments(
+        self,
+        root_comment_id: str,
+        offset: str = "",
+        limit: int = 10,
+        order_by: str = "sort",
+    ) -> Dict:
         """
         获取一级评论下的子评论
         Args:
@@ -255,12 +257,16 @@ class ZhiHuClient(AbstractApiClient):
         params = {
             "order": order_by,
             "offset": offset,
-            "limit": limit
+            "limit": limit,
         }
         return await self.get(uri, params)
 
-    async def get_note_all_comments(self, content: ZhihuContent, crawl_interval: float = 1.0,
-                                    callback: Optional[Callable] = None) -> List[ZhihuComment]:
+    async def get_note_all_comments(
+        self,
+        content: ZhihuContent,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+    ) -> List[ZhihuComment]:
         """
         获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
         Args:
@@ -295,8 +301,13 @@ class ZhiHuClient(AbstractApiClient):
             await asyncio.sleep(crawl_interval)
         return result
 
-    async def get_comments_all_sub_comments(self, content: ZhihuContent, comments: List[ZhihuComment], crawl_interval: float = 1.0,
-                                            callback: Optional[Callable] = None) -> List[ZhihuComment]:
+    async def get_comments_all_sub_comments(
+        self,
+        content: ZhihuContent,
+        comments: List[ZhihuComment],
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+    ) -> List[ZhihuComment]:
         """
         获取指定评论下的所有子评论
         Args:
"include":"data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,excerpt,paid_info,reaction_instruction,is_labeled,label_info,relationship.is_authorized,voting,is_author,is_thanked,is_nothelp;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;data[*].question.has_publishing_draft,relationship", + "include": + "data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,excerpt,paid_info,reaction_instruction,is_labeled,label_info,relationship.is_authorized,voting,is_author,is_thanked,is_nothelp;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;data[*].question.has_publishing_draft,relationship", "offset": offset, "limit": limit, "order_by": "created" @@ -385,7 +397,8 @@ class ZhiHuClient(AbstractApiClient): """ uri = f"/api/v4/members/{url_token}/articles" params = { - "include":"data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,reaction_instruction,is_labeled,label_info;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;", + "include": + "data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,reaction_instruction,is_labeled,label_info;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;", "offset": offset, "limit": limit, "order_by": "created" @@ -405,15 +418,14 @@ class ZhiHuClient(AbstractApiClient): """ uri = f"/api/v4/members/{url_token}/zvideos" params = { - "include":"similar_zvideo,creation_relationship,reaction_instruction", + "include": "similar_zvideo,creation_relationship,reaction_instruction", "offset": offset, "limit": limit, - "similar_aggregation": "true" + "similar_aggregation": "true", } return await self.get(uri, params) - async def get_all_anwser_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[ZhihuContent]: + async def get_all_anwser_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[ZhihuContent]: """ 获取创作者的所有回答 Args: @@ -443,9 +455,12 @@ class ZhiHuClient(AbstractApiClient): await asyncio.sleep(crawl_interval) return all_contents - - async def get_all_articles_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[ZhihuContent]: + async def get_all_articles_by_creator( + self, + creator: ZhihuCreator, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + ) -> List[ZhihuContent]: """ 获取创作者的所有文章 Args: @@ -474,9 +489,12 @@ class ZhiHuClient(AbstractApiClient): await asyncio.sleep(crawl_interval) return all_contents - - async def get_all_videos_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, - callback: 
diff --git a/proxy/base_proxy.py b/proxy/base_proxy.py
index 17dc19f..b6f0027 100644
--- a/proxy/base_proxy.py
+++ b/proxy/base_proxy.py
@@ -32,7 +32,7 @@ class IpGetError(Exception):
 class ProxyProvider(ABC):
 
     @abstractmethod
-    async def get_proxies(self, num: int) -> List[IpInfoModel]:
+    async def get_proxy(self, num: int) -> List[IpInfoModel]:
         """
         获取 IP 的抽象方法,不同的 HTTP 代理商需要实现该方法
         :param num: 提取的 IP 数量
diff --git a/proxy/providers/jishu_http_proxy.py b/proxy/providers/jishu_http_proxy.py
index baba792..1a84c08 100644
--- a/proxy/providers/jishu_http_proxy.py
+++ b/proxy/providers/jishu_http_proxy.py
@@ -1,13 +1,12 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: 
-# 1. 不得用于任何商业用途。 
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 
-# 3. 不得进行大规模爬取或对平台造成运营干扰。 
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
-# 
-# 详细许可条款请参阅项目根目录下的LICENSE文件。 
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 
-
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 # -*- coding: utf-8 -*-
 
 # @Author  : relakkes@gmail.com
@@ -25,6 +24,7 @@ from tools import utils
 
 
 class JiSuHttpProxy(ProxyProvider):
+
     def __init__(self, key: str, crypto: str, time_validity_period: int):
         """
         极速HTTP 代理IP实现
@@ -44,7 +44,7 @@ class JiSuHttpProxy(ProxyProvider):
         }
         self.ip_cache = IpCache()
 
-    async def get_proxies(self, num: int) -> List[IpInfoModel]:
+    async def get_proxy(self, num: int) -> List[IpInfoModel]:
         """
         :param num:
         :return:
@@ -61,9 +61,10 @@ class JiSuHttpProxy(ProxyProvider):
         ip_infos = []
         async with httpx.AsyncClient() as client:
             url = self.api_path + "/fetchips" + '?' + urlencode(self.params)
-            utils.logger.info(f"[JiSuHttpProxy.get_proxies] get ip proxy url:{url}")
+            utils.logger.info(f"[JiSuHttpProxy.get_proxy] get ip proxy url:{url}")
             response = await client.get(url, headers={
-                "User-Agent": "MediaCrawler https://github.com/NanmiCoder/MediaCrawler"})
+                "User-Agent": "MediaCrawler https://github.com/NanmiCoder/MediaCrawler",
+            })
             res_dict: Dict = response.json()
             if res_dict.get("code") == 0:
                 data: List[Dict] = res_dict.get("data")
@@ -74,7 +75,7 @@ class JiSuHttpProxy(ProxyProvider):
                         port=ip_item.get("port"),
                         user=ip_item.get("user"),
                         password=ip_item.get("pass"),
-                        expired_time_ts=utils.get_unix_time_from_time_str(ip_item.get("expire"))
+                        expired_time_ts=utils.get_unix_time_from_time_str(ip_item.get("expire")),
                     )
                     ip_key = f"JISUHTTP_{ip_info_model.ip}_{ip_info_model.port}_{ip_info_model.user}_{ip_info_model.password}"
                     ip_value = ip_info_model.json()
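The `get_proxies` to `get_proxy` rename has to land on the abstract base class and on every concrete provider at once, because ProxyIpPool calls the method through the ABC. A minimal illustration of the contract (IpInfoModel stands in for proxy.types.IpInfoModel; the dummy provider is hypothetical):

    from abc import ABC, abstractmethod
    from typing import List


    class IpInfoModel:  # stand-in for proxy.types.IpInfoModel
        pass


    class ProxyProvider(ABC):

        @abstractmethod
        async def get_proxy(self, num: int) -> List[IpInfoModel]:
            """Fetch `num` proxy IPs from the vendor API."""


    class DummyProvider(ProxyProvider):
        """Hypothetical provider; renaming only the ABC, or only one provider,
        surfaces as an AttributeError when the pool calls get_proxy()."""

        async def get_proxy(self, num: int) -> List[IpInfoModel]:
            return [IpInfoModel() for _ in range(num)]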
diff --git a/proxy/providers/kuaidl_proxy.py b/proxy/providers/kuaidl_proxy.py
index 9578588..8ca1062 100644
--- a/proxy/providers/kuaidl_proxy.py
+++ b/proxy/providers/kuaidl_proxy.py
@@ -80,7 +80,7 @@ class KuaiDaiLiProxy(ProxyProvider):
             "f_et": 1,
         }
 
-    async def get_proxies(self, num: int) -> List[IpInfoModel]:
+    async def get_proxy(self, num: int) -> List[IpInfoModel]:
         """
         快代理实现
         Args:
diff --git a/proxy/proxy_ip_pool.py b/proxy/proxy_ip_pool.py
index a03c17b..956739c 100644
--- a/proxy/proxy_ip_pool.py
+++ b/proxy/proxy_ip_pool.py
@@ -1,13 +1,12 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: 
-# 1. 不得用于任何商业用途。 
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 
-# 3. 不得进行大规模爬取或对平台造成运营干扰。 
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
-# 
-# 详细许可条款请参阅项目根目录下的LICENSE文件。 
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 
-
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 # -*- coding: utf-8 -*-
 
 # @Author  : relakkes@gmail.com
@@ -28,6 +27,7 @@ from .types import IpInfoModel, ProviderNameEnum
 
 
 class ProxyIpPool:
+
     def __init__(self, ip_pool_count: int, enable_validate_ip: bool, ip_provider: ProxyProvider) -> None:
         """
 
@@ -48,7 +48,7 @@ class ProxyIpPool:
         Returns:
 
         """
-        self.proxy_list = await self.ip_provider.get_proxies(self.ip_pool_count)
+        self.proxy_list = await self.ip_provider.get_proxy(self.ip_pool_count)
 
     async def _is_valid_proxy(self, proxy: IpInfoModel) -> bool:
         """
@@ -59,9 +59,9 @@ class ProxyIpPool:
         utils.logger.info(f"[ProxyIpPool._is_valid_proxy] testing {proxy.ip} is it valid ")
         try:
             httpx_proxy = {
-                f"{proxy.protocol}": f"http://{proxy.user}:{proxy.password}@{proxy.ip}:{proxy.port}"
+                f"{proxy.protocol}": f"http://{proxy.user}:{proxy.password}@{proxy.ip}:{proxy.port}",
             }
-            async with httpx.AsyncClient(proxies=httpx_proxy) as client:
+            async with httpx.AsyncClient(proxy=httpx_proxy) as client:
                 response = await client.get(self.valid_ip_url)
                 if response.status_code == 200:
                     return True
@@ -81,7 +81,7 @@ class ProxyIpPool:
             await self._reload_proxies()
 
         proxy = random.choice(self.proxy_list)
-        self.proxy_list.remove(proxy) # 取出来一个IP就应该移出掉
+        self.proxy_list.remove(proxy)  # 取出来一个IP就应该移出掉
         if self.enable_validate_ip:
             if not await self._is_valid_proxy(proxy):
                 raise Exception("[ProxyIpPool.get_proxy] current ip invalid and again get it")
@@ -98,7 +98,7 @@
 
 IpProxyProvider: Dict[str, ProxyProvider] = {
     ProviderNameEnum.JISHU_HTTP_PROVIDER.value: new_jisu_http_proxy(),
-    ProviderNameEnum.KUAI_DAILI_PROVIDER.value: new_kuai_daili_proxy()
+    ProviderNameEnum.KUAI_DAILI_PROVIDER.value: new_kuai_daili_proxy(),
 }
 
 
@@ -109,10 +109,11 @@ async def create_ip_pool(ip_pool_count: int, enable_validate_ip: bool) -> ProxyI
     :param enable_validate_ip: 是否开启验证IP代理
     :return:
     """
-    pool = ProxyIpPool(ip_pool_count=ip_pool_count,
-                       enable_validate_ip=enable_validate_ip,
-                       ip_provider=IpProxyProvider.get(config.IP_PROXY_PROVIDER_NAME)
-                       )
+    pool = ProxyIpPool(
+        ip_pool_count=ip_pool_count,
+        enable_validate_ip=enable_validate_ip,
+        ip_provider=IpProxyProvider.get(config.IP_PROXY_PROVIDER_NAME),
+    )
     await pool.load_proxies()
     return pool
 
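One spot worth double-checking after this migration is `_is_valid_proxy`: httpx 0.28's `proxy=` expects a single proxy URL (a str, httpx.URL, or httpx.Proxy), while the scheme-to-URL dict built above is the shape the removed `proxies=` argument used to take. A hedged sketch of validating a proxy record by passing the URL directly (field names mirror the ones used in this file; the check URL is illustrative, not the project's `valid_ip_url`):

    import httpx


    async def is_valid_proxy(user: str, password: str, ip: str, port: int,
                             check_url: str = "https://www.baidu.com") -> bool:
        # httpx 0.28 takes one proxy URL; the {"scheme": "url"} mapping was the
        # shape of the removed `proxies=` argument.
        proxy_url = f"http://{user}:{password}@{ip}:{port}"
        async with httpx.AsyncClient(proxy=proxy_url) as client:
            response = await client.get(check_url, timeout=10)
            return response.status_code == 200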
diff --git a/pyproject.toml b/pyproject.toml
index 3ac862a..cc36c70 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ dependencies = [
     "aiomysql==0.2.0",
     "aiosqlite>=0.21.0",
     "fastapi==0.110.2",
-    "httpx==0.24.0",
+    "httpx==0.28.1",
     "jieba==0.42.1",
     "matplotlib==3.9.0",
    "opencv-python>=4.11.0.86",
diff --git a/requirements.txt b/requirements.txt
index 9564bcf..4c4aff7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-httpx==0.24.0
+httpx==0.28.1
 Pillow==9.5.0
 playwright==1.45.0
 tenacity==8.2.2
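Keeping pyproject.toml and requirements.txt on the same httpx pin matters here: the `proxy=` keyword does not exist in 0.24 and `proxies=` is gone in 0.28, so a mismatched environment fails as soon as a client is constructed. A quick, purely illustrative sanity check for the installed interpreter:

    # Illustrative check that the installed httpx matches the new pin and
    # exposes `proxy` (and no longer `proxies`) on AsyncClient.
    import inspect

    import httpx

    params = inspect.signature(httpx.AsyncClient.__init__).parameters
    assert httpx.__version__.startswith("0.28"), httpx.__version__
    assert "proxy" in params and "proxies" not in params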