mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-06 01:47:26 +08:00
refactor: weibo login
This commit is contained in:
@@ -12,7 +12,7 @@
|
|||||||
# 微博平台配置
|
# 微博平台配置
|
||||||
|
|
||||||
# 搜索类型,具体的枚举值在media_platform/weibo/field.py中
|
# 搜索类型,具体的枚举值在media_platform/weibo/field.py中
|
||||||
WEIBO_SEARCH_TYPE = "popular"
|
WEIBO_SEARCH_TYPE = "default"
|
||||||
|
|
||||||
# 指定微博ID列表
|
# 指定微博ID列表
|
||||||
WEIBO_SPECIFIED_ID_LIST = [
|
WEIBO_SPECIFIED_ID_LIST = [
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ from urllib.parse import parse_qs, unquote, urlencode
|
|||||||
import httpx
|
import httpx
|
||||||
from httpx import Response
|
from httpx import Response
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
|
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||||
|
|
||||||
import config
|
import config
|
||||||
from tools import utils
|
from tools import utils
|
||||||
@@ -50,6 +51,7 @@ class WeiboClient:
|
|||||||
self.cookie_dict = cookie_dict
|
self.cookie_dict = cookie_dict
|
||||||
self._image_agent_host = "https://i1.wp.com/"
|
self._image_agent_host = "https://i1.wp.com/"
|
||||||
|
|
||||||
|
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
|
||||||
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
|
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
|
||||||
enable_return_response = kwargs.pop("return_response", False)
|
enable_return_response = kwargs.pop("return_response", False)
|
||||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||||
@@ -58,7 +60,12 @@ class WeiboClient:
|
|||||||
if enable_return_response:
|
if enable_return_response:
|
||||||
return response
|
return response
|
||||||
|
|
||||||
data: Dict = response.json()
|
try:
|
||||||
|
data: Dict = response.json()
|
||||||
|
except json.decoder.JSONDecodeError:
|
||||||
|
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err code: {response.status_code} res:{response.text}")
|
||||||
|
raise DataFetchError(f"get response code error: {response.status_code}")
|
||||||
|
|
||||||
ok_code = data.get("ok")
|
ok_code = data.get("ok")
|
||||||
if ok_code == 0: # response error
|
if ok_code == 0: # response error
|
||||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
|
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
|
||||||
@@ -99,10 +106,24 @@ class WeiboClient:
|
|||||||
ping_flag = False
|
ping_flag = False
|
||||||
return ping_flag
|
return ping_flag
|
||||||
|
|
||||||
async def update_cookies(self, browser_context: BrowserContext):
|
async def update_cookies(self, browser_context: BrowserContext, urls: Optional[List[str]] = None):
|
||||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
"""
|
||||||
|
Update cookies from browser context
|
||||||
|
:param browser_context: Browser context
|
||||||
|
:param urls: Optional list of URLs to filter cookies (e.g., ["https://m.weibo.cn"])
|
||||||
|
If provided, only cookies for these URLs will be retrieved
|
||||||
|
"""
|
||||||
|
if urls:
|
||||||
|
cookies = await browser_context.cookies(urls=urls)
|
||||||
|
utils.logger.info(f"[WeiboClient.update_cookies] Updating cookies for specific URLs: {urls}")
|
||||||
|
else:
|
||||||
|
cookies = await browser_context.cookies()
|
||||||
|
utils.logger.info("[WeiboClient.update_cookies] Updating all cookies")
|
||||||
|
|
||||||
|
cookie_str, cookie_dict = utils.convert_cookies(cookies)
|
||||||
self.headers["Cookie"] = cookie_str
|
self.headers["Cookie"] = cookie_str
|
||||||
self.cookie_dict = cookie_dict
|
self.cookie_dict = cookie_dict
|
||||||
|
utils.logger.info(f"[WeiboClient.update_cookies] Cookie updated successfully, total: {len(cookie_dict)} cookies")
|
||||||
|
|
||||||
async def get_note_by_keyword(
|
async def get_note_by_keyword(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -83,7 +83,8 @@ class WeiboCrawler(AbstractCrawler):
|
|||||||
|
|
||||||
|
|
||||||
self.context_page = await self.browser_context.new_page()
|
self.context_page = await self.browser_context.new_page()
|
||||||
await self.context_page.goto(self.mobile_index_url)
|
await self.context_page.goto(self.index_url)
|
||||||
|
await asyncio.sleep(2)
|
||||||
|
|
||||||
# Create a client to interact with the Weibo website.
|
# Create a client to interact with the Weibo website.
|
||||||
self.wb_client = await self.create_weibo_client(httpx_proxy_format)
|
self.wb_client = await self.create_weibo_client(httpx_proxy_format)
|
||||||
@@ -100,8 +101,12 @@ class WeiboCrawler(AbstractCrawler):
|
|||||||
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
|
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
|
||||||
utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
|
utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
|
||||||
await self.context_page.goto(self.mobile_index_url)
|
await self.context_page.goto(self.mobile_index_url)
|
||||||
await asyncio.sleep(2)
|
await asyncio.sleep(3)
|
||||||
await self.wb_client.update_cookies(browser_context=self.browser_context)
|
# 只获取移动端的 cookies,避免 PC 端和移动端 cookies 混淆
|
||||||
|
await self.wb_client.update_cookies(
|
||||||
|
browser_context=self.browser_context,
|
||||||
|
urls=[self.mobile_index_url]
|
||||||
|
)
|
||||||
|
|
||||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||||
if config.CRAWLER_TYPE == "search":
|
if config.CRAWLER_TYPE == "search":
|
||||||
|
|||||||
@@ -120,14 +120,7 @@ def get_user_agent() -> str:
|
|||||||
|
|
||||||
def get_mobile_user_agent() -> str:
|
def get_mobile_user_agent() -> str:
|
||||||
ua_list = [
|
ua_list = [
|
||||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
|
"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Mobile/15E148 Safari/604.1"
|
||||||
"Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
|
|
||||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.99 Mobile/15E148 Safari/604.1",
|
|
||||||
"Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.124 Mobile/15E148 Safari/604.1",
|
|
||||||
"Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/21.0 Chrome/110.0.5481.154 Mobile Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
|
|
||||||
"Mozilla/5.0 (Linux; Android 10; JNY-LX1; HMSCore 6.11.0.302) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.88 HuaweiBrowser/13.0.5.303 Mobile Safari/537.36"
|
|
||||||
]
|
]
|
||||||
return random.choice(ua_list)
|
return random.choice(ua_list)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user