mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-06 18:07:26 +08:00
feat: 知乎支持(关键词、评论)
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
# @Desc : 爬虫相关的工具函数
|
||||
|
||||
import base64
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
from io import BytesIO
|
||||
@@ -39,6 +40,28 @@ async def find_login_qrcode(page: Page, selector: str) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
async def find_qrcode_img_from_canvas(page: Page, canvas_selector: str) -> str:
    """
    Capture the QR code rendered in a canvas element as a base64 string.

    Args:
        page: page object used to look up the canvas element
        canvas_selector: selector identifying the canvas element

    Returns:
        The screenshot of the canvas element, base64-encoded and returned
        as UTF-8 text.
    """
    # Wait for the canvas element matching the selector.
    canvas_element = await page.wait_for_selector(canvas_selector)

    # Screenshot only that element, then base64-encode the raw image bytes.
    image_bytes = await canvas_element.screenshot()
    return base64.b64encode(image_bytes).decode('utf-8')
|
||||
|
||||
|
||||
def show_qrcode(qr_code) -> None: # type: ignore
|
||||
"""parse base64 encode qrcode image and show it"""
|
||||
if "," in qr_code:
|
||||
@@ -147,8 +170,12 @@ def format_proxy_info(ip_proxy_info) -> Tuple[Optional[Dict], Optional[Dict]]:
|
||||
}
|
||||
return playwright_proxy, httpx_proxy
|
||||
|
||||
|
||||
def extract_text_from_html(html: str) -> str:
|
||||
"""Extract text from HTML, removing all tags."""
|
||||
if not html:
|
||||
return ""
|
||||
|
||||
# Remove script and style elements
|
||||
clean_html = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', html, flags=re.DOTALL)
|
||||
# Remove all other tags
|
||||
|
||||
Reference in New Issue
Block a user