feat: 知乎支持(关键词、评论)

This commit is contained in:
Relakkes
2024-09-08 00:00:04 +08:00
parent 131b5697d5
commit b7e57da0d2
20 changed files with 1776 additions and 15 deletions

View File

@@ -4,6 +4,7 @@
# @Desc : 爬虫相关的工具函数
import base64
import json
import random
import re
from io import BytesIO
@@ -39,6 +40,28 @@ async def find_login_qrcode(page: Page, selector: str) -> str:
return ""
async def find_qrcode_img_from_canvas(page: Page, canvas_selector: str) -> str:
    """
    Screenshot the canvas element that renders a QR code and return it as base64 text.

    Args:
        page: page object whose ``wait_for_selector`` is awaited to locate the canvas
            (presumably a Playwright ``Page`` -- confirm against the file's imports).
        canvas_selector: selector string handed to ``page.wait_for_selector``.

    Returns:
        The screenshot bytes of the matched element, base64-encoded and
        decoded into a UTF-8 string.
    """
    # Wait until the element matching the selector is available on the page.
    canvas_handle = await page.wait_for_selector(canvas_selector)
    # Capture only that element, not the whole page.
    image_bytes = await canvas_handle.screenshot()
    # Base64-encode the raw image bytes and hand them back as text.
    return base64.b64encode(image_bytes).decode('utf-8')
def show_qrcode(qr_code) -> None: # type: ignore
"""parse base64 encode qrcode image and show it"""
if "," in qr_code:
@@ -147,8 +170,12 @@ def format_proxy_info(ip_proxy_info) -> Tuple[Optional[Dict], Optional[Dict]]:
}
return playwright_proxy, httpx_proxy
def extract_text_from_html(html: str) -> str:
"""Extract text from HTML, removing all tags."""
if not html:
return ""
# Remove script and style elements
clean_html = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', html, flags=re.DOTALL)
# Remove all other tags