mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-04-21 03:07:37 +08:00
fix: 评论移除html标签内容
This commit is contained in:
@@ -146,4 +146,12 @@ def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optio
|
||||
httpx_proxy = {
|
||||
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
||||
}
|
||||
return playwright_proxy, httpx_proxy
|
||||
return playwright_proxy, httpx_proxy
|
||||
|
||||
def extract_text_from_html(html: str) -> str:
    """Extract the visible text from an HTML fragment by removing all markup.

    Removal happens in three passes:

    1. ``<script>`` / ``<style>`` elements are dropped together with their
       contents, since that content is code rather than readable text.
    2. HTML comments (``<!-- ... -->``) are dropped as a whole, so a ``>``
       inside a comment cannot leak the comment's tail into the result.
    3. Every remaining tag is removed, keeping the text between tags.

    Character entities (e.g. ``&amp;``) are left untouched.

    Args:
        html: The HTML string to clean. Empty or ``None`` input is accepted.

    Returns:
        The text with tags removed and surrounding whitespace stripped;
        an empty string when ``html`` is empty or ``None``.
    """
    if not html:
        return ""

    # Tag names are case-insensitive in HTML, and the closing tag may carry
    # trailing whitespace (``</script >``). The word boundary stops the
    # pattern from opening on unrelated tags such as ``<stylesheet>``.
    without_blocks = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>',
        '',
        html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Comments must go before generic tag stripping: ``<[^>]+>`` would stop
    # at the first ``>`` inside a comment and leave the remainder behind.
    without_comments = re.sub(r'<!--.*?-->', '', without_blocks, flags=re.DOTALL)
    # Remove all other tags
    return re.sub(r'<[^>]+>', '', without_comments).strip()
|
||||
|
||||
Reference in New Issue
Block a user