fix: 评论移除html标签内容

2026-06-06 18:07:26 +08:00 · 2024-08-07 02:39:50 +08:00
parent 026d81e131
commit 1208682a9a
2 changed files with 11 additions and 3 deletions
--- a/tools/crawler_util.py
+++ b/tools/crawler_util.py
@@ -146,4 +146,12 @@ def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optio
    httpx_proxy = {
        f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
    }
-    return playwright_proxy, httpx_proxy
+    return playwright_proxy, httpx_proxy
+
+def extract_text_from_html(html: str) -> str:
+    """Return the text of *html* with script/style elements and all other tags removed."""
+    # Drop script/style elements with their contents; HTML tag names are case-insensitive.
+    clean_html = re.sub(r'<(script|style)\b[^>]*>.*?</\1\s*>', '', html, flags=re.DOTALL | re.IGNORECASE)
+    # Remove every remaining tag (keeping the text between tags) and trim outer whitespace.
+    clean_text = re.sub(r'<[^>]+>', '', clean_html).strip()
+    return clean_text