fix: xhs帖子详情问题更新

2026-06-05 09:27:25 +08:00 · 2024-10-20 00:59:08 +08:00
parent 9fe3e47b0f
commit 03e393949a
6 changed files with 85 additions and 36 deletions
--- a/tools/crawler_util.py
+++ b/tools/crawler_util.py
@@ -18,6 +18,8 @@ import base64
 import json
 import random
 import re
+import urllib
+import urllib.parse
 from io import BytesIO
 from typing import Dict, List, Optional, Tuple

@@ -192,3 +194,12 @@ def extract_text_from_html(html: str) -> str:
    # Remove all other tags
    clean_text = re.sub(r'<[^>]+>', '', clean_html).strip()
    return clean_text
+
+def extract_url_params_to_dict(url: str) -> Dict:
+    """Parse the query string of ``url`` into a dict of parameter name -> value; a falsy url gives an empty dict."""
+    url_params_dict = dict()
+    if not url:  # None or "" has nothing to parse, so hand back the empty dict
+        return url_params_dict
+    parsed_url = urllib.parse.urlparse(url)  # only the .query component is used below
+    url_params_dict = dict(urllib.parse.parse_qsl(parsed_url.query))  # dict() keeps the last value of a repeated key
+    return url_params_dict