diff --git a/cache/redis_cache.py b/cache/redis_cache.py
index 1ea4c0b..091c57d 100644
--- a/cache/redis_cache.py
+++ b/cache/redis_cache.py
@@ -28,6 +28,7 @@ import time
 from typing import Any, List
 
 from redis import Redis
+from redis.exceptions import ResponseError
 
 from cache.abs_cache import AbstractCache
 from config import db_config
@@ -76,8 +77,25 @@ class RedisCache(AbstractCache):
     def keys(self, pattern: str) -> List[str]:
         """
         Get all keys matching the pattern
+        First try KEYS command, if not supported fallback to SCAN
         """
-        return [key.decode() for key in self._redis_client.keys(pattern)]
+        try:
+            # Try KEYS command first (faster for standard Redis)
+            return [key.decode() if isinstance(key, bytes) else key for key in self._redis_client.keys(pattern)]
+        except ResponseError as e:
+            # If KEYS is not supported (e.g., Redis Cluster or cloud Redis), use SCAN
+            if "unknown command" in str(e).lower() or "keys" in str(e).lower():
+                keys_list: List[str] = []
+                cursor = 0
+                while True:
+                    cursor, keys = self._redis_client.scan(cursor=cursor, match=pattern, count=100)
+                    keys_list.extend([key.decode() if isinstance(key, bytes) else key for key in keys])
+                    if cursor == 0:
+                        break
+                return keys_list
+            else:
+                # Re-raise if it's a different error
+                raise
 
 
 if __name__ == '__main__':
diff --git a/tools/crawler_util.py b/tools/crawler_util.py
index b010993..7cab0e0 100644
--- a/tools/crawler_util.py
+++ b/tools/crawler_util.py
@@ -180,11 +180,19 @@ def format_proxy_info(ip_proxy_info) -> Tuple[Optional[Dict], Optional[str]]:
     from proxy.proxy_ip_pool import IpInfoModel
     ip_proxy_info = cast(IpInfoModel, ip_proxy_info)
 
+    # Playwright proxy server should be in format "host:port" without protocol prefix
+    # so ip_proxy_info.protocol is deliberately left out of the server string
+    server = f"{ip_proxy_info.ip}:{ip_proxy_info.port}"
+
     playwright_proxy = {
-        "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
-        "username": ip_proxy_info.user,
-        "password": ip_proxy_info.password,
+        "server": server,
     }
+
+    # Only add username and password if they are not empty
+    if ip_proxy_info.user and ip_proxy_info.password:
+        playwright_proxy["username"] = ip_proxy_info.user
+        playwright_proxy["password"] = ip_proxy_info.password
+
     # httpx 0.28.1 requires passing proxy URL string directly, not a dictionary
     if ip_proxy_info.user and ip_proxy_info.password:
         httpx_proxy = f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"