From 90280a261af2e9a1a7a4612f59c664637350b0ed Mon Sep 17 00:00:00 2001 From: ouzhuowei <190020754@qq.com> Date: Fri, 6 Feb 2026 09:58:37 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E8=A1=A5=E5=85=85=E4=BB=A3=E7=90=86?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E7=9A=84arp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: ouzhuowei <190020754@qq.com> --- cmd_arg/arg.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 47b53ff..e27d45a 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -274,12 +274,38 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): rich_help_panel="Storage Configuration", ), ] = config.SAVE_DATA_PATH, + enable_ip_proxy: Annotated[ + str, + typer.Option( + "--enable_ip_proxy", + help="Whether to enable IP proxy, supports yes/true/t/y/1 or no/false/f/n/0", + rich_help_panel="Proxy Configuration", + show_default=True, + ), + ] = str(config.ENABLE_IP_PROXY), + ip_proxy_pool_count: Annotated[ + int, + typer.Option( + "--ip_proxy_pool_count", + help="IP proxy pool count", + rich_help_panel="Proxy Configuration", + ), + ] = config.IP_PROXY_POOL_COUNT, + ip_proxy_provider_name: Annotated[ + str, + typer.Option( + "--ip_proxy_provider_name", + help="IP proxy provider name (kuaidaili | wandouhttp)", + rich_help_panel="Proxy Configuration", + ), + ] = config.IP_PROXY_PROVIDER_NAME, ) -> SimpleNamespace: """MediaCrawler 命令行入口""" enable_comment = _to_bool(get_comment) enable_sub_comment = _to_bool(get_sub_comment) enable_headless = _to_bool(headless) + enable_ip_proxy_value = _to_bool(enable_ip_proxy) init_db_value = init_db.value if init_db else None # Parse specified_id and creator_id into lists @@ -301,6 +327,9 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes config.MAX_CONCURRENCY_NUM = max_concurrency_num config.SAVE_DATA_PATH = save_data_path + config.ENABLE_IP_PROXY = enable_ip_proxy_value + config.IP_PROXY_POOL_COUNT = ip_proxy_pool_count + config.IP_PROXY_PROVIDER_NAME = ip_proxy_provider_name # Set platform-specific ID lists for detail/creator mode if specified_id_list: From 30cf16af0cde8babf71f579fd904533aaed5dca9 Mon Sep 17 00:00:00 2001 From: ouzhuowei <190020754@qq.com> Date: Fri, 6 Feb 2026 12:33:35 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E5=AD=98=E5=82=A8=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: ouzhuowei <190020754@qq.com> --- cmd_arg/arg.py | 39 +++++++++++++++++++++++++++++++++++ config/base_config.py | 8 ++++++++ tools/utils.py | 47 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 92 insertions(+), 2 deletions(-) diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index e27d45a..1840a50 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -84,6 +84,16 @@ class InitDbOptionEnum(str, Enum): POSTGRES = "postgres" +class LogLevelEnum(str, Enum): + """Log level enumeration""" + + DEBUG = "DEBUG" + INFO = "INFO" + WARNING = "WARNING" + ERROR = "ERROR" + CRITICAL = "CRITICAL" + + def _to_bool(value: bool | str) -> bool: if isinstance(value, bool): return value @@ -299,6 +309,31 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): rich_help_panel="Proxy Configuration", ), ] = config.IP_PROXY_PROVIDER_NAME, + log_save_enable: Annotated[ + str, + typer.Option( + "--log_save_enable", + help="Whether to save logs to file, supports yes/true/t/y/1 or no/false/f/n/0", + rich_help_panel="Log Configuration", + show_default=True, + ), + ] = str(config.LOG_SAVE_ENABLE), + log_save_path: Annotated[ + str, + typer.Option( + "--log_save_path", + help="Log file save path, default is ./logs", + rich_help_panel="Log Configuration", + ), + ] = config.LOG_SAVE_PATH, + log_save_level: Annotated[ + LogLevelEnum, + typer.Option( + "--log_save_level", + help="Log save level (DEBUG | INFO | WARNING | ERROR | CRITICAL)", + rich_help_panel="Log Configuration", + ), + ] = _coerce_enum(LogLevelEnum, config.LOG_SAVE_LEVEL, LogLevelEnum.INFO), ) -> SimpleNamespace: """MediaCrawler 命令行入口""" @@ -306,6 +341,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): enable_sub_comment = _to_bool(get_sub_comment) enable_headless = _to_bool(headless) enable_ip_proxy_value = _to_bool(enable_ip_proxy) + enable_log_save = _to_bool(log_save_enable) init_db_value = init_db.value if init_db else None # Parse specified_id and creator_id into lists @@ -330,6 +366,9 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): config.ENABLE_IP_PROXY = enable_ip_proxy_value config.IP_PROXY_POOL_COUNT = ip_proxy_pool_count config.IP_PROXY_PROVIDER_NAME = ip_proxy_provider_name + config.LOG_SAVE_ENABLE = enable_log_save + config.LOG_SAVE_PATH = log_save_path + config.LOG_SAVE_LEVEL = log_save_level.value # Set platform-specific ID lists for detail/creator mode if specified_id_list: diff --git a/config/base_config.py b/config/base_config.py index 88a3a6c..ffafe99 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -120,6 +120,14 @@ FONT_PATH = "./docs/STZHONGS.TTF" # 爬取间隔时间 CRAWLER_MAX_SLEEP_SEC = 2 +# 日志配置 +# 是否保存日志 +LOG_SAVE_ENABLE = False +# 日志保存路径 +LOG_SAVE_PATH = "./logs" +# 日志保存级别 +LOG_SAVE_LEVEL = "INFO" + from .bilibili_config import * from .xhs_config import * from .dy_config import * diff --git a/tools/utils.py b/tools/utils.py index fcee910..14a2089 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -20,6 +20,9 @@ import argparse import logging +import os +from logging.handlers import RotatingFileHandler +from datetime import datetime from .crawler_util import * from .slider_util import * @@ -27,15 +30,55 @@ from .time_util import * def init_loging_config(): + # 导入配置 + try: + from config.base_config import LOG_SAVE_ENABLE, LOG_SAVE_PATH, LOG_SAVE_LEVEL + except ImportError: + LOG_SAVE_ENABLE = False + LOG_SAVE_PATH = "./logs" + LOG_SAVE_LEVEL = "INFO" + level = logging.INFO + log_format = "%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s" + date_format = '%Y-%m-%d %H:%M:%S' + + # 配置基础日志 logging.basicConfig( level=level, - format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s", - datefmt='%Y-%m-%d %H:%M:%S' + format=log_format, + datefmt=date_format ) _logger = logging.getLogger("MediaCrawler") _logger.setLevel(level) + # 如果启用日志保存,添加文件处理器 + if LOG_SAVE_ENABLE and LOG_SAVE_PATH: + try: + # 确保日志目录存在 + log_dir = os.path.abspath(LOG_SAVE_PATH) + os.makedirs(log_dir, exist_ok=True) + + # 日志文件名:按日期命名 + log_filename = os.path.join(log_dir, f"mediacrawler-{datetime.now().strftime('%Y-%m-%d')}.log") + + # 转换日志级别字符串为logging级别 + file_level = getattr(logging, LOG_SAVE_LEVEL.upper(), logging.INFO) + + # 创建文件处理器 + file_handler = RotatingFileHandler( + log_filename, + encoding='utf-8' + ) + file_handler.setLevel(file_level) + file_handler.setFormatter(logging.Formatter(log_format, date_format)) + + # 添加到logger + _logger.addHandler(file_handler) + + except Exception as e: + # 如果文件日志配置失败,不影响控制台日志 + _logger.warning(f"日志文件保存配置失败: {e}") + # Disable httpx INFO level logs logging.getLogger("httpx").setLevel(logging.WARNING) From 212276bc30e40b07f258c835069c96495b52dd0a Mon Sep 17 00:00:00 2001 From: ouzhuowei <190020754@qq.com> Date: Tue, 10 Feb 2026 15:03:40 +0800 Subject: [PATCH 3/3] =?UTF-8?q?Revert=20"=E6=96=B0=E5=A2=9E=E6=97=A5?= =?UTF-8?q?=E5=BF=97=E5=AD=98=E5=82=A8=E9=80=BB=E8=BE=91"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 30cf16af0cde8babf71f579fd904533aaed5dca9. Co-Authored-By: ouzhuowei <190020754@qq.com> --- cmd_arg/arg.py | 39 ----------------------------------- config/base_config.py | 8 -------- tools/utils.py | 47 ++----------------------------------------- 3 files changed, 2 insertions(+), 92 deletions(-) diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 1840a50..e27d45a 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -84,16 +84,6 @@ class InitDbOptionEnum(str, Enum): POSTGRES = "postgres" -class LogLevelEnum(str, Enum): - """Log level enumeration""" - - DEBUG = "DEBUG" - INFO = "INFO" - WARNING = "WARNING" - ERROR = "ERROR" - CRITICAL = "CRITICAL" - - def _to_bool(value: bool | str) -> bool: if isinstance(value, bool): return value @@ -309,31 +299,6 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): rich_help_panel="Proxy Configuration", ), ] = config.IP_PROXY_PROVIDER_NAME, - log_save_enable: Annotated[ - str, - typer.Option( - "--log_save_enable", - help="Whether to save logs to file, supports yes/true/t/y/1 or no/false/f/n/0", - rich_help_panel="Log Configuration", - show_default=True, - ), - ] = str(config.LOG_SAVE_ENABLE), - log_save_path: Annotated[ - str, - typer.Option( - "--log_save_path", - help="Log file save path, default is ./logs", - rich_help_panel="Log Configuration", - ), - ] = config.LOG_SAVE_PATH, - log_save_level: Annotated[ - LogLevelEnum, - typer.Option( - "--log_save_level", - help="Log save level (DEBUG | INFO | WARNING | ERROR | CRITICAL)", - rich_help_panel="Log Configuration", - ), - ] = _coerce_enum(LogLevelEnum, config.LOG_SAVE_LEVEL, LogLevelEnum.INFO), ) -> SimpleNamespace: """MediaCrawler 命令行入口""" @@ -341,7 +306,6 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): enable_sub_comment = _to_bool(get_sub_comment) enable_headless = _to_bool(headless) enable_ip_proxy_value = _to_bool(enable_ip_proxy) - enable_log_save = _to_bool(log_save_enable) init_db_value = init_db.value if init_db else None # Parse specified_id and creator_id into lists @@ -366,9 +330,6 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): config.ENABLE_IP_PROXY = enable_ip_proxy_value config.IP_PROXY_POOL_COUNT = ip_proxy_pool_count config.IP_PROXY_PROVIDER_NAME = ip_proxy_provider_name - config.LOG_SAVE_ENABLE = enable_log_save - config.LOG_SAVE_PATH = log_save_path - config.LOG_SAVE_LEVEL = log_save_level.value # Set platform-specific ID lists for detail/creator mode if specified_id_list: diff --git a/config/base_config.py b/config/base_config.py index ffafe99..88a3a6c 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -120,14 +120,6 @@ FONT_PATH = "./docs/STZHONGS.TTF" # 爬取间隔时间 CRAWLER_MAX_SLEEP_SEC = 2 -# 日志配置 -# 是否保存日志 -LOG_SAVE_ENABLE = False -# 日志保存路径 -LOG_SAVE_PATH = "./logs" -# 日志保存级别 -LOG_SAVE_LEVEL = "INFO" - from .bilibili_config import * from .xhs_config import * from .dy_config import * diff --git a/tools/utils.py b/tools/utils.py index 14a2089..fcee910 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -20,9 +20,6 @@ import argparse import logging -import os -from logging.handlers import RotatingFileHandler -from datetime import datetime from .crawler_util import * from .slider_util import * @@ -30,55 +27,15 @@ from .time_util import * def init_loging_config(): - # 导入配置 - try: - from config.base_config import LOG_SAVE_ENABLE, LOG_SAVE_PATH, LOG_SAVE_LEVEL - except ImportError: - LOG_SAVE_ENABLE = False - LOG_SAVE_PATH = "./logs" - LOG_SAVE_LEVEL = "INFO" - level = logging.INFO - log_format = "%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s" - date_format = '%Y-%m-%d %H:%M:%S' - - # 配置基础日志 logging.basicConfig( level=level, - format=log_format, - datefmt=date_format + format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s", + datefmt='%Y-%m-%d %H:%M:%S' ) _logger = logging.getLogger("MediaCrawler") _logger.setLevel(level) - # 如果启用日志保存,添加文件处理器 - if LOG_SAVE_ENABLE and LOG_SAVE_PATH: - try: - # 确保日志目录存在 - log_dir = os.path.abspath(LOG_SAVE_PATH) - os.makedirs(log_dir, exist_ok=True) - - # 日志文件名:按日期命名 - log_filename = os.path.join(log_dir, f"mediacrawler-{datetime.now().strftime('%Y-%m-%d')}.log") - - # 转换日志级别字符串为logging级别 - file_level = getattr(logging, LOG_SAVE_LEVEL.upper(), logging.INFO) - - # 创建文件处理器 - file_handler = RotatingFileHandler( - log_filename, - encoding='utf-8' - ) - file_handler.setLevel(file_level) - file_handler.setFormatter(logging.Formatter(log_format, date_format)) - - # 添加到logger - _logger.addHandler(file_handler) - - except Exception as e: - # 如果文件日志配置失败,不影响控制台日志 - _logger.warning(f"日志文件保存配置失败: {e}") - # Disable httpx INFO level logs logging.getLogger("httpx").setLevel(logging.WARNING)