From 7484156f0275a78f6bb11237f95be8189d9bcfca Mon Sep 17 00:00:00 2001 From: ouzhuowei <190020754@qq.com> Date: Tue, 3 Feb 2026 11:24:22 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E6=95=B0=E6=8D=AE=E4=BF=9D?= =?UTF-8?q?=E5=AD=98=E8=B7=AF=E5=BE=84,=E9=BB=98=E8=AE=A4=E4=B8=8D?= =?UTF-8?q?=E6=8C=87=E5=AE=9A=E5=88=99=E4=BF=9D=E5=AD=98=E5=88=B0data?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=A4=B9=E4=B8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: ouzhuowei <190020754@qq.com> --- cmd_arg/arg.py | 9 +++++++++ config/base_config.py | 3 +++ tools/async_file_writer.py | 10 ++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 266dd84..47b53ff 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -266,6 +266,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): rich_help_panel="Performance Configuration", ), ] = config.MAX_CONCURRENCY_NUM, + save_data_path: Annotated[ + str, + typer.Option( + "--save_data_path", + help="Data save path, default is empty and will save to data folder", + rich_help_panel="Storage Configuration", + ), + ] = config.SAVE_DATA_PATH, ) -> SimpleNamespace: """MediaCrawler 命令行入口""" @@ -292,6 +300,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): config.COOKIES = cookies config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes config.MAX_CONCURRENCY_NUM = max_concurrency_num + config.SAVE_DATA_PATH = save_data_path # Set platform-specific ID lists for detail/creator mode if specified_id_list: diff --git a/config/base_config.py b/config/base_config.py index 961d216..88a3a6c 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -73,6 +73,9 @@ AUTO_CLOSE_BROWSER = True # 数据保存类型选项配置,支持六种类型:csv、db、json、sqlite、excel、postgres, 最好保存到DB,有排重的功能。 SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres +# 数据保存路径,默认不指定,则保存到data文件夹下 +SAVE_DATA_PATH = "" + # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name diff --git a/tools/async_file_writer.py b/tools/async_file_writer.py index dada0d6..a52e284 100644 --- a/tools/async_file_writer.py +++ b/tools/async_file_writer.py @@ -35,7 +35,10 @@ class AsyncFileWriter: self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None def _get_file_path(self, file_type: str, item_type: str) -> str: - base_path = f"data/{self.platform}/{file_type}" + if config.SAVE_DATA_PATH: + base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/{file_type}" + else: + base_path = f"data/{self.platform}/{file_type}" pathlib.Path(base_path).mkdir(parents=True, exist_ok=True) file_name = f"{self.crawler_type}_{item_type}_{utils.get_current_date()}.{file_type}" return f"{base_path}/{file_name}" @@ -113,7 +116,10 @@ class AsyncFileWriter: return # Generate wordcloud - words_base_path = f"data/{self.platform}/words" + if config.SAVE_DATA_PATH: + words_base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/words" + else: + words_base_path = f"data/{self.platform}/words" pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True) words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"