新增数据保存路径配置（--save_data_path / SAVE_DATA_PATH），未指定时默认保存到 data 文件夹下

Co-Authored-By: ouzhuowei <190020754@qq.com>
2026-05-08 03:27:36 +08:00 · 2026-02-03 11:24:22 +08:00
parent 51a7d94de8
commit 7484156f02
3 changed files with 20 additions and 2 deletions
--- a/cmd_arg/arg.py
+++ b/cmd_arg/arg.py
@@ -266,6 +266,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
                rich_help_panel="Performance Configuration",
            ),
        ] = config.MAX_CONCURRENCY_NUM,
+        save_data_path: Annotated[
+            str,
+            typer.Option(
+                "--save_data_path",
+                help="Data save path, default is empty and will save to data folder",
+                rich_help_panel="Storage Configuration",
+            ),
+        ] = config.SAVE_DATA_PATH,
    ) -> SimpleNamespace:
        """MediaCrawler 命令行入口"""

@@ -292,6 +300,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
        config.COOKIES = cookies
        config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
        config.MAX_CONCURRENCY_NUM = max_concurrency_num
+        config.SAVE_DATA_PATH = save_data_path

        # Set platform-specific ID lists for detail/creator mode
        if specified_id_list:
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -73,6 +73,9 @@ AUTO_CLOSE_BROWSER = True
 # 数据保存类型选项配置,支持六种类型：csv、db、json、sqlite、excel、postgres, 最好保存到DB，有排重的功能。
 SAVE_DATA_OPTION = "json"  # csv or db or json or sqlite or excel or postgres

+# 数据保存路径,默认不指定,则保存到data文件夹下
+SAVE_DATA_PATH = ""
+
 # 用户浏览器缓存的浏览器文件配置
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name

--- a/tools/async_file_writer.py
+++ b/tools/async_file_writer.py
@@ -35,7 +35,10 @@ class AsyncFileWriter:
        self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None

    def _get_file_path(self, file_type: str, item_type: str) -> str:
-        base_path = f"data/{self.platform}/{file_type}"
+        if config.SAVE_DATA_PATH:
+            base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/{file_type}"
+        else:
+            base_path = f"data/{self.platform}/{file_type}"
        pathlib.Path(base_path).mkdir(parents=True, exist_ok=True)
        file_name = f"{self.crawler_type}_{item_type}_{utils.get_current_date()}.{file_type}"
        return f"{base_path}/{file_name}"
@@ -113,7 +116,10 @@ class AsyncFileWriter:
                return

            # Generate wordcloud
-            words_base_path = f"data/{self.platform}/words"
+            if config.SAVE_DATA_PATH:
+                words_base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/words"
+            else:
+                words_base_path = f"data/{self.platform}/words"
            pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
            words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"