diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py
index 266dd84..47b53ff 100644
--- a/cmd_arg/arg.py
+++ b/cmd_arg/arg.py
@@ -266,6 +266,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
                 rich_help_panel="Performance Configuration",
             ),
         ] = config.MAX_CONCURRENCY_NUM,
+        save_data_path: Annotated[
+            str,
+            typer.Option(
+                "--save_data_path",
+                help="Data save path, default is empty and will save to data folder",
+                rich_help_panel="Storage Configuration",
+            ),
+        ] = config.SAVE_DATA_PATH,
     ) -> SimpleNamespace:
         """MediaCrawler 命令行入口"""
 
@@ -292,6 +300,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
         config.COOKIES = cookies
         config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
         config.MAX_CONCURRENCY_NUM = max_concurrency_num
+        config.SAVE_DATA_PATH = save_data_path
 
         # Set platform-specific ID lists for detail/creator mode
         if specified_id_list:
diff --git a/config/base_config.py b/config/base_config.py
index 961d216..88a3a6c 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -73,6 +73,9 @@ AUTO_CLOSE_BROWSER = True
 # 数据保存类型选项配置,支持六种类型:csv、db、json、sqlite、excel、postgres, 最好保存到DB,有排重的功能。
 SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres
 
+# 数据保存路径,默认不指定,则保存到data文件夹下
+SAVE_DATA_PATH = ""
+
 # 用户浏览器缓存的浏览器文件配置
 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
 
diff --git a/tools/async_file_writer.py b/tools/async_file_writer.py
index dada0d6..a52e284 100644
--- a/tools/async_file_writer.py
+++ b/tools/async_file_writer.py
@@ -35,7 +35,10 @@ class AsyncFileWriter:
         self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None
 
     def _get_file_path(self, file_type: str, item_type: str) -> str:
-        base_path = f"data/{self.platform}/{file_type}"
+        if config.SAVE_DATA_PATH:
+            base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/{file_type}"
+        else:
+            base_path = f"data/{self.platform}/{file_type}"
         pathlib.Path(base_path).mkdir(parents=True, exist_ok=True)
         file_name = f"{self.crawler_type}_{item_type}_{utils.get_current_date()}.{file_type}"
         return f"{base_path}/{file_name}"
@@ -113,7 +116,10 @@ class AsyncFileWriter:
                 return
 
             # Generate wordcloud
-            words_base_path = f"data/{self.platform}/words"
+            if config.SAVE_DATA_PATH:
+                words_base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/words"
+            else:
+                words_base_path = f"data/{self.platform}/words"
             pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
             words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"
 