Merge pull request #825 from ouzhuowei/add_save_data_path

新增数据保存路径,默认不指定则保存到data文件夹下
2026-06-24 09:44:56 +08:00 · 2026-02-04 18:03:22 +08:00
parent c309871485 2a0d1fd69f
commit 4ad065ce9a
8 changed files with 59 additions and 9 deletions
--- a/cmd_arg/arg.py
+++ b/cmd_arg/arg.py
@@ -266,6 +266,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
                rich_help_panel="Performance Configuration",
            ),
        ] = config.MAX_CONCURRENCY_NUM,
+        save_data_path: Annotated[
+            str,
+            typer.Option(
+                "--save_data_path",
+                help="Data save path, default is empty and will save to data folder",
+                rich_help_panel="Storage Configuration",
+            ),
+        ] = config.SAVE_DATA_PATH,
    ) -> SimpleNamespace:
        """MediaCrawler 命令行入口"""

@@ -292,6 +300,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
        config.COOKIES = cookies
        config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
        config.MAX_CONCURRENCY_NUM = max_concurrency_num
+        config.SAVE_DATA_PATH = save_data_path

        # Set platform-specific ID lists for detail/creator mode
        if specified_id_list:
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -73,6 +73,9 @@ AUTO_CLOSE_BROWSER = True
 # 数据保存类型选项配置,支持六种类型：csv、db、json、sqlite、excel、postgres, 最好保存到DB，有排重的功能。
 SAVE_DATA_OPTION = "json"  # csv or db or json or sqlite or excel or postgres

+# 数据保存路径,默认不指定,则保存到data文件夹下
+SAVE_DATA_PATH = ""
+
 # 用户浏览器缓存的浏览器文件配置
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name

--- a/store/bilibili/bilibilli_store_media.py
+++ b/store/bilibili/bilibilli_store_media.py
@@ -28,10 +28,15 @@ import aiofiles

 from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
 from tools import utils
+import config


 class BilibiliVideo(AbstractStoreVideo):
-    video_store_path: str = "data/bili/videos"
+    def __init__(self):
+        if config.SAVE_DATA_PATH:
+            self.video_store_path = f"{config.SAVE_DATA_PATH}/bili/videos"
+        else:
+            self.video_store_path = "data/bili/videos"

    async def store_video(self, video_content_item: Dict):
        """
--- a/store/douyin/douyin_store_media.py
+++ b/store/douyin/douyin_store_media.py
@@ -24,10 +24,15 @@ import aiofiles

 from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
 from tools import utils
+import config


 class DouYinImage(AbstractStoreImage):
-    image_store_path: str = "data/douyin/images"
+    def __init__(self):
+        if config.SAVE_DATA_PATH:
+            self.image_store_path = f"{config.SAVE_DATA_PATH}/douyin/images"
+        else:
+            self.image_store_path = "data/douyin/images"

    async def store_image(self, image_content_item: Dict):
        """
@@ -74,7 +79,11 @@ class DouYinImage(AbstractStoreImage):


 class DouYinVideo(AbstractStoreVideo):
-    video_store_path: str = "data/douyin/videos"
+    def __init__(self):
+        if config.SAVE_DATA_PATH:
+            self.video_store_path = f"{config.SAVE_DATA_PATH}/douyin/videos"
+        else:
+            self.video_store_path = "data/douyin/videos"

    async def store_video(self, video_content_item: Dict):
        """
--- a/store/excel_store_base.py
+++ b/store/excel_store_base.py
@@ -46,6 +46,7 @@ except ImportError:

 from base.base_crawler import AbstractStore
 from tools import utils
+import config


 class ExcelStoreBase(AbstractStore):
@@ -111,7 +112,10 @@ class ExcelStoreBase(AbstractStore):
        self.crawler_type = crawler_type

        # Create data directory
-        self.data_dir = Path("data") / platform
+        if config.SAVE_DATA_PATH:
+            self.data_dir = Path(config.SAVE_DATA_PATH) / platform
+        else:
+            self.data_dir = Path("data") / platform
        self.data_dir.mkdir(parents=True, exist_ok=True)

        # Initialize workbook
--- a/store/weibo/weibo_store_media.py
+++ b/store/weibo/weibo_store_media.py
@@ -28,10 +28,15 @@ import aiofiles

 from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
 from tools import utils
+import config


 class WeiboStoreImage(AbstractStoreImage):
-    image_store_path: str = "data/weibo/images"
+    def __init__(self):
+        if config.SAVE_DATA_PATH:
+            self.image_store_path = f"{config.SAVE_DATA_PATH}/weibo/images"
+        else:
+            self.image_store_path = "data/weibo/images"

    async def store_image(self, image_content_item: Dict):
        """
--- a/store/xhs/xhs_store_media.py
+++ b/store/xhs/xhs_store_media.py
@@ -28,10 +28,15 @@ import aiofiles

 from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
 from tools import utils
+import config


 class XiaoHongShuImage(AbstractStoreImage):
-    image_store_path: str = "data/xhs/images"
+    def __init__(self):
+        if config.SAVE_DATA_PATH:
+            self.image_store_path = f"{config.SAVE_DATA_PATH}/xhs/images"
+        else:
+            self.image_store_path = "data/xhs/images"

    async def store_image(self, image_content_item: Dict):
        """
@@ -78,7 +83,11 @@ class XiaoHongShuImage(AbstractStoreImage):


 class XiaoHongShuVideo(AbstractStoreVideo):
-    video_store_path: str = "data/xhs/videos"
+    def __init__(self):
+        if config.SAVE_DATA_PATH:
+            self.video_store_path = f"{config.SAVE_DATA_PATH}/xhs/videos"
+        else:
+            self.video_store_path = "data/xhs/videos"

    async def store_video(self, video_content_item: Dict):
        """
--- a/tools/async_file_writer.py
+++ b/tools/async_file_writer.py
@@ -35,7 +35,10 @@ class AsyncFileWriter:
        self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None

    def _get_file_path(self, file_type: str, item_type: str) -> str:
-        base_path = f"data/{self.platform}/{file_type}"
+        if config.SAVE_DATA_PATH:
+            base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/{file_type}"
+        else:
+            base_path = f"data/{self.platform}/{file_type}"
        pathlib.Path(base_path).mkdir(parents=True, exist_ok=True)
        file_name = f"{self.crawler_type}_{item_type}_{utils.get_current_date()}.{file_type}"
        return f"{base_path}/{file_name}"
@@ -113,7 +116,10 @@ class AsyncFileWriter:
                return

            # Generate wordcloud
-            words_base_path = f"data/{self.platform}/words"
+            if config.SAVE_DATA_PATH:
+                words_base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/words"
+            else:
+                words_base_path = f"data/{self.platform}/words"
            pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
            words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"