From 7484156f0275a78f6bb11237f95be8189d9bcfca Mon Sep 17 00:00:00 2001 From: ouzhuowei <190020754@qq.com> Date: Tue, 3 Feb 2026 11:24:22 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E4=BF=9D=E5=AD=98=E8=B7=AF=E5=BE=84,=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E4=B8=8D=E6=8C=87=E5=AE=9A=E5=88=99=E4=BF=9D=E5=AD=98=E5=88=B0?= =?UTF-8?q?data=E6=96=87=E4=BB=B6=E5=A4=B9=E4=B8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: ouzhuowei <190020754@qq.com> --- cmd_arg/arg.py | 9 +++++++++ config/base_config.py | 3 +++ tools/async_file_writer.py | 10 ++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 266dd84..47b53ff 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -266,6 +266,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): rich_help_panel="Performance Configuration", ), ] = config.MAX_CONCURRENCY_NUM, + save_data_path: Annotated[ + str, + typer.Option( + "--save_data_path", + help="Data save path, default is empty and will save to data folder", + rich_help_panel="Storage Configuration", + ), + ] = config.SAVE_DATA_PATH, ) -> SimpleNamespace: """MediaCrawler 命令行入口""" @@ -292,6 +300,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): config.COOKIES = cookies config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes config.MAX_CONCURRENCY_NUM = max_concurrency_num + config.SAVE_DATA_PATH = save_data_path # Set platform-specific ID lists for detail/creator mode if specified_id_list: diff --git a/config/base_config.py b/config/base_config.py index 961d216..88a3a6c 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -73,6 +73,9 @@ AUTO_CLOSE_BROWSER = True # 数据保存类型选项配置,支持六种类型:csv、db、json、sqlite、excel、postgres, 最好保存到DB,有排重的功能。 SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres +# 数据保存路径,默认不指定,则保存到data文件夹下 +SAVE_DATA_PATH = "" + # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name diff --git a/tools/async_file_writer.py b/tools/async_file_writer.py index dada0d6..a52e284 100644 --- a/tools/async_file_writer.py +++ b/tools/async_file_writer.py @@ -35,7 +35,10 @@ class AsyncFileWriter: self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None def _get_file_path(self, file_type: str, item_type: str) -> str: - base_path = f"data/{self.platform}/{file_type}" + if config.SAVE_DATA_PATH: + base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/{file_type}" + else: + base_path = f"data/{self.platform}/{file_type}" pathlib.Path(base_path).mkdir(parents=True, exist_ok=True) file_name = f"{self.crawler_type}_{item_type}_{utils.get_current_date()}.{file_type}" return f"{base_path}/{file_name}" @@ -113,7 +116,10 @@ class AsyncFileWriter: return # Generate wordcloud - words_base_path = f"data/{self.platform}/words" + if config.SAVE_DATA_PATH: + words_base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/words" + else: + words_base_path = f"data/{self.platform}/words" pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True) words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}" From 2a0d1fd69f2b347a5be55339366f251802f80a39 Mon Sep 17 00:00:00 2001 From: ouzhuowei <190020754@qq.com> Date: Wed, 4 Feb 2026 09:48:39 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E8=A1=A5=E5=85=85=E5=90=84=E5=B9=B3?= =?UTF-8?q?=E5=8F=B0=E7=9A=84=E5=AA=92=E4=BD=93=E5=AD=98=E5=82=A8=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E8=B7=AF=E5=BE=84=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: ouzhuowei <190020754@qq.com> --- store/bilibili/bilibilli_store_media.py | 7 ++++++- store/douyin/douyin_store_media.py | 13 +++++++++++-- store/excel_store_base.py | 6 +++++- store/weibo/weibo_store_media.py | 7 ++++++- store/xhs/xhs_store_media.py | 13 +++++++++++-- 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/store/bilibili/bilibilli_store_media.py b/store/bilibili/bilibilli_store_media.py index 5019781..b12048a 100644 --- a/store/bilibili/bilibilli_store_media.py +++ b/store/bilibili/bilibilli_store_media.py @@ -28,10 +28,15 @@ import aiofiles from base.base_crawler import AbstractStoreImage, AbstractStoreVideo from tools import utils +import config class BilibiliVideo(AbstractStoreVideo): - video_store_path: str = "data/bili/videos" + def __init__(self): + if config.SAVE_DATA_PATH: + self.video_store_path = f"{config.SAVE_DATA_PATH}/bili/videos" + else: + self.video_store_path = "data/bili/videos" async def store_video(self, video_content_item: Dict): """ diff --git a/store/douyin/douyin_store_media.py b/store/douyin/douyin_store_media.py index e4fbb83..baebf33 100644 --- a/store/douyin/douyin_store_media.py +++ b/store/douyin/douyin_store_media.py @@ -24,10 +24,15 @@ import aiofiles from base.base_crawler import AbstractStoreImage, AbstractStoreVideo from tools import utils +import config class DouYinImage(AbstractStoreImage): - image_store_path: str = "data/douyin/images" + def __init__(self): + if config.SAVE_DATA_PATH: + self.image_store_path = f"{config.SAVE_DATA_PATH}/douyin/images" + else: + self.image_store_path = "data/douyin/images" async def store_image(self, image_content_item: Dict): """ @@ -74,7 +79,11 @@ class DouYinImage(AbstractStoreImage): class DouYinVideo(AbstractStoreVideo): - video_store_path: str = "data/douyin/videos" + def __init__(self): + if config.SAVE_DATA_PATH: + self.video_store_path = f"{config.SAVE_DATA_PATH}/douyin/videos" + else: + self.video_store_path = "data/douyin/videos" async def store_video(self, video_content_item: Dict): """ diff --git a/store/excel_store_base.py b/store/excel_store_base.py index 052810e..a5bc438 100644 --- a/store/excel_store_base.py +++ b/store/excel_store_base.py @@ -46,6 +46,7 @@ except ImportError: from base.base_crawler import AbstractStore from tools import utils +import config class ExcelStoreBase(AbstractStore): @@ -111,7 +112,10 @@ class ExcelStoreBase(AbstractStore): self.crawler_type = crawler_type # Create data directory - self.data_dir = Path("data") / platform + if config.SAVE_DATA_PATH: + self.data_dir = Path(config.SAVE_DATA_PATH) / platform + else: + self.data_dir = Path("data") / platform self.data_dir.mkdir(parents=True, exist_ok=True) # Initialize workbook diff --git a/store/weibo/weibo_store_media.py b/store/weibo/weibo_store_media.py index 671fb7c..c38dd5a 100644 --- a/store/weibo/weibo_store_media.py +++ b/store/weibo/weibo_store_media.py @@ -28,10 +28,15 @@ import aiofiles from base.base_crawler import AbstractStoreImage, AbstractStoreVideo from tools import utils +import config class WeiboStoreImage(AbstractStoreImage): - image_store_path: str = "data/weibo/images" + def __init__(self): + if config.SAVE_DATA_PATH: + self.image_store_path = f"{config.SAVE_DATA_PATH}/weibo/images" + else: + self.image_store_path = "data/weibo/images" async def store_image(self, image_content_item: Dict): """ diff --git a/store/xhs/xhs_store_media.py b/store/xhs/xhs_store_media.py index 631bd04..c38a812 100644 --- a/store/xhs/xhs_store_media.py +++ b/store/xhs/xhs_store_media.py @@ -28,10 +28,15 @@ import aiofiles from base.base_crawler import AbstractStoreImage, AbstractStoreVideo from tools import utils +import config class XiaoHongShuImage(AbstractStoreImage): - image_store_path: str = "data/xhs/images" + def __init__(self): + if config.SAVE_DATA_PATH: + self.image_store_path = f"{config.SAVE_DATA_PATH}/xhs/images" + else: + self.image_store_path = "data/xhs/images" async def store_image(self, image_content_item: Dict): """ @@ -78,7 +83,11 @@ class XiaoHongShuImage(AbstractStoreImage): class XiaoHongShuVideo(AbstractStoreVideo): - video_store_path: str = "data/xhs/videos" + def __init__(self): + if config.SAVE_DATA_PATH: + self.video_store_path = f"{config.SAVE_DATA_PATH}/xhs/videos" + else: + self.video_store_path = "data/xhs/videos" async def store_video(self, video_content_item: Dict): """