diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 266dd84..47b53ff 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -266,6 +266,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): rich_help_panel="Performance Configuration", ), ] = config.MAX_CONCURRENCY_NUM, + save_data_path: Annotated[ + str, + typer.Option( + "--save_data_path", + help="Data save path, default is empty and will save to data folder", + rich_help_panel="Storage Configuration", + ), + ] = config.SAVE_DATA_PATH, ) -> SimpleNamespace: """MediaCrawler 命令行入口""" @@ -292,6 +300,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): config.COOKIES = cookies config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes config.MAX_CONCURRENCY_NUM = max_concurrency_num + config.SAVE_DATA_PATH = save_data_path # Set platform-specific ID lists for detail/creator mode if specified_id_list: diff --git a/config/base_config.py b/config/base_config.py index 961d216..88a3a6c 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -73,6 +73,9 @@ AUTO_CLOSE_BROWSER = True # 数据保存类型选项配置,支持六种类型:csv、db、json、sqlite、excel、postgres, 最好保存到DB,有排重的功能。 SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres +# 数据保存路径,默认不指定,则保存到data文件夹下 +SAVE_DATA_PATH = "" + # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name diff --git a/store/bilibili/bilibilli_store_media.py b/store/bilibili/bilibilli_store_media.py index 5019781..b12048a 100644 --- a/store/bilibili/bilibilli_store_media.py +++ b/store/bilibili/bilibilli_store_media.py @@ -28,10 +28,15 @@ import aiofiles from base.base_crawler import AbstractStoreImage, AbstractStoreVideo from tools import utils +import config class BilibiliVideo(AbstractStoreVideo): - video_store_path: str = "data/bili/videos" + def __init__(self): + if config.SAVE_DATA_PATH: + self.video_store_path = f"{config.SAVE_DATA_PATH}/bili/videos" + else: + self.video_store_path = "data/bili/videos" async def store_video(self, video_content_item: Dict): """ diff --git a/store/douyin/douyin_store_media.py b/store/douyin/douyin_store_media.py index e4fbb83..baebf33 100644 --- a/store/douyin/douyin_store_media.py +++ b/store/douyin/douyin_store_media.py @@ -24,10 +24,15 @@ import aiofiles from base.base_crawler import AbstractStoreImage, AbstractStoreVideo from tools import utils +import config class DouYinImage(AbstractStoreImage): - image_store_path: str = "data/douyin/images" + def __init__(self): + if config.SAVE_DATA_PATH: + self.image_store_path = f"{config.SAVE_DATA_PATH}/douyin/images" + else: + self.image_store_path = "data/douyin/images" async def store_image(self, image_content_item: Dict): """ @@ -74,7 +79,11 @@ class DouYinImage(AbstractStoreImage): class DouYinVideo(AbstractStoreVideo): - video_store_path: str = "data/douyin/videos" + def __init__(self): + if config.SAVE_DATA_PATH: + self.video_store_path = f"{config.SAVE_DATA_PATH}/douyin/videos" + else: + self.video_store_path = "data/douyin/videos" async def store_video(self, video_content_item: Dict): """ diff --git a/store/excel_store_base.py b/store/excel_store_base.py index 052810e..a5bc438 100644 --- a/store/excel_store_base.py +++ b/store/excel_store_base.py @@ -46,6 +46,7 @@ except ImportError: from base.base_crawler import AbstractStore from tools import utils +import config class ExcelStoreBase(AbstractStore): @@ -111,7 +112,10 @@ class ExcelStoreBase(AbstractStore): self.crawler_type = crawler_type # Create data directory - self.data_dir = Path("data") / platform + if config.SAVE_DATA_PATH: + self.data_dir = Path(config.SAVE_DATA_PATH) / platform + else: + self.data_dir = Path("data") / platform self.data_dir.mkdir(parents=True, exist_ok=True) # Initialize workbook diff --git a/store/weibo/weibo_store_media.py b/store/weibo/weibo_store_media.py index 671fb7c..c38dd5a 100644 --- a/store/weibo/weibo_store_media.py +++ b/store/weibo/weibo_store_media.py @@ -28,10 +28,15 @@ import aiofiles from base.base_crawler import AbstractStoreImage, AbstractStoreVideo from tools import utils +import config class WeiboStoreImage(AbstractStoreImage): - image_store_path: str = "data/weibo/images" + def __init__(self): + if config.SAVE_DATA_PATH: + self.image_store_path = f"{config.SAVE_DATA_PATH}/weibo/images" + else: + self.image_store_path = "data/weibo/images" async def store_image(self, image_content_item: Dict): """ diff --git a/store/xhs/xhs_store_media.py b/store/xhs/xhs_store_media.py index 631bd04..c38a812 100644 --- a/store/xhs/xhs_store_media.py +++ b/store/xhs/xhs_store_media.py @@ -28,10 +28,15 @@ import aiofiles from base.base_crawler import AbstractStoreImage, AbstractStoreVideo from tools import utils +import config class XiaoHongShuImage(AbstractStoreImage): - image_store_path: str = "data/xhs/images" + def __init__(self): + if config.SAVE_DATA_PATH: + self.image_store_path = f"{config.SAVE_DATA_PATH}/xhs/images" + else: + self.image_store_path = "data/xhs/images" async def store_image(self, image_content_item: Dict): """ @@ -78,7 +83,11 @@ class XiaoHongShuImage(AbstractStoreImage): class XiaoHongShuVideo(AbstractStoreVideo): - video_store_path: str = "data/xhs/videos" + def __init__(self): + if config.SAVE_DATA_PATH: + self.video_store_path = f"{config.SAVE_DATA_PATH}/xhs/videos" + else: + self.video_store_path = "data/xhs/videos" async def store_video(self, video_content_item: Dict): """ diff --git a/tools/async_file_writer.py b/tools/async_file_writer.py index dada0d6..a52e284 100644 --- a/tools/async_file_writer.py +++ b/tools/async_file_writer.py @@ -35,7 +35,10 @@ class AsyncFileWriter: self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None def _get_file_path(self, file_type: str, item_type: str) -> str: - base_path = f"data/{self.platform}/{file_type}" + if config.SAVE_DATA_PATH: + base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/{file_type}" + else: + base_path = f"data/{self.platform}/{file_type}" pathlib.Path(base_path).mkdir(parents=True, exist_ok=True) file_name = f"{self.crawler_type}_{item_type}_{utils.get_current_date()}.{file_type}" return f"{base_path}/{file_name}" @@ -113,7 +116,10 @@ class AsyncFileWriter: return # Generate wordcloud - words_base_path = f"data/{self.platform}/words" + if config.SAVE_DATA_PATH: + words_base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/words" + else: + words_base_path = f"data/{self.platform}/words" pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True) words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"