mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-27 21:17:27 +08:00
Merge pull request #825 from ouzhuowei/add_save_data_path
新增数据保存路径,默认不指定则保存到data文件夹下
This commit is contained in:
@@ -266,6 +266,14 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
|||||||
rich_help_panel="Performance Configuration",
|
rich_help_panel="Performance Configuration",
|
||||||
),
|
),
|
||||||
] = config.MAX_CONCURRENCY_NUM,
|
] = config.MAX_CONCURRENCY_NUM,
|
||||||
|
save_data_path: Annotated[
|
||||||
|
str,
|
||||||
|
typer.Option(
|
||||||
|
"--save_data_path",
|
||||||
|
help="Data save path, default is empty and will save to data folder",
|
||||||
|
rich_help_panel="Storage Configuration",
|
||||||
|
),
|
||||||
|
] = config.SAVE_DATA_PATH,
|
||||||
) -> SimpleNamespace:
|
) -> SimpleNamespace:
|
||||||
"""MediaCrawler 命令行入口"""
|
"""MediaCrawler 命令行入口"""
|
||||||
|
|
||||||
@@ -292,6 +300,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
|||||||
config.COOKIES = cookies
|
config.COOKIES = cookies
|
||||||
config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
|
config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_count_singlenotes
|
||||||
config.MAX_CONCURRENCY_NUM = max_concurrency_num
|
config.MAX_CONCURRENCY_NUM = max_concurrency_num
|
||||||
|
config.SAVE_DATA_PATH = save_data_path
|
||||||
|
|
||||||
# Set platform-specific ID lists for detail/creator mode
|
# Set platform-specific ID lists for detail/creator mode
|
||||||
if specified_id_list:
|
if specified_id_list:
|
||||||
|
|||||||
@@ -73,6 +73,9 @@ AUTO_CLOSE_BROWSER = True
|
|||||||
# 数据保存类型选项配置,支持六种类型:csv、db、json、sqlite、excel、postgres, 最好保存到DB,有排重的功能。
|
# 数据保存类型选项配置,支持六种类型:csv、db、json、sqlite、excel、postgres, 最好保存到DB,有排重的功能。
|
||||||
SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres
|
SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres
|
||||||
|
|
||||||
|
# 数据保存路径,默认不指定,则保存到data文件夹下
|
||||||
|
SAVE_DATA_PATH = ""
|
||||||
|
|
||||||
# 用户浏览器缓存的浏览器文件配置
|
# 用户浏览器缓存的浏览器文件配置
|
||||||
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||||
|
|
||||||
|
|||||||
@@ -28,10 +28,15 @@ import aiofiles
|
|||||||
|
|
||||||
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
|
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
class BilibiliVideo(AbstractStoreVideo):
|
class BilibiliVideo(AbstractStoreVideo):
|
||||||
video_store_path: str = "data/bili/videos"
|
def __init__(self):
|
||||||
|
if config.SAVE_DATA_PATH:
|
||||||
|
self.video_store_path = f"{config.SAVE_DATA_PATH}/bili/videos"
|
||||||
|
else:
|
||||||
|
self.video_store_path = "data/bili/videos"
|
||||||
|
|
||||||
async def store_video(self, video_content_item: Dict):
|
async def store_video(self, video_content_item: Dict):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -24,10 +24,15 @@ import aiofiles
|
|||||||
|
|
||||||
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
|
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
class DouYinImage(AbstractStoreImage):
|
class DouYinImage(AbstractStoreImage):
|
||||||
image_store_path: str = "data/douyin/images"
|
def __init__(self):
|
||||||
|
if config.SAVE_DATA_PATH:
|
||||||
|
self.image_store_path = f"{config.SAVE_DATA_PATH}/douyin/images"
|
||||||
|
else:
|
||||||
|
self.image_store_path = "data/douyin/images"
|
||||||
|
|
||||||
async def store_image(self, image_content_item: Dict):
|
async def store_image(self, image_content_item: Dict):
|
||||||
"""
|
"""
|
||||||
@@ -74,7 +79,11 @@ class DouYinImage(AbstractStoreImage):
|
|||||||
|
|
||||||
|
|
||||||
class DouYinVideo(AbstractStoreVideo):
|
class DouYinVideo(AbstractStoreVideo):
|
||||||
video_store_path: str = "data/douyin/videos"
|
def __init__(self):
|
||||||
|
if config.SAVE_DATA_PATH:
|
||||||
|
self.video_store_path = f"{config.SAVE_DATA_PATH}/douyin/videos"
|
||||||
|
else:
|
||||||
|
self.video_store_path = "data/douyin/videos"
|
||||||
|
|
||||||
async def store_video(self, video_content_item: Dict):
|
async def store_video(self, video_content_item: Dict):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ except ImportError:
|
|||||||
|
|
||||||
from base.base_crawler import AbstractStore
|
from base.base_crawler import AbstractStore
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
class ExcelStoreBase(AbstractStore):
|
class ExcelStoreBase(AbstractStore):
|
||||||
@@ -111,7 +112,10 @@ class ExcelStoreBase(AbstractStore):
|
|||||||
self.crawler_type = crawler_type
|
self.crawler_type = crawler_type
|
||||||
|
|
||||||
# Create data directory
|
# Create data directory
|
||||||
self.data_dir = Path("data") / platform
|
if config.SAVE_DATA_PATH:
|
||||||
|
self.data_dir = Path(config.SAVE_DATA_PATH) / platform
|
||||||
|
else:
|
||||||
|
self.data_dir = Path("data") / platform
|
||||||
self.data_dir.mkdir(parents=True, exist_ok=True)
|
self.data_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Initialize workbook
|
# Initialize workbook
|
||||||
|
|||||||
@@ -28,10 +28,15 @@ import aiofiles
|
|||||||
|
|
||||||
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
|
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
class WeiboStoreImage(AbstractStoreImage):
|
class WeiboStoreImage(AbstractStoreImage):
|
||||||
image_store_path: str = "data/weibo/images"
|
def __init__(self):
|
||||||
|
if config.SAVE_DATA_PATH:
|
||||||
|
self.image_store_path = f"{config.SAVE_DATA_PATH}/weibo/images"
|
||||||
|
else:
|
||||||
|
self.image_store_path = "data/weibo/images"
|
||||||
|
|
||||||
async def store_image(self, image_content_item: Dict):
|
async def store_image(self, image_content_item: Dict):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -28,10 +28,15 @@ import aiofiles
|
|||||||
|
|
||||||
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
|
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
class XiaoHongShuImage(AbstractStoreImage):
|
class XiaoHongShuImage(AbstractStoreImage):
|
||||||
image_store_path: str = "data/xhs/images"
|
def __init__(self):
|
||||||
|
if config.SAVE_DATA_PATH:
|
||||||
|
self.image_store_path = f"{config.SAVE_DATA_PATH}/xhs/images"
|
||||||
|
else:
|
||||||
|
self.image_store_path = "data/xhs/images"
|
||||||
|
|
||||||
async def store_image(self, image_content_item: Dict):
|
async def store_image(self, image_content_item: Dict):
|
||||||
"""
|
"""
|
||||||
@@ -78,7 +83,11 @@ class XiaoHongShuImage(AbstractStoreImage):
|
|||||||
|
|
||||||
|
|
||||||
class XiaoHongShuVideo(AbstractStoreVideo):
|
class XiaoHongShuVideo(AbstractStoreVideo):
|
||||||
video_store_path: str = "data/xhs/videos"
|
def __init__(self):
|
||||||
|
if config.SAVE_DATA_PATH:
|
||||||
|
self.video_store_path = f"{config.SAVE_DATA_PATH}/xhs/videos"
|
||||||
|
else:
|
||||||
|
self.video_store_path = "data/xhs/videos"
|
||||||
|
|
||||||
async def store_video(self, video_content_item: Dict):
|
async def store_video(self, video_content_item: Dict):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -35,7 +35,10 @@ class AsyncFileWriter:
|
|||||||
self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None
|
self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None
|
||||||
|
|
||||||
def _get_file_path(self, file_type: str, item_type: str) -> str:
|
def _get_file_path(self, file_type: str, item_type: str) -> str:
|
||||||
base_path = f"data/{self.platform}/{file_type}"
|
if config.SAVE_DATA_PATH:
|
||||||
|
base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/{file_type}"
|
||||||
|
else:
|
||||||
|
base_path = f"data/{self.platform}/{file_type}"
|
||||||
pathlib.Path(base_path).mkdir(parents=True, exist_ok=True)
|
pathlib.Path(base_path).mkdir(parents=True, exist_ok=True)
|
||||||
file_name = f"{self.crawler_type}_{item_type}_{utils.get_current_date()}.{file_type}"
|
file_name = f"{self.crawler_type}_{item_type}_{utils.get_current_date()}.{file_type}"
|
||||||
return f"{base_path}/{file_name}"
|
return f"{base_path}/{file_name}"
|
||||||
@@ -113,7 +116,10 @@ class AsyncFileWriter:
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Generate wordcloud
|
# Generate wordcloud
|
||||||
words_base_path = f"data/{self.platform}/words"
|
if config.SAVE_DATA_PATH:
|
||||||
|
words_base_path = f"{config.SAVE_DATA_PATH}/{self.platform}/words"
|
||||||
|
else:
|
||||||
|
words_base_path = f"data/{self.platform}/words"
|
||||||
pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
|
pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
|
||||||
words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"
|
words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user