添加了抖音存储视频以及图片的逻辑，并将config.py中ENABLE_GET_IMAGES参数更名为ENABLE_GET_MEIDAS，在此基础上略微修改存储逻辑

2026-06-09 03:17:25 +08:00 · 2025-07-30 18:24:08 +08:00
parent 417c39de69
commit 173bc08a9d
12 changed files with 631 additions and 716 deletions
--- a/store/bilibili/bilibilli_store_video.py
+++ b/store/bilibili/bilibilli_store_video.py
@@ -1,13 +1,12 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
-# 1. 不得用于任何商业用途。  
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
-# 3. 不得进行大规模爬取或对平台造成运营干扰。  
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
-#   
-# 详细许可条款请参阅项目根目录下的LICENSE文件。  
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
-
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。

 # -*- coding: utf-8 -*-
 # @Author  : helloteemo
@@ -18,11 +17,11 @@ from typing import Dict

 import aiofiles

-from base.base_crawler import AbstractStoreImage
+from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
 from tools import utils


-class BilibiliVideo(AbstractStoreImage):
+class BilibiliVideo(AbstractStoreVideo):
    video_store_path: str = "data/bilibili/videos"

    async def store_video(self, video_content_item: Dict):
@@ -34,8 +33,7 @@ class BilibiliVideo(AbstractStoreImage):
        Returns:

        """
-        await self.save_video(video_content_item.get("aid"), video_content_item.get("video_content"),
-                              video_content_item.get("extension_file_name"))
+        await self.save_video(video_content_item.get("aid"), video_content_item.get("video_content"), video_content_item.get("extension_file_name"))

    def make_save_file_name(self, aid: str, extension_file_name: str) -> str:
        """
--- a/store/douyin/init.py
+++ b/store/douyin/init.py
@@ -18,6 +18,7 @@ import config
 from var import source_keyword_var

 from .douyin_store_impl import *
+from .douyin_store_media import *


 class DouyinStoreFactory:
@@ -233,3 +234,33 @@ async def save_creator(user_id: str, creator: Dict):
    }
    utils.logger.info(f"[store.douyin.save_creator] creator:{local_db_item}")
    await DouyinStoreFactory.create_store().store_creator(local_db_item)
+
+
+async def update_dy_aweme_image(aweme_id, pic_content, extension_file_name):
+    """
+    更新抖音笔记图片
+    Args:
+        aweme_id:
+        pic_content:
+        extension_file_name:
+
+    Returns:
+
+    """
+
+    await DouYinImage().store_image({"aweme_id": aweme_id, "pic_content": pic_content, "extension_file_name": extension_file_name})
+
+
+async def update_dy_aweme_video(aweme_id, video_content, extension_file_name):
+    """
+    更新抖音短视频
+    Args:
+        aweme_id:
+        video_content:
+        extension_file_name:
+
+    Returns:
+
+    """
+
+    await DouYinVideo().store_video({"aweme_id": aweme_id, "video_content": video_content, "extension_file_name": extension_file_name})
--- a/store/douyin/douyin_store_media.py
+++ b/store/douyin/douyin_store_media.py
@@ -0,0 +1,103 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+import pathlib
+from typing import Dict
+
+import aiofiles
+
+from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
+from tools import utils
+
+
+class DouYinImage(AbstractStoreImage):
+    image_store_path: str = "data/douyin/images"
+
+    async def store_image(self, image_content_item: Dict):
+        """
+        store content
+        Args:
+            image_content_item: dict holding "aweme_id", "pic_content" and "extension_file_name"
+
+        Returns:
+
+        """
+        await self.save_image(image_content_item.get("aweme_id"), image_content_item.get("pic_content"), image_content_item.get("extension_file_name"))
+
+    def make_save_file_name(self, aweme_id: str, extension_file_name: str) -> str:
+        """
+        make save file name by store type
+        Args:
+            aweme_id: aweme id
+            extension_file_name: file name appended under the aweme_id directory
+
+        Returns:
+
+        """
+        return f"{self.image_store_path}/{aweme_id}/{extension_file_name}"
+
+    async def save_image(self, aweme_id: str, pic_content: str, extension_file_name):
+        """
+        save image to local
+        Args:
+            aweme_id: aweme id
+            pic_content: image content
+
+        Returns:
+
+        """
+        pathlib.Path(self.image_store_path + "/" + aweme_id).mkdir(parents=True, exist_ok=True)
+        save_file_name = self.make_save_file_name(aweme_id, extension_file_name)
+        async with aiofiles.open(save_file_name, 'wb') as f:
+            await f.write(pic_content)
+            utils.logger.info(f"[DouYinImageStoreImplement.save_image] save image {save_file_name} success ...")
+
+
+class DouYinVideo(AbstractStoreVideo):
+    video_store_path: str = "data/douyin/videos"
+
+    async def store_video(self, video_content_item: Dict):
+        """
+        store content
+        Args:
+            video_content_item: dict holding "aweme_id", "video_content" and "extension_file_name"
+
+        Returns:
+
+        """
+        await self.save_video(video_content_item.get("aweme_id"), video_content_item.get("video_content"), video_content_item.get("extension_file_name"))
+
+    def make_save_file_name(self, aweme_id: str, extension_file_name: str) -> str:
+        """
+        make save file name by store type
+        Args:
+            aweme_id: aweme id
+            extension_file_name: file name appended under the aweme_id directory
+
+        Returns:
+
+        """
+        return f"{self.video_store_path}/{aweme_id}/{extension_file_name}"
+
+    async def save_video(self, aweme_id: str, video_content: str, extension_file_name):
+        """
+        save video to local
+        Args:
+            aweme_id: aweme id
+            video_content: video content
+
+        Returns:
+
+        """
+        pathlib.Path(self.video_store_path + "/" + aweme_id).mkdir(parents=True, exist_ok=True)
+        save_file_name = self.make_save_file_name(aweme_id, extension_file_name)
+        async with aiofiles.open(save_file_name, 'wb') as f:
+            await f.write(video_content)
+            utils.logger.info(f"[DouYinVideoStoreImplement.save_video] save video {save_file_name} success ...")
--- a/store/xhs/init.py
+++ b/store/xhs/init.py
@@ -1,13 +1,12 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
-# 1. 不得用于任何商业用途。  
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
-# 3. 不得进行大规模爬取或对平台造成运营干扰。  
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
-#   
-# 详细许可条款请参阅项目根目录下的LICENSE文件。  
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
-
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。

 # -*- coding: utf-8 -*-
 # @Author  : relakkes@gmail.com
@@ -28,7 +27,7 @@ class XhsStoreFactory:
        "csv": XhsCsvStoreImplement,
        "db": XhsDbStoreImplement,
        "json": XhsJsonStoreImplement,
-        "sqlite": XhsSqliteStoreImplement
+        "sqlite": XhsSqliteStoreImplement,
    }

    @staticmethod
@@ -88,27 +87,27 @@ async def update_xhs_note(note_item: Dict):
    video_url = ','.join(get_video_url_arr(note_item))

    local_db_item = {
-        "note_id": note_item.get("note_id"), # 帖子id
-        "type": note_item.get("type"), # 帖子类型
-        "title": note_item.get("title") or note_item.get("desc", "")[:255], # 帖子标题
-        "desc": note_item.get("desc", ""), # 帖子描述
-        "video_url": video_url, # 帖子视频url
-        "time": note_item.get("time"), # 帖子发布时间
-        "last_update_time": note_item.get("last_update_time", 0), # 帖子最后更新时间
-        "user_id": user_info.get("user_id"), # 用户id
-        "nickname": user_info.get("nickname"), # 用户昵称
-        "avatar": user_info.get("avatar"), # 用户头像
-        "liked_count": interact_info.get("liked_count"), # 点赞数
-        "collected_count": interact_info.get("collected_count"), # 收藏数
-        "comment_count": interact_info.get("comment_count"), # 评论数
-        "share_count": interact_info.get("share_count"), # 分享数
-        "ip_location": note_item.get("ip_location", ""), # ip地址
-        "image_list": ','.join([img.get('url', '') for img in image_list]), # 图片url
-        "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']), # 标签
-        "last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳（MediaCrawler程序生成的，主要用途在db存储的时候记录一条记录最新更新时间）
-        "note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search", # 帖子url
-        "source_keyword": source_keyword_var.get(), # 搜索关键词
-        "xsec_token": note_item.get("xsec_token"), # xsec_token
+        "note_id": note_item.get("note_id"),  # 帖子id
+        "type": note_item.get("type"),  # 帖子类型
+        "title": note_item.get("title") or note_item.get("desc", "")[:255],  # 帖子标题
+        "desc": note_item.get("desc", ""),  # 帖子描述
+        "video_url": video_url,  # 帖子视频url
+        "time": note_item.get("time"),  # 帖子发布时间
+        "last_update_time": note_item.get("last_update_time", 0),  # 帖子最后更新时间
+        "user_id": user_info.get("user_id"),  # 用户id
+        "nickname": user_info.get("nickname"),  # 用户昵称
+        "avatar": user_info.get("avatar"),  # 用户头像
+        "liked_count": interact_info.get("liked_count"),  # 点赞数
+        "collected_count": interact_info.get("collected_count"),  # 收藏数
+        "comment_count": interact_info.get("comment_count"),  # 评论数
+        "share_count": interact_info.get("share_count"),  # 分享数
+        "ip_location": note_item.get("ip_location", ""),  # ip地址
+        "image_list": ','.join([img.get('url', '') for img in image_list]),  # 图片url
+        "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),  # 标签
+        "last_modify_ts": utils.get_current_timestamp(),  # 最后更新时间戳（MediaCrawler程序生成的，主要用途在db存储的时候记录一条记录最新更新时间）
+        "note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search",  # 帖子url
+        "source_keyword": source_keyword_var.get(),  # 搜索关键词
+        "xsec_token": note_item.get("xsec_token"),  # xsec_token
    }
    utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
    await XhsStoreFactory.create_store().store_content(local_db_item)
@@ -145,18 +144,18 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
    comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])]
    target_comment = comment_item.get("target_comment", {})
    local_db_item = {
-        "comment_id": comment_id, # 评论id
-        "create_time": comment_item.get("create_time"), # 评论时间
-        "ip_location": comment_item.get("ip_location"), # ip地址
-        "note_id": note_id, # 帖子id
-        "content": comment_item.get("content"), # 评论内容
-        "user_id": user_info.get("user_id"), # 用户id
-        "nickname": user_info.get("nickname"), # 用户昵称
-        "avatar": user_info.get("image"), # 用户头像
-        "sub_comment_count": comment_item.get("sub_comment_count", 0), # 子评论数
-        "pictures": ",".join(comment_pictures), # 评论图片
-        "parent_comment_id": target_comment.get("id", 0), # 父评论id
-        "last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳（MediaCrawler程序生成的，主要用途在db存储的时候记录一条记录最新更新时间）
+        "comment_id": comment_id,  # 评论id
+        "create_time": comment_item.get("create_time"),  # 评论时间
+        "ip_location": comment_item.get("ip_location"),  # ip地址
+        "note_id": note_id,  # 帖子id
+        "content": comment_item.get("content"),  # 评论内容
+        "user_id": user_info.get("user_id"),  # 用户id
+        "nickname": user_info.get("nickname"),  # 用户昵称
+        "avatar": user_info.get("image"),  # 用户头像
+        "sub_comment_count": comment_item.get("sub_comment_count", 0),  # 子评论数
+        "pictures": ",".join(comment_pictures),  # 评论图片
+        "parent_comment_id": target_comment.get("id", 0),  # 父评论id
+        "last_modify_ts": utils.get_current_timestamp(),  # 最后更新时间戳（MediaCrawler程序生成的，主要用途在db存储的时候记录一条记录最新更新时间）
        "like_count": comment_item.get("like_count", 0),
    }
    utils.logger.info(f"[store.xhs.update_xhs_note_comment] xhs note comment:{local_db_item}")
@@ -197,16 +196,16 @@ async def save_creator(user_id: str, creator: Dict):
    local_db_item = {
        'user_id': user_id,  # 用户id
        'nickname': user_info.get('nickname'),  # 昵称
-        'gender':  get_gender(user_info.get('gender')), # 性别
-        'avatar': user_info.get('images'), # 头像
-        'desc': user_info.get('desc'), # 个人描述
-        'ip_location': user_info.get('ipLocation'), # ip地址
-        'follows': follows, # 关注数
+        'gender': get_gender(user_info.get('gender')),  # 性别
+        'avatar': user_info.get('images'),  # 头像
+        'desc': user_info.get('desc'),  # 个人描述
+        'ip_location': user_info.get('ipLocation'),  # ip地址
+        'follows': follows,  # 关注数
        'fans': fans,  # 粉丝数
-        'interaction': interaction, # 互动数
-        'tag_list': json.dumps({tag.get('tagType'): tag.get('name') for tag in creator.get('tags')},
-                               ensure_ascii=False), # 标签
-        "last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳（MediaCrawler程序生成的，主要用途在db存储的时候记录一条记录最新更新时间）
+        'interaction': interaction,  # 互动数
+        'tag_list': json.dumps({tag.get('tagType'): tag.get('name')
+                                for tag in creator.get('tags')}, ensure_ascii=False),  # 标签
+        "last_modify_ts": utils.get_current_timestamp(),  # 最后更新时间戳（MediaCrawler程序生成的，主要用途在db存储的时候记录一条记录最新更新时间）
    }
    utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
    await XhsStoreFactory.create_store().store_creator(local_db_item)
@@ -214,7 +213,7 @@ async def save_creator(user_id: str, creator: Dict):

 async def update_xhs_note_image(note_id, pic_content, extension_file_name):
    """
-    更新小红书笔
+    更新小红书笔记图片
    Args:
        note_id:
        pic_content:
@@ -224,5 +223,19 @@ async def update_xhs_note_image(note_id, pic_content, extension_file_name):

    """

-    await XiaoHongShuImage().store_image(
-        {"notice_id": note_id, "pic_content": pic_content, "extension_file_name": extension_file_name})
+    await XiaoHongShuImage().store_image({"notice_id": note_id, "pic_content": pic_content, "extension_file_name": extension_file_name})
+
+
+async def update_xhs_note_video(note_id, video_content, extension_file_name):
+    """
+    更新小红书笔记视频
+    Args:
+        note_id:
+        video_content:
+        extension_file_name:
+
+    Returns:
+
+    """
+
+    await XiaoHongShuVideo().store_video({"notice_id": note_id, "video_content": video_content, "extension_file_name": extension_file_name})
--- a/store/xhs/xhs_store_image.py
+++ b/store/xhs/xhs_store_image.py
@@ -1,13 +1,12 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
-# 1. 不得用于任何商业用途。  
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
-# 3. 不得进行大规模爬取或对平台造成运营干扰。  
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
-#   
-# 详细许可条款请参阅项目根目录下的LICENSE文件。  
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
-
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。

 # -*- coding: utf-8 -*-
 # @Author  : helloteemo
@@ -18,7 +17,7 @@ from typing import Dict

 import aiofiles

-from base.base_crawler import AbstractStoreImage
+from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
 from tools import utils


@@ -34,8 +33,7 @@ class XiaoHongShuImage(AbstractStoreImage):
        Returns:

        """
-        await self.save_image(image_content_item.get("notice_id"), image_content_item.get("pic_content"),
-                              image_content_item.get("extension_file_name"))
+        await self.save_image(image_content_item.get("notice_id"), image_content_item.get("pic_content"), image_content_item.get("extension_file_name"))

    def make_save_file_name(self, notice_id: str, extension_file_name: str) -> str:
        """
@@ -49,7 +47,7 @@ class XiaoHongShuImage(AbstractStoreImage):
        """
        return f"{self.image_store_path}/{notice_id}/{extension_file_name}"

-    async def save_image(self, notice_id: str, pic_content: str, extension_file_name="jpg"):
+    async def save_image(self, notice_id: str, pic_content: str, extension_file_name):
        """
        save image to local
        Args:
@@ -64,3 +62,45 @@ class XiaoHongShuImage(AbstractStoreImage):
        async with aiofiles.open(save_file_name, 'wb') as f:
            await f.write(pic_content)
            utils.logger.info(f"[XiaoHongShuImageStoreImplement.save_image] save image {save_file_name} success ...")
+
+
+class XiaoHongShuVideo(AbstractStoreVideo):
+    video_store_path: str = "data/xhs/videos"
+
+    async def store_video(self, video_content_item: Dict):
+        """
+        store content
+        Args:
+            video_content_item: dict holding "notice_id", "video_content" and "extension_file_name"
+
+        Returns:
+
+        """
+        await self.save_video(video_content_item.get("notice_id"), video_content_item.get("video_content"), video_content_item.get("extension_file_name"))
+
+    def make_save_file_name(self, notice_id: str, extension_file_name: str) -> str:
+        """
+        make save file name by store type
+        Args:
+            notice_id: notice id
+
+        Returns:
+
+        """
+        return f"{self.video_store_path}/{notice_id}/{extension_file_name}"
+
+    async def save_video(self, notice_id: str, video_content: str, extension_file_name):
+        """
+        save video to local
+        Args:
+            notice_id: notice id
+            video_content: video content
+
+        Returns:
+
+        """
+        pathlib.Path(self.video_store_path + "/" + notice_id).mkdir(parents=True, exist_ok=True)
+        save_file_name = self.make_save_file_name(notice_id, extension_file_name)
+        async with aiofiles.open(save_file_name, 'wb') as f:
+            await f.write(video_content)
+            utils.logger.info(f"[XiaoHongShuVideoStoreImplement.save_video] save video {save_file_name} success ...")