Files
MediaCrawler/store/xhs/__init__.py

260 lines
9.2 KiB
Python

# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/store/xhs/__init__.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2024/1/14 17:34
# @Desc :
from typing import List
import config
from var import source_keyword_var
from .xhs_store_media import *
from ._store_impl import *
class XhsStoreFactory:
    """Factory mapping ``config.SAVE_DATA_OPTION`` to a concrete XHS store implementation."""

    # Registry of supported persistence backends.
    # "db" and "postgres" intentionally share the relational-DB implementation.
    STORES = {
        "csv": XhsCsvStoreImplement,
        "db": XhsDbStoreImplement,
        "postgres": XhsDbStoreImplement,
        "json": XhsJsonStoreImplement,
        "sqlite": XhsSqliteStoreImplement,
        "mongodb": XhsMongoStoreImplement,
        "excel": XhsExcelStoreImplement,
    }

    @staticmethod
    def create_store() -> AbstractStore:
        """
        Instantiate the store implementation selected by ``config.SAVE_DATA_OPTION``.

        Returns:
            AbstractStore: a fresh store instance.

        Raises:
            ValueError: if ``config.SAVE_DATA_OPTION`` is not a key of ``STORES``.
        """
        store_class = XhsStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
        if not store_class:
            # Echo the offending value and derive the supported set from the
            # registry itself (the old hard-coded message omitted "postgres").
            supported = ", ".join(XhsStoreFactory.STORES)
            raise ValueError(
                f"[XhsStoreFactory.create_store] Invalid save option "
                f"'{config.SAVE_DATA_OPTION}', supported options: {supported}"
            )
        return store_class()
def get_video_url_arr(note_item: Dict) -> List:
    """
    Extract the list of video URLs from a raw note payload.

    Args:
        note_item: raw note dict as returned by the XHS API.

    Returns:
        List of video URL strings; empty when the note is not a video
        or no URL can be resolved.
    """
    if note_item.get('type') != 'video':
        return []
    video_dict = note_item.get('video')
    if not video_dict:
        return []
    # Prefer the watermark-free origin key; the API has used both the
    # snake_case and camelCase spellings of this field.
    consumer = video_dict.get('consumer', {})
    origin_video_key = consumer.get('origin_video_key', '') or consumer.get('originVideoKey', '')
    if origin_video_key:
        return [f"http://sns-video-bd.xhscdn.com/{origin_video_key}"]
    # Fallback (watermarked): h264 stream entries under media/stream.
    videos = video_dict.get('media', {}).get('stream', {}).get('h264')
    if isinstance(videos, list):
        # Drop entries without a master_url so callers' ','.join() never
        # receives None.
        return [v.get('master_url') for v in videos if v.get('master_url')]
    return []
async def update_xhs_note(note_item: Dict):
    """
    Persist a Xiaohongshu note through the configured store backend.

    Args:
        note_item: raw note dict as returned by the XHS API.
    """
    note_id = note_item.get("note_id")
    user_info = note_item.get("user", {})
    interact_info = note_item.get("interact_info", {})
    image_list: List[Dict] = note_item.get("image_list", [])
    tag_list: List[Dict] = note_item.get("tag_list", [])

    # Prefer the default-quality image URL; only overwrite 'url' when
    # 'url_default' actually holds a value (the previous `!= ''` check also
    # matched a missing key and clobbered 'url' with None).
    for img in image_list:
        if img.get('url_default'):
            img.update({'url': img.get('url_default')})

    video_url = ','.join(get_video_url_arr(note_item))

    local_db_item = {
        "note_id": note_item.get("note_id"),  # Note ID
        "type": note_item.get("type"),  # Note type
        # Fall back to a truncated description when the title is empty;
        # `or ""` guards against desc being explicitly None.
        "title": note_item.get("title") or (note_item.get("desc") or "")[:255],
        "desc": note_item.get("desc", ""),  # Note description
        "video_url": video_url,  # Note video url
        "time": note_item.get("time"),  # Note publish time
        "last_update_time": note_item.get("last_update_time", 0),  # Note last update time
        "user_id": user_info.get("user_id"),  # User ID
        "nickname": user_info.get("nickname"),  # User nickname
        "avatar": user_info.get("avatar"),  # User avatar
        "liked_count": interact_info.get("liked_count"),  # Like count
        "collected_count": interact_info.get("collected_count"),  # Collection count
        "comment_count": interact_info.get("comment_count"),  # Comment count
        "share_count": interact_info.get("share_count"),  # Share count
        "ip_location": note_item.get("ip_location", ""),  # IP location
        "image_list": ','.join([img.get('url', '') for img in image_list]),  # Image URLs
        "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),  # Tags
        "last_modify_ts": utils.get_current_timestamp(),  # Last modification timestamp (Generated by MediaCrawler, mainly used to record the latest update time of a record in DB storage)
        "note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search",  # Note URL
        "source_keyword": source_keyword_var.get(),  # Search keyword
        "xsec_token": note_item.get("xsec_token"),  # xsec_token
    }
    utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
    await XhsStoreFactory.create_store().store_content(local_db_item)
async def batch_update_xhs_note_comments(note_id: str, comments: List[Dict]):
    """
    Persist every comment of a single note, one at a time.

    Args:
        note_id: ID of the note the comments belong to.
        comments: list of raw comment dicts; may be None or empty.
    """
    # `or []` makes a None/empty comment list a no-op.
    for single_comment in comments or []:
        await update_xhs_note_comment(note_id, single_comment)
async def update_xhs_note_comment(note_id: str, comment_item: Dict):
    """
    Persist one Xiaohongshu note comment through the configured store backend.

    Args:
        note_id: ID of the note this comment belongs to.
        comment_item: raw comment dict as returned by the XHS API.
    """
    commenter = comment_item.get("user_info", {})
    parent = comment_item.get("target_comment", {})
    # Comments can carry attached pictures; store them as a comma-joined string.
    picture_urls = ",".join(
        pic.get("url_default", "") for pic in comment_item.get("pictures", [])
    )
    local_db_item = {
        "comment_id": comment_item.get("id"),  # Comment ID
        "create_time": comment_item.get("create_time"),  # Comment time
        "ip_location": comment_item.get("ip_location"),  # IP location
        "note_id": note_id,  # Note ID
        "content": comment_item.get("content"),  # Comment content
        "user_id": commenter.get("user_id"),  # User ID
        "nickname": commenter.get("nickname"),  # User nickname
        "avatar": commenter.get("image"),  # User avatar
        "sub_comment_count": comment_item.get("sub_comment_count", 0),  # Sub-comment count
        "pictures": picture_urls,  # Comment pictures
        "parent_comment_id": parent.get("id", 0),  # Parent comment ID (0 for top-level)
        "last_modify_ts": utils.get_current_timestamp(),  # Last modification timestamp (Generated by MediaCrawler, mainly used to record the latest update time of a record in DB storage)
        "like_count": comment_item.get("like_count", 0),
    }
    utils.logger.info(f"[store.xhs.update_xhs_note_comment] xhs note comment:{local_db_item}")
    await XhsStoreFactory.create_store().store_comment(local_db_item)
async def save_creator(user_id: str, creator: Dict):
    """
    Persist a Xiaohongshu creator profile through the configured store backend.

    Args:
        user_id: the creator's user ID.
        creator: raw creator dict as returned by the XHS API.
    """
    user_info = creator.get('basicInfo', {})

    follows = 0
    fans = 0
    interaction = 0
    # 'interactions' may be absent or None; `or []` keeps the loop safe
    # (previously this raised TypeError on a missing key).
    for i in creator.get('interactions') or []:
        if i.get('type') == 'follows':
            follows = i.get('count')
        elif i.get('type') == 'fans':
            fans = i.get('count')
        elif i.get('type') == 'interaction':
            interaction = i.get('count')

    def get_gender(gender):
        # API encodes gender as 1 = female, 0 = male; anything else is unknown.
        if gender == 1:
            return 'Female'
        elif gender == 0:
            return 'Male'
        else:
            return None

    local_db_item = {
        'user_id': user_id,  # User ID
        'nickname': user_info.get('nickname'),  # Nickname
        'gender': get_gender(user_info.get('gender')),  # Gender
        'avatar': user_info.get('images'),  # Avatar
        'desc': user_info.get('desc'),  # Personal description
        'ip_location': user_info.get('ipLocation'),  # IP location
        'follows': follows,  # Following count
        'fans': fans,  # Fans count
        'interaction': interaction,  # Interaction count
        # 'tags' may also be absent; guard the comprehension the same way.
        'tag_list': json.dumps({tag.get('tagType'): tag.get('name')
                                for tag in creator.get('tags') or []}, ensure_ascii=False),  # Tags
        "last_modify_ts": utils.get_current_timestamp(),  # Last modification timestamp (Generated by MediaCrawler, mainly used to record the latest update time of a record in DB storage)
    }
    utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
    await XhsStoreFactory.create_store().store_creator(local_db_item)
async def update_xhs_note_image(note_id, pic_content, extension_file_name):
    """
    Persist a downloaded note image via the XiaoHongShuImage store.

    Args:
        note_id: ID of the note the image belongs to.
        pic_content: raw image payload (presumably bytes — TODO confirm with caller).
        extension_file_name: file name / extension used when saving the image.
    """
    # NOTE(review): the payload key is "notice_id" (not "note_id"); this is
    # what XiaoHongShuImage.store_image apparently expects — keep in sync.
    await XiaoHongShuImage().store_image({"notice_id": note_id, "pic_content": pic_content, "extension_file_name": extension_file_name})
async def update_xhs_note_video(note_id, video_content, extension_file_name):
    """
    Persist a downloaded note video via the XiaoHongShuVideo store.

    Args:
        note_id: ID of the note the video belongs to.
        video_content: raw video payload (presumably bytes — TODO confirm with caller).
        extension_file_name: file name / extension used when saving the video.
    """
    # NOTE(review): the payload key is "notice_id" (not "note_id"); this is
    # what XiaoHongShuVideo.store_video apparently expects — keep in sync.
    await XiaoHongShuVideo().store_video({"notice_id": note_id, "video_content": video_content, "extension_file_name": extension_file_name})