# MediaCrawler/tests/test_excel_store.py

# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/tests/test_excel_store.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。

"""
Unit tests for Excel export functionality
"""

import pytest
import asyncio
import os
from pathlib import Path
import tempfile
import shutil

try:
    import openpyxl
    EXCEL_AVAILABLE = True
except ImportError:
    EXCEL_AVAILABLE = False

from store.excel_store_base import ExcelStoreBase


@pytest.mark.skipif(not EXCEL_AVAILABLE, reason="openpyxl not installed")
class TestExcelStoreBase:
    """Tests for an ExcelStoreBase instance that is constructed directly.

    The class-level ``ExcelStoreBase._instances`` registry is emptied around
    every test, and each test that requests ``excel_store`` runs with the
    working directory switched to a per-test temporary directory.
    """

    @pytest.fixture(autouse=True)
    def clear_singleton_state(self):
        """Empty ``ExcelStoreBase._instances`` before and after each test."""
        ExcelStoreBase._instances.clear()
        yield
        ExcelStoreBase._instances.clear()

    @pytest.fixture
    def temp_dir(self, tmp_path):
        """Return the path of a per-test temporary directory as a string.

        Backed by pytest's ``tmp_path`` fixture, so creation and removal are
        managed by pytest instead of a manual ``mkdtemp``/``rmtree`` pair
        that silently ignored cleanup errors.
        """
        return str(tmp_path)

    @pytest.fixture
    def excel_store(self, temp_dir, monkeypatch):
        """Build an ExcelStoreBase for platform ``"test"`` / type ``"search"``.

        The working directory is switched to ``temp_dir`` first; monkeypatch
        restores it when the test ends.
        """
        # Presumably the store resolves its output location relative to the
        # current working directory, so this keeps generated files inside the
        # temporary directory -- confirm against ExcelStoreBase.
        monkeypatch.chdir(temp_dir)
        return ExcelStoreBase(platform="test", crawler_type="search")

    def test_initialization(self, excel_store):
        """A new store keeps its arguments and exposes a workbook and sheets."""
        assert excel_store.platform == "test"
        assert excel_store.crawler_type == "search"
        assert excel_store.workbook is not None
        assert excel_store.contents_sheet is not None
        assert excel_store.comments_sheet is not None
        assert excel_store.creators_sheet is not None

    @pytest.mark.asyncio
    async def test_store_content(self, excel_store):
        """Storing one content item yields a header row plus one data row."""
        content_item = {
            "note_id": "test123",
            "title": "Test Title",
            "desc": "Test Description",
            "user_id": "user456",
            "nickname": "TestUser",
            "liked_count": 100,
            "comment_count": 50,
        }

        await excel_store.store_content(content_item)

        assert excel_store.contents_sheet.max_row == 2  # Header + 1 data row
        assert excel_store.contents_headers_written is True

    @pytest.mark.asyncio
    async def test_store_comment(self, excel_store):
        """Storing one comment item yields a header row plus one data row."""
        comment_item = {
            "comment_id": "comment123",
            "note_id": "note456",
            "content": "Great post!",
            "user_id": "user789",
            "nickname": "Commenter",
            "like_count": 10,
        }

        await excel_store.store_comment(comment_item)

        assert excel_store.comments_sheet.max_row == 2  # Header + 1 data row
        assert excel_store.comments_headers_written is True

    @pytest.mark.asyncio
    async def test_store_creator(self, excel_store):
        """Storing one creator item yields a header row plus one data row."""
        creator_item = {
            "user_id": "creator123",
            "nickname": "Creator Name",
            "fans": 10000,
            "follows": 500,
            "interaction": 50000,
        }

        await excel_store.store_creator(creator_item)

        assert excel_store.creators_sheet.max_row == 2  # Header + 1 data row
        assert excel_store.creators_headers_written is True

    @pytest.mark.asyncio
    async def test_multiple_items(self, excel_store):
        """Five stored content items produce five data rows under the header."""
        for i in range(5):
            await excel_store.store_content({
                "note_id": f"note{i}",
                "title": f"Title {i}",
                "liked_count": i * 10,
            })

        assert excel_store.contents_sheet.max_row == 6  # Header + 5 data rows

    @pytest.mark.asyncio
    async def test_flush(self, excel_store):
        """flush() writes a file that openpyxl can reopen with a Contents sheet."""
        # Awaited on the test's own event loop (like the other async tests in
        # this class) rather than through asyncio.run(), which would create
        # and tear down a separate loop inside the test session.
        await excel_store.store_content({
            "note_id": "test",
            "title": "Test",
        })

        excel_store.flush()

        assert excel_store.filename.exists()

        wb = openpyxl.load_workbook(excel_store.filename)
        try:
            assert "Contents" in wb.sheetnames
        finally:
            # Release the workbook even when the assertion above fails.
            wb.close()

    @pytest.mark.asyncio
    async def test_header_formatting(self, excel_store):
        """The first header cell is bold and filled with colour 366092."""
        await excel_store.store_content({"note_id": "test", "title": "Test"})

        header_cell = excel_store.contents_sheet.cell(row=1, column=1)
        assert header_cell.font.bold is True
        # The RGB string may carry a two-character prefix (00 or FF), so only
        # the trailing six hex digits are compared.
        assert header_cell.fill.start_color.rgb[-6:] == "366092"

    @pytest.mark.asyncio
    async def test_empty_sheets_removed(self, excel_store):
        """After flush(), sheets that never received data are absent from the file."""
        # Only content is stored; comments and creators stay empty.
        await excel_store.store_content({"note_id": "test"})

        excel_store.flush()

        wb = openpyxl.load_workbook(excel_store.filename)
        try:
            assert "Contents" in wb.sheetnames
            assert "Comments" not in wb.sheetnames
            assert "Creators" not in wb.sheetnames
        finally:
            # Release the workbook even when an assertion above fails.
            wb.close()


@pytest.mark.skipif(not EXCEL_AVAILABLE, reason="openpyxl not installed")
def test_excel_import_availability():
    """Check the availability flag is set and openpyxl can be imported.

    The test is skipped when the module-level import guard recorded that
    openpyxl is missing.
    """
    flag = EXCEL_AVAILABLE
    assert flag is True

    # Re-import locally so the check does not rely on the module-level name.
    import openpyxl as excel_module

    assert excel_module is not None


@pytest.mark.skipif(not EXCEL_AVAILABLE, reason="openpyxl not installed")
class TestSingletonPattern:
    """Tests for ``ExcelStoreBase.get_instance`` and ``ExcelStoreBase.flush_all``.

    Every test runs inside pytest's ``tmp_path`` with the class-level
    ``_instances`` registry emptied before and after.
    """

    @pytest.fixture(autouse=True)
    def setup_and_teardown(self, tmp_path, monkeypatch):
        """Switch into ``tmp_path`` and reset ``_instances`` around each test."""
        # monkeypatch restores the previous working directory after the test.
        monkeypatch.chdir(tmp_path)
        ExcelStoreBase._instances.clear()
        yield
        ExcelStoreBase._instances.clear()

    def test_get_instance_returns_same_instance(self):
        """Two get_instance calls with equal arguments return one object."""
        instance1 = ExcelStoreBase.get_instance("xhs", "search")
        instance2 = ExcelStoreBase.get_instance("xhs", "search")

        assert instance1 is instance2

    def test_get_instance_different_params_returns_different_instances(self):
        """Changing the platform or the crawler type gives a distinct object."""
        instance1 = ExcelStoreBase.get_instance("xhs", "search")
        instance2 = ExcelStoreBase.get_instance("xhs", "detail")
        instance3 = ExcelStoreBase.get_instance("douyin", "search")

        assert instance1 is not instance2
        assert instance1 is not instance3
        assert instance2 is not instance3

    @pytest.mark.asyncio
    async def test_singleton_preserves_data(self):
        """Rows stored through separate get_instance calls share one sheet."""
        store1 = ExcelStoreBase.get_instance("test", "search")
        await store1.store_content({"note_id": "note1", "title": "Title 1"})

        # The second lookup must return the instance that already holds a row.
        store2 = ExcelStoreBase.get_instance("test", "search")
        await store2.store_content({"note_id": "note2", "title": "Title 2"})

        assert store1 is store2
        assert store1.contents_sheet.max_row == 3  # Header + 2 data rows

    @pytest.mark.asyncio
    async def test_flush_all_saves_all_instances(self):
        """flush_all() writes a file per instance and empties the registry."""
        store1 = ExcelStoreBase.get_instance("platform1", "search")
        store2 = ExcelStoreBase.get_instance("platform2", "search")

        # Awaited on the test's own event loop (like
        # test_singleton_preserves_data) rather than through asyncio.run(),
        # which would create and tear down a separate loop per call.
        await store1.store_content({"note_id": "note1"})
        await store2.store_content({"note_id": "note2"})

        ExcelStoreBase.flush_all()

        assert len(ExcelStoreBase._instances) == 0

        # The local references remain valid after the registry is cleared.
        assert store1.filename.exists()
        assert store2.filename.exists()

    def test_flush_all_clears_instances(self):
        """flush_all() leaves ``_instances`` empty."""
        ExcelStoreBase.get_instance("test", "search")
        assert len(ExcelStoreBase._instances) == 1

        ExcelStoreBase.flush_all()

        assert len(ExcelStoreBase._instances) == 0