diff --git a/README.md b/README.md
index c94954d..bced90e 100644
--- a/README.md
+++ b/README.md
@@ -228,14 +228,15 @@ python main.py --help
 ### 使用示例:
 ```shell
-# 使用 Excel 存储数据(推荐用于数据分析)✨ 新功能
-uv run main.py --platform xhs --lt qrcode --type search --save_data_option excel
-
-# 初始化 SQLite 数据库(使用'--init_db'时不需要携带其他optional)
+# 初始化 SQLite 数据库
 uv run main.py --init_db sqlite

-# 使用 SQLite 存储数据(推荐个人用户使用)
+# 使用 SQLite 存储数据
 uv run main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
+
+# 使用 Excel 存储数据(推荐用于数据分析)
+uv run main.py --platform xhs --lt qrcode --type search --save_data_option excel
 ```
+
 ```shell
 # 初始化 MySQL 数据库
 uv run main.py --init_db mysql
diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py
index 4f3bca6..fc8d53d 100644
--- a/cmd_arg/arg.py
+++ b/cmd_arg/arg.py
@@ -71,6 +71,8 @@ class SaveDataOptionEnum(str, Enum):
     DB = "db"
     JSON = "json"
     SQLITE = "sqlite"
+    MONGODB = "mongodb"
+    EXCEL = "excel"


 class InitDbOptionEnum(str, Enum):
@@ -199,7 +201,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
         SaveDataOptionEnum,
         typer.Option(
             "--save_data_option",
-            help="数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库)",
+            help="数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库 | mongodb=MongoDB数据库 | excel=Excel文件)",
             rich_help_panel="存储配置",
         ),
     ] = _coerce_enum(
diff --git a/main.py b/main.py
index cfdd7b0..4bf2268 100644
--- a/main.py
+++ b/main.py
@@ -87,14 +87,11 @@ async def main():
     # Flush Excel data if using Excel export
     if config.SAVE_DATA_OPTION == "excel":
         try:
-            # Get the store instance and flush data
-            from store.xhs import XhsStoreFactory
-            store = XhsStoreFactory.create_store()
-            if hasattr(store, 'flush'):
-                store.flush()
-            print(f"[Main] Excel file saved successfully")
+            from store.excel_store_base import ExcelStoreBase
+            ExcelStoreBase.flush_all()
+            print("[Main] Excel files saved successfully")
         except Exception as e:
-            print(f"Error flushing Excel data: {e}")
+            print(f"[Main] Error flushing Excel data: {e}")

     # Generate wordcloud after crawling is complete
     # Only for JSON save mode
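The main.py hook above only works because `ExcelStoreBase` (further down in this diff) keeps a class-level registry of live stores. A minimal end-to-end sketch of that flow, not part of the diff itself — it assumes the repo root is importable and openpyxl is installed, and the sample record is made up:

```python
# Minimal sketch of the new Excel flow (illustration only, not part of the diff).
import asyncio

from store.excel_store_base import ExcelStoreBase


async def demo() -> None:
    # Same call the platform-specific Excel implementations delegate to.
    store = ExcelStoreBase.get_instance(platform="xhs", crawler_type="search")
    await store.store_content({"note_id": "demo-note", "title": "hello", "liked_count": 1})

    # Same call main.py now makes once crawling finishes: writes
    # data/xhs/xhs_search_<timestamp>.xlsx and clears the registry.
    ExcelStoreBase.flush_all()


asyncio.run(demo())
```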
platform="bilibili", + crawler_type=crawler_type_var.get() + ) diff --git a/store/douyin/__init__.py b/store/douyin/__init__.py index f1accbe..7ddd2e1 100644 --- a/store/douyin/__init__.py +++ b/store/douyin/__init__.py @@ -37,13 +37,14 @@ class DouyinStoreFactory: "json": DouyinJsonStoreImplement, "sqlite": DouyinSqliteStoreImplement, "mongodb": DouyinMongoStoreImplement, + "excel": DouyinExcelStoreImplement, } @staticmethod def create_store() -> AbstractStore: store_class = DouyinStoreFactory.STORES.get(config.SAVE_DATA_OPTION) if not store_class: - raise ValueError("[DouyinStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...") + raise ValueError("[DouyinStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...") return store_class() diff --git a/store/douyin/_store_impl.py b/store/douyin/_store_impl.py index daac93b..dbf7c01 100644 --- a/store/douyin/_store_impl.py +++ b/store/douyin/_store_impl.py @@ -264,3 +264,14 @@ class DouyinMongoStoreImplement(AbstractStore): data=creator_item ) utils.logger.info(f"[DouyinMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB") + + +class DouyinExcelStoreImplement: + """抖音Excel存储实现 - 全局单例""" + + def __new__(cls, *args, **kwargs): + from store.excel_store_base import ExcelStoreBase + return ExcelStoreBase.get_instance( + platform="douyin", + crawler_type=crawler_type_var.get() + ) diff --git a/store/excel_store_base.py b/store/excel_store_base.py index 324383f..052810e 100644 --- a/store/excel_store_base.py +++ b/store/excel_store_base.py @@ -2,10 +2,20 @@ # Copyright (c) 2025 relakkes@gmail.com # # This file is part of MediaCrawler project. -# Repository: https://github.com/NanmiCoder/MediaCrawler +# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/store/excel_store_base.py # GitHub: https://github.com/NanmiCoder # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1 # +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + # 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: # 1. 不得用于任何商业用途。 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 @@ -21,7 +31,7 @@ Excel Store Base Implementation Provides Excel export functionality for crawled data with formatted sheets """ -import os +import threading from datetime import datetime from typing import Dict, List, Any from pathlib import Path @@ -42,12 +52,50 @@ class ExcelStoreBase(AbstractStore): """ Base class for Excel storage implementation Provides formatted Excel export with multiple sheets for contents, comments, and creators + Uses singleton pattern to maintain state across multiple store calls """ - + + # Class-level singleton management + _instances: Dict[str, "ExcelStoreBase"] = {} + _lock = threading.Lock() + + @classmethod + def get_instance(cls, platform: str, crawler_type: str) -> "ExcelStoreBase": + """ + Get or create a singleton instance for the given platform and crawler type + + Args: + platform: Platform name (xhs, dy, ks, etc.) 
diff --git a/store/excel_store_base.py b/store/excel_store_base.py
index 324383f..052810e 100644
--- a/store/excel_store_base.py
+++ b/store/excel_store_base.py
@@ -2,10 +2,10 @@
 # Copyright (c) 2025 relakkes@gmail.com
 #
 # This file is part of MediaCrawler project.
-# Repository: https://github.com/NanmiCoder/MediaCrawler
+# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/store/excel_store_base.py
 # GitHub: https://github.com/NanmiCoder
 # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
 #
 # 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
 # 1. 不得用于任何商业用途。
 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
 # 3. 不得进行大规模爬取或对平台造成运营干扰。
 # 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
@@ -21,7 +31,7 @@ Excel Store Base Implementation
 Provides Excel export functionality for crawled data with formatted sheets
 """

-import os
+import threading
 from datetime import datetime
 from typing import Dict, List, Any
 from pathlib import Path
@@ -42,12 +52,50 @@ class ExcelStoreBase(AbstractStore):
     """
     Base class for Excel storage implementation
     Provides formatted Excel export with multiple sheets for contents, comments, and creators
+    Uses singleton pattern to maintain state across multiple store calls
     """
-
+
+    # Class-level singleton management
+    _instances: Dict[str, "ExcelStoreBase"] = {}
+    _lock = threading.Lock()
+
+    @classmethod
+    def get_instance(cls, platform: str, crawler_type: str) -> "ExcelStoreBase":
+        """
+        Get or create a singleton instance for the given platform and crawler type
+
+        Args:
+            platform: Platform name (xhs, dy, ks, etc.)
+            crawler_type: Type of crawler (search, detail, creator)
+
+        Returns:
+            ExcelStoreBase instance
+        """
+        key = f"{platform}_{crawler_type}"
+        with cls._lock:
+            if key not in cls._instances:
+                cls._instances[key] = cls(platform, crawler_type)
+            return cls._instances[key]
+
+    @classmethod
+    def flush_all(cls):
+        """
+        Flush all Excel store instances and save to files
+        Should be called at the end of crawler execution
+        """
+        with cls._lock:
+            for key, instance in cls._instances.items():
+                try:
+                    instance.flush()
+                    utils.logger.info(f"[ExcelStoreBase] Flushed instance: {key}")
+                except Exception as e:
+                    utils.logger.error(f"[ExcelStoreBase] Error flushing {key}: {e}")
+            cls._instances.clear()
+
     def __init__(self, platform: str, crawler_type: str = "search"):
         """
         Initialize Excel store
-
+
         Args:
             platform: Platform name (xhs, dy, ks, etc.)
             crawler_type: Type of crawler (search, detail, creator)
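Stripped of the Excel details, `get_instance()`/`flush_all()` implement a small lock-guarded, keyed singleton registry. A generic sketch of just that pattern (all names here are illustrative, not MediaCrawler code):

```python
# Generic sketch of the registry behind get_instance()/flush_all(): one lock
# guards a dict keyed by "<platform>_<crawler_type>", so concurrent callers
# cannot hit the check-then-create race and build two workbooks for one key.
import threading
from typing import Dict


class KeyedRegistry:
    _instances: Dict[str, "KeyedRegistry"] = {}
    _lock = threading.Lock()

    def __init__(self, key: str):
        self.key = key

    @classmethod
    def get_instance(cls, key: str) -> "KeyedRegistry":
        with cls._lock:  # serialize the "missing? then create" sequence
            if key not in cls._instances:
                cls._instances[key] = cls(key)
            return cls._instances[key]

    @classmethod
    def clear_all(cls) -> None:
        with cls._lock:
            cls._instances.clear()
```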
@@ -57,39 +105,45 @@ class ExcelStoreBase(AbstractStore):
                 "openpyxl is required for Excel export. "
                 "Install it with: pip install openpyxl"
             )
-
+
         super().__init__()
         self.platform = platform
         self.crawler_type = crawler_type
-
+
         # Create data directory
         self.data_dir = Path("data") / platform
         self.data_dir.mkdir(parents=True, exist_ok=True)
-
+
         # Initialize workbook
         self.workbook = openpyxl.Workbook()
         self.workbook.remove(self.workbook.active)  # Remove default sheet
-
+
         # Create sheets
         self.contents_sheet = self.workbook.create_sheet("Contents")
         self.comments_sheet = self.workbook.create_sheet("Comments")
         self.creators_sheet = self.workbook.create_sheet("Creators")
-
+
         # Track if headers are written
         self.contents_headers_written = False
         self.comments_headers_written = False
         self.creators_headers_written = False
-
+        self.contacts_headers_written = False
+        self.dynamics_headers_written = False
+
+        # Optional sheets for platforms that need them (e.g., Bilibili)
+        self.contacts_sheet = None
+        self.dynamics_sheet = None
+
         # Generate filename
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         self.filename = self.data_dir / f"{platform}_{crawler_type}_{timestamp}.xlsx"
-
+
         utils.logger.info(f"[ExcelStoreBase] Initialized Excel export to: {self.filename}")
-
+
     def _apply_header_style(self, sheet, row_num: int = 1):
         """
         Apply formatting to header row
-
+
         Args:
             sheet: Worksheet object
             row_num: Row number for headers (default: 1)
@@ -103,70 +157,70 @@ class ExcelStoreBase(AbstractStore):
             top=Side(style='thin'),
             bottom=Side(style='thin')
         )
-
+
         for cell in sheet[row_num]:
             cell.fill = header_fill
             cell.font = header_font
             cell.alignment = header_alignment
             cell.border = border
-
+
     def _auto_adjust_column_width(self, sheet):
         """
         Auto-adjust column widths based on content
-
+
         Args:
             sheet: Worksheet object
         """
         for column in sheet.columns:
             max_length = 0
             column_letter = get_column_letter(column[0].column)
-
+
             for cell in column:
                 try:
                     if cell.value:
                         max_length = max(max_length, len(str(cell.value)))
-                except:
+                except (TypeError, AttributeError):
                     pass
-
+
             # Set width with min/max constraints
             adjusted_width = min(max(max_length + 2, 10), 50)
             sheet.column_dimensions[column_letter].width = adjusted_width
-
+
     def _write_headers(self, sheet, headers: List[str]):
         """
         Write headers to sheet
-
+
         Args:
             sheet: Worksheet object
             headers: List of header names
         """
         for col_num, header in enumerate(headers, 1):
             sheet.cell(row=1, column=col_num, value=header)
-
+
         self._apply_header_style(sheet)
-
+
     def _write_row(self, sheet, data: Dict[str, Any], headers: List[str]):
         """
         Write data row to sheet
-
+
         Args:
             sheet: Worksheet object
             data: Data dictionary
             headers: List of header names (defines column order)
         """
         row_num = sheet.max_row + 1
-
+
         for col_num, header in enumerate(headers, 1):
             value = data.get(header, "")
-
+
             # Handle different data types
             if isinstance(value, (list, dict)):
                 value = str(value)
             elif value is None:
                 value = ""
-
+
             cell = sheet.cell(row=row_num, column=col_num, value=value)
-
+
             # Apply basic formatting
             cell.alignment = Alignment(vertical="top", wrap_text=True)
             cell.border = Border(
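For readers unfamiliar with openpyxl, a compact standalone sketch of the styling calls the helpers above rely on (throwaway workbook, the color and width bounds mirror the diff, the output path is illustrative):

```python
# Standalone openpyxl sketch of what _write_headers()/_auto_adjust_column_width() do.
import openpyxl
from openpyxl.styles import Alignment, Font, PatternFill
from openpyxl.utils import get_column_letter

wb = openpyxl.Workbook()
ws = wb.active
ws.append(["note_id", "title", "liked_count"])  # header row

for cell in ws[1]:
    cell.fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
    cell.font = Font(bold=True, color="FFFFFF")
    cell.alignment = Alignment(horizontal="center", vertical="center")

for column in ws.columns:
    letter = get_column_letter(column[0].column)
    longest = max(len(str(c.value)) for c in column if c.value is not None)
    ws.column_dimensions[letter].width = min(max(longest + 2, 10), 50)

wb.save("demo_headers.xlsx")  # illustrative output path
```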
'N/A')}") + + async def store_dynamic(self, dynamic_item: Dict): + """ + Store dynamic data to Excel (for platforms like Bilibili) + + Args: + dynamic_item: Dynamic data dictionary + """ + # Create dynamics sheet if not exists + if self.dynamics_sheet is None: + self.dynamics_sheet = self.workbook.create_sheet("Dynamics") + + # Define headers + headers = list(dynamic_item.keys()) + + # Write headers if first time + if not self.dynamics_headers_written: + self._write_headers(self.dynamics_sheet, headers) + self.dynamics_headers_written = True + + # Write data row + self._write_row(self.dynamics_sheet, dynamic_item, headers) + + utils.logger.info(f"[ExcelStoreBase] Stored dynamic to Excel: {dynamic_item.get('dynamic_id', 'N/A')}") + def flush(self): """ Save workbook to file """ try: - # Auto-adjust column widths + # Auto-adjust column widths for all sheets self._auto_adjust_column_width(self.contents_sheet) self._auto_adjust_column_width(self.comments_sheet) self._auto_adjust_column_width(self.creators_sheet) - - # Remove empty sheets + if self.contacts_sheet is not None: + self._auto_adjust_column_width(self.contacts_sheet) + if self.dynamics_sheet is not None: + self._auto_adjust_column_width(self.dynamics_sheet) + + # Remove empty sheets (only header row) if self.contents_sheet.max_row == 1: self.workbook.remove(self.contents_sheet) if self.comments_sheet.max_row == 1: self.workbook.remove(self.comments_sheet) if self.creators_sheet.max_row == 1: self.workbook.remove(self.creators_sheet) - + if self.contacts_sheet is not None and self.contacts_sheet.max_row == 1: + self.workbook.remove(self.contacts_sheet) + if self.dynamics_sheet is not None and self.dynamics_sheet.max_row == 1: + self.workbook.remove(self.dynamics_sheet) + + # Check if there are any sheets left + if len(self.workbook.sheetnames) == 0: + utils.logger.info(f"[ExcelStoreBase] No data to save, skipping file creation: {self.filename}") + return + # Save workbook self.workbook.save(self.filename) utils.logger.info(f"[ExcelStoreBase] Excel file saved successfully: {self.filename}") - + except Exception as e: utils.logger.error(f"[ExcelStoreBase] Error saving Excel file: {e}") raise diff --git a/store/kuaishou/__init__.py b/store/kuaishou/__init__.py index 21465e1..a1dd613 100644 --- a/store/kuaishou/__init__.py +++ b/store/kuaishou/__init__.py @@ -37,6 +37,7 @@ class KuaishouStoreFactory: "json": KuaishouJsonStoreImplement, "sqlite": KuaishouSqliteStoreImplement, "mongodb": KuaishouMongoStoreImplement, + "excel": KuaishouExcelStoreImplement, } @staticmethod @@ -44,7 +45,7 @@ class KuaishouStoreFactory: store_class = KuaishouStoreFactory.STORES.get(config.SAVE_DATA_OPTION) if not store_class: raise ValueError( - "[KuaishouStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...") + "[KuaishouStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...") return store_class() diff --git a/store/kuaishou/_store_impl.py b/store/kuaishou/_store_impl.py index 4fd4a77..9beaee5 100644 --- a/store/kuaishou/_store_impl.py +++ b/store/kuaishou/_store_impl.py @@ -226,3 +226,14 @@ class KuaishouMongoStoreImplement(AbstractStore): data=creator_item ) utils.logger.info(f"[KuaishouMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB") + + +class KuaishouExcelStoreImplement: + """快手Excel存储实现 - 全局单例""" + + def __new__(cls, *args, **kwargs): + from store.excel_store_base import ExcelStoreBase + return ExcelStoreBase.get_instance( 
+ platform="kuaishou", + crawler_type=crawler_type_var.get() + ) diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py index 4c8df24..349baef 100644 --- a/store/tieba/__init__.py +++ b/store/tieba/__init__.py @@ -34,6 +34,7 @@ class TieBaStoreFactory: "json": TieBaJsonStoreImplement, "sqlite": TieBaSqliteStoreImplement, "mongodb": TieBaMongoStoreImplement, + "excel": TieBaExcelStoreImplement, } @staticmethod @@ -41,7 +42,7 @@ class TieBaStoreFactory: store_class = TieBaStoreFactory.STORES.get(config.SAVE_DATA_OPTION) if not store_class: raise ValueError( - "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...") + "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...") return store_class() diff --git a/store/tieba/_store_impl.py b/store/tieba/_store_impl.py index 30a7ac9..be22806 100644 --- a/store/tieba/_store_impl.py +++ b/store/tieba/_store_impl.py @@ -258,3 +258,14 @@ class TieBaMongoStoreImplement(AbstractStore): data=creator_item ) utils.logger.info(f"[TieBaMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB") + + +class TieBaExcelStoreImplement: + """贴吧Excel存储实现 - 全局单例""" + + def __new__(cls, *args, **kwargs): + from store.excel_store_base import ExcelStoreBase + return ExcelStoreBase.get_instance( + platform="tieba", + crawler_type=crawler_type_var.get() + ) diff --git a/store/weibo/__init__.py b/store/weibo/__init__.py index 6bde351..ec50e72 100644 --- a/store/weibo/__init__.py +++ b/store/weibo/__init__.py @@ -38,13 +38,14 @@ class WeibostoreFactory: "json": WeiboJsonStoreImplement, "sqlite": WeiboSqliteStoreImplement, "mongodb": WeiboMongoStoreImplement, + "excel": WeiboExcelStoreImplement, } @staticmethod def create_store() -> AbstractStore: store_class = WeibostoreFactory.STORES.get(config.SAVE_DATA_OPTION) if not store_class: - raise ValueError("[WeibotoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...") + raise ValueError("[WeibotoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...") return store_class() diff --git a/store/weibo/_store_impl.py b/store/weibo/_store_impl.py index 01c8c18..568ab74 100644 --- a/store/weibo/_store_impl.py +++ b/store/weibo/_store_impl.py @@ -280,3 +280,14 @@ class WeiboMongoStoreImplement(AbstractStore): data=creator_item ) utils.logger.info(f"[WeiboMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB") + + +class WeiboExcelStoreImplement: + """微博Excel存储实现 - 全局单例""" + + def __new__(cls, *args, **kwargs): + from store.excel_store_base import ExcelStoreBase + return ExcelStoreBase.get_instance( + platform="weibo", + crawler_type=crawler_type_var.get() + ) diff --git a/store/xhs/_store_impl.py b/store/xhs/_store_impl.py index 0944c65..2031ad0 100644 --- a/store/xhs/_store_impl.py +++ b/store/xhs/_store_impl.py @@ -339,9 +339,12 @@ class XhsMongoStoreImplement(AbstractStore): utils.logger.info(f"[XhsMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB") -class XhsExcelStoreImplement(ExcelStoreBase): - """小红书Excel存储实现""" +class XhsExcelStoreImplement: + """小红书Excel存储实现 - 全局单例""" - def __init__(self, **kwargs): - super().__init__(platform="xhs", crawler_type=crawler_type_var.get()) - utils.logger.info("[XhsExcelStoreImplement] Excel store initialized") + def __new__(cls, *args, **kwargs): + from store.excel_store_base import ExcelStoreBase + return 
diff --git a/store/weibo/__init__.py b/store/weibo/__init__.py
index 6bde351..ec50e72 100644
--- a/store/weibo/__init__.py
+++ b/store/weibo/__init__.py
@@ -38,13 +38,14 @@ class WeibostoreFactory:
         "json": WeiboJsonStoreImplement,
         "sqlite": WeiboSqliteStoreImplement,
         "mongodb": WeiboMongoStoreImplement,
+        "excel": WeiboExcelStoreImplement,
     }

     @staticmethod
     def create_store() -> AbstractStore:
         store_class = WeibostoreFactory.STORES.get(config.SAVE_DATA_OPTION)
         if not store_class:
-            raise ValueError("[WeibotoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
+            raise ValueError("[WeibostoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...")
         return store_class()


diff --git a/store/weibo/_store_impl.py b/store/weibo/_store_impl.py
index 01c8c18..568ab74 100644
--- a/store/weibo/_store_impl.py
+++ b/store/weibo/_store_impl.py
@@ -280,3 +280,14 @@
             data=creator_item
         )
         utils.logger.info(f"[WeiboMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
+
+
+class WeiboExcelStoreImplement:
+    """微博Excel存储实现 - 全局单例"""
+
+    def __new__(cls, *args, **kwargs):
+        from store.excel_store_base import ExcelStoreBase
+        return ExcelStoreBase.get_instance(
+            platform="weibo",
+            crawler_type=crawler_type_var.get()
+        )
diff --git a/store/xhs/_store_impl.py b/store/xhs/_store_impl.py
index 0944c65..2031ad0 100644
--- a/store/xhs/_store_impl.py
+++ b/store/xhs/_store_impl.py
@@ -339,9 +339,12 @@ class XhsMongoStoreImplement(AbstractStore):
         utils.logger.info(f"[XhsMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")


-class XhsExcelStoreImplement(ExcelStoreBase):
-    """小红书Excel存储实现"""
+class XhsExcelStoreImplement:
+    """小红书Excel存储实现 - 全局单例"""

-    def __init__(self, **kwargs):
-        super().__init__(platform="xhs", crawler_type=crawler_type_var.get())
-        utils.logger.info("[XhsExcelStoreImplement] Excel store initialized")
+    def __new__(cls, *args, **kwargs):
+        from store.excel_store_base import ExcelStoreBase
+        return ExcelStoreBase.get_instance(
+            platform="xhs",
+            crawler_type=crawler_type_var.get()
+        )
diff --git a/store/zhihu/__init__.py b/store/zhihu/__init__.py
index ace071f..bebf36d 100644
--- a/store/zhihu/__init__.py
+++ b/store/zhihu/__init__.py
@@ -28,7 +28,8 @@
 from ._store_impl import (ZhihuCsvStoreImplement,
                           ZhihuDbStoreImplement,
                           ZhihuJsonStoreImplement,
                           ZhihuSqliteStoreImplement,
-                          ZhihuMongoStoreImplement)
+                          ZhihuMongoStoreImplement,
+                          ZhihuExcelStoreImplement)
 from tools import utils
 from var import source_keyword_var
@@ -40,13 +41,14 @@ class ZhihuStoreFactory:
         "json": ZhihuJsonStoreImplement,
         "sqlite": ZhihuSqliteStoreImplement,
         "mongodb": ZhihuMongoStoreImplement,
+        "excel": ZhihuExcelStoreImplement,
     }

     @staticmethod
     def create_store() -> AbstractStore:
         store_class = ZhihuStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
         if not store_class:
-            raise ValueError("[ZhihuStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
+            raise ValueError("[ZhihuStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...")
         return store_class()

 async def batch_update_zhihu_contents(contents: List[ZhihuContent]):
diff --git a/store/zhihu/_store_impl.py b/store/zhihu/_store_impl.py
index 07a2e00..a12bd2d 100644
--- a/store/zhihu/_store_impl.py
+++ b/store/zhihu/_store_impl.py
@@ -257,3 +257,14 @@
             data=creator_item
         )
         utils.logger.info(f"[ZhihuMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
+
+
+class ZhihuExcelStoreImplement:
+    """知乎Excel存储实现 - 全局单例"""
+
+    def __new__(cls, *args, **kwargs):
+        from store.excel_store_base import ExcelStoreBase
+        return ExcelStoreBase.get_instance(
+            platform="zhihu",
+            crawler_type=crawler_type_var.get()
+        )
diff --git a/tests/test_excel_store.py b/tests/test_excel_store.py
index 18b55d4..63fd818 100644
--- a/tests/test_excel_store.py
+++ b/tests/test_excel_store.py
@@ -1,4 +1,21 @@
 # -*- coding: utf-8 -*-
+# Copyright (c) 2025 relakkes@gmail.com
+#
+# This file is part of MediaCrawler project.
+# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/tests/test_excel_store.py
+# GitHub: https://github.com/NanmiCoder
+# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
+#
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
 """
 Unit tests for Excel export functionality
 """
@@ -22,7 +39,14 @@ from store.excel_store_base import ExcelStoreBase
 @pytest.mark.skipif(not EXCEL_AVAILABLE, reason="openpyxl not installed")
 class TestExcelStoreBase:
     """Test cases for ExcelStoreBase"""
-
+
+    @pytest.fixture(autouse=True)
+    def clear_singleton_state(self):
+        """Clear singleton state before and after each test"""
+        ExcelStoreBase._instances.clear()
+        yield
+        ExcelStoreBase._instances.clear()
+
     @pytest.fixture
     def temp_dir(self):
         """Create temporary directory for test files"""
@@ -30,7 +54,7 @@ class TestExcelStoreBase:
         yield temp_path
         # Cleanup
         shutil.rmtree(temp_path, ignore_errors=True)
-
+
     @pytest.fixture
     def excel_store(self, temp_dir, monkeypatch):
         """Create ExcelStoreBase instance for testing"""
@@ -39,7 +63,7 @@ class TestExcelStoreBase:
         store = ExcelStoreBase(platform="test", crawler_type="search")
         yield store
         # Cleanup is handled by temp_dir fixture
-
+
     def test_initialization(self, excel_store):
         """Test Excel store initialization"""
         assert excel_store.platform == "test"
@@ -48,7 +72,7 @@ class TestExcelStoreBase:
         assert excel_store.contents_sheet is not None
         assert excel_store.comments_sheet is not None
         assert excel_store.creators_sheet is not None
-
+
     @pytest.mark.asyncio
     async def test_store_content(self, excel_store):
         """Test storing content data"""
@@ -61,13 +85,13 @@ class TestExcelStoreBase:
             "liked_count": 100,
             "comment_count": 50
         }
-
+
         await excel_store.store_content(content_item)
-
+
         # Verify data was written
         assert excel_store.contents_sheet.max_row == 2  # Header + 1 data row
         assert excel_store.contents_headers_written is True
-
+
     @pytest.mark.asyncio
     async def test_store_comment(self, excel_store):
         """Test storing comment data"""
@@ -79,13 +103,13 @@ class TestExcelStoreBase:
             "nickname": "Commenter",
             "like_count": 10
         }
-
+
         await excel_store.store_comment(comment_item)
-
+
         # Verify data was written
         assert excel_store.comments_sheet.max_row == 2  # Header + 1 data row
         assert excel_store.comments_headers_written is True
-
+
     @pytest.mark.asyncio
     async def test_store_creator(self, excel_store):
         """Test storing creator data"""
@@ -96,13 +120,13 @@ class TestExcelStoreBase:
             "follows": 500,
             "interaction": 50000
         }
-
+
         await excel_store.store_creator(creator_item)
-
+
         # Verify data was written
         assert excel_store.creators_sheet.max_row == 2  # Header + 1 data row
         assert excel_store.creators_headers_written is True
-
+
     @pytest.mark.asyncio
     async def test_multiple_items(self, excel_store):
         """Test storing multiple items"""
@@ -113,10 +137,10 @@ class TestExcelStoreBase:
                 "title": f"Title {i}",
                 "liked_count": i * 10
             })
-
+
         # Verify all items were stored
         assert excel_store.contents_sheet.max_row == 6  # Header + 5 data rows
-
+
     def test_flush(self, excel_store):
         """Test flushing data to file"""
         # Add some test data
@@ -124,38 +148,38 @@ class TestExcelStoreBase:
             "note_id": "test",
             "title": "Test"
         }))
-
+
         # Flush to file
         excel_store.flush()
-
+
         # Verify file was created
         assert excel_store.filename.exists()
-
+
         # Verify file can be opened
         wb = openpyxl.load_workbook(excel_store.filename)
         assert "Contents" in wb.sheetnames
         wb.close()
-
+
     def test_header_formatting(self, excel_store):
         """Test header row formatting"""
         asyncio.run(excel_store.store_content({"note_id": "test", "title": "Test"}))
-
+
         # Check header formatting
         header_cell = excel_store.contents_sheet.cell(row=1, column=1)
         assert header_cell.font.bold is True
         # RGB color may have different prefix (00 or FF), check the actual color part
         assert header_cell.fill.start_color.rgb[-6:] == "366092"
-
+
     def test_empty_sheets_removed(self, excel_store):
         """Test that empty sheets are removed on flush"""
         # Only add content, leave comments and creators empty
        asyncio.run(excel_store.store_content({"note_id": "test"}))
-
+
         excel_store.flush()
-
+
         # Reload workbook
         wb = openpyxl.load_workbook(excel_store.filename)
-
+
         # Only Contents sheet should exist
         assert "Contents" in wb.sheetnames
         assert "Comments" not in wb.sheetnames
@@ -169,3 +193,83 @@ def test_excel_import_availability():
     assert EXCEL_AVAILABLE is True
     import openpyxl
     assert openpyxl is not None
+
+
+@pytest.mark.skipif(not EXCEL_AVAILABLE, reason="openpyxl not installed")
+class TestSingletonPattern:
+    """Test singleton pattern for Excel store"""
+
+    @pytest.fixture(autouse=True)
+    def setup_and_teardown(self, tmp_path, monkeypatch):
+        """Setup and teardown for each test"""
+        # Change to temp directory
+        monkeypatch.chdir(tmp_path)
+        # Clear singleton instances before each test
+        ExcelStoreBase._instances.clear()
+        yield
+        # Cleanup after test
+        ExcelStoreBase._instances.clear()
+
+    def test_get_instance_returns_same_instance(self):
+        """Test that get_instance returns the same instance for same parameters"""
+        instance1 = ExcelStoreBase.get_instance("xhs", "search")
+        instance2 = ExcelStoreBase.get_instance("xhs", "search")
+
+        assert instance1 is instance2
+
+    def test_get_instance_different_params_returns_different_instances(self):
+        """Test that different parameters return different instances"""
+        instance1 = ExcelStoreBase.get_instance("xhs", "search")
+        instance2 = ExcelStoreBase.get_instance("xhs", "detail")
+        instance3 = ExcelStoreBase.get_instance("douyin", "search")
+
+        assert instance1 is not instance2
+        assert instance1 is not instance3
+        assert instance2 is not instance3
+
+    @pytest.mark.asyncio
+    async def test_singleton_preserves_data(self):
+        """Test that singleton pattern preserves data across multiple calls"""
+        # First call - store some content
+        store1 = ExcelStoreBase.get_instance("test", "search")
+        await store1.store_content({"note_id": "note1", "title": "Title 1"})
+
+        # Second call - should get same instance with data
+        store2 = ExcelStoreBase.get_instance("test", "search")
+        await store2.store_content({"note_id": "note2", "title": "Title 2"})
+
+        # Verify both items are in the same workbook
+        assert store1 is store2
+        assert store1.contents_sheet.max_row == 3  # Header + 2 data rows
+
+    def test_flush_all_saves_all_instances(self, tmp_path):
+        """Test that flush_all saves all instances"""
+        # Create multiple instances
+        store1 = ExcelStoreBase.get_instance("platform1", "search")
+        store2 = ExcelStoreBase.get_instance("platform2", "search")
+
+        # Add data to each
+        asyncio.run(store1.store_content({"note_id": "note1"}))
+        asyncio.run(store2.store_content({"note_id": "note2"}))
+
+        # Flush all
+        ExcelStoreBase.flush_all()
+
+        # Verify instances are cleared
+        assert len(ExcelStoreBase._instances) == 0
+
+        # Verify files were created
+        assert store1.filename.exists()
+        assert store2.filename.exists()
+
+    def test_flush_all_clears_instances(self):
+        """Test that flush_all clears the instances dictionary"""
+        # Create an instance
+        ExcelStoreBase.get_instance("test", "search")
+        assert len(ExcelStoreBase._instances) == 1
+
+        # Flush all
+        ExcelStoreBase.flush_all()
+
+        # Verify instances are cleared
+        assert len(ExcelStoreBase._instances) == 0
diff --git a/uv.lock b/uv.lock index b75e71f..3bdde1b
100644 --- a/uv.lock +++ b/uv.lock @@ -171,9 +171,9 @@ wheels = [ name = "cfgv" version = "3.4.0" source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114 } wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249 }, ] [[package]] @@ -376,6 +376,15 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/5a/18ad964b0086c6e62e2e7500f7edc89e3faa45033c71c1893d34eed2b2de/dnspython-2.8.0-py3-none-any.whl", hash = "sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af", size = 331094 }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 }, +] + [[package]] name = "fastapi" version = "0.110.2" @@ -513,6 +522,15 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, ] +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484 }, +] + [[package]] name = "jieba" version = "0.42.1" @@ -777,6 +795,7 @@ dependencies = [ { name = "matplotlib" }, { name = "motor" }, { name = "opencv-python" }, + { name = "openpyxl" }, { name = "pandas" }, { name = "parsel" }, { name = "pillow" }, @@ -785,6 +804,8 @@ dependencies = [ { name = "pydantic" }, { name = "pyexecjs" }, { name = "pyhumps" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, { name = "python-dotenv" }, { name = "redis" }, { name = "requests" }, @@ -810,6 +831,7 @@ 
requires-dist = [ { name = "matplotlib", specifier = "==3.9.0" }, { name = "motor", specifier = ">=3.3.0" }, { name = "opencv-python", specifier = ">=4.11.0.86" }, + { name = "openpyxl", specifier = ">=3.1.2" }, { name = "pandas", specifier = "==2.2.3" }, { name = "parsel", specifier = "==1.9.1" }, { name = "pillow", specifier = "==9.5.0" }, @@ -818,6 +840,8 @@ requires-dist = [ { name = "pydantic", specifier = "==2.5.2" }, { name = "pyexecjs", specifier = "==1.5.1" }, { name = "pyhumps", specifier = ">=3.8.0" }, + { name = "pytest", specifier = ">=7.4.0" }, + { name = "pytest-asyncio", specifier = ">=0.21.0" }, { name = "python-dotenv", specifier = "==1.0.1" }, { name = "redis", specifier = "~=4.6.0" }, { name = "requests", specifier = "==2.32.3" }, @@ -925,6 +949,18 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044 }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 }, +] + [[package]] name = "packaging" version = "25.0" @@ -1040,6 +1076,15 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/87/0f/c8dcadb2f0dcfdab6052d5ecf57ccf19b439c0adc29fc510ed0830349345/playwright-1.45.0-py3-none-win_amd64.whl", hash = "sha256:701db496928429aec103739e48e3110806bd5cf49456cc95b89f28e1abda71da", size = 29692683 }, ] +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538 }, +] + [[package]] name = "pre-commit" version = "4.4.0" @@ -1234,6 +1279,35 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf", size = 111120 }, ] +[[package]] +name = "pytest" +version = "9.0.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/56/f013048ac4bc4c1d9be45afd4ab209ea62822fb1598f40687e6bf45dcea4/pytest-9.0.1.tar.gz", hash = 
"sha256:3e9c069ea73583e255c3b21cf46b8d3c56f6e3a1a8f6da94ccb0fcf57b9d73c8", size = 1564125 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/8b/6300fb80f858cda1c51ffa17075df5d846757081d11ab4aa35cef9e6258b/pytest-9.0.1-py3-none-any.whl", hash = "sha256:67be0030d194df2dfa7b556f2e56fb3c3315bd5c8822c6951162b92b32ce7dad", size = 373668 }, +] + +[[package]] +name = "pytest-asyncio" +version = "1.3.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "pytest" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075 }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0"