mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-06 01:47:26 +08:00
feat: excel store with other platform
This commit is contained in:
@@ -38,13 +38,14 @@ class BiliStoreFactory:
|
||||
"json": BiliJsonStoreImplement,
|
||||
"sqlite": BiliSqliteStoreImplement,
|
||||
"mongodb": BiliMongoStoreImplement,
|
||||
"excel": BiliExcelStoreImplement,
|
||||
}
|
||||
|
||||
@staticmethod
def create_store() -> AbstractStore:
    """
    Build the Bilibili store selected by ``config.SAVE_DATA_OPTION``.

    Returns:
        A new object created by calling the class registered in ``BiliStoreFactory.STORES``

    Raises:
        ValueError: If no store class is registered for the configured option
    """
    store_class = BiliStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
    if not store_class:
        # Only one raise is kept: the older message (without "excel") used to
        # come first and made the updated message unreachable.
        raise ValueError("[BiliStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...")
    return store_class()
|
||||
|
||||
|
||||
|
||||
@@ -365,3 +365,14 @@ class BiliMongoStoreImplement(AbstractStore):
|
||||
data=creator_item
|
||||
)
|
||||
utils.logger.info(f"[BiliMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
|
||||
|
||||
|
||||
class BiliExcelStoreImplement:
    """Bilibili Excel store entry point - hands out the shared ExcelStoreBase instance."""

    def __new__(cls, *args, **kwargs):
        # Imported at call time rather than at module import time.
        from store.excel_store_base import ExcelStoreBase

        # The object returned comes from ExcelStoreBase.get_instance, so
        # "constructing" this class yields the per-(platform, crawler type) store.
        active_crawler_type = crawler_type_var.get()
        return ExcelStoreBase.get_instance(platform="bilibili", crawler_type=active_crawler_type)
|
||||
|
||||
@@ -37,13 +37,14 @@ class DouyinStoreFactory:
|
||||
"json": DouyinJsonStoreImplement,
|
||||
"sqlite": DouyinSqliteStoreImplement,
|
||||
"mongodb": DouyinMongoStoreImplement,
|
||||
"excel": DouyinExcelStoreImplement,
|
||||
}
|
||||
|
||||
@staticmethod
def create_store() -> AbstractStore:
    """
    Build the Douyin store selected by ``config.SAVE_DATA_OPTION``.

    Returns:
        A new object created by calling the class registered in ``DouyinStoreFactory.STORES``

    Raises:
        ValueError: If no store class is registered for the configured option
    """
    store_class = DouyinStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
    if not store_class:
        # Only one raise is kept: the older message (without "excel") used to
        # come first and made the updated message unreachable.
        raise ValueError("[DouyinStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...")
    return store_class()
|
||||
|
||||
|
||||
|
||||
@@ -264,3 +264,14 @@ class DouyinMongoStoreImplement(AbstractStore):
|
||||
data=creator_item
|
||||
)
|
||||
utils.logger.info(f"[DouyinMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
|
||||
|
||||
|
||||
class DouyinExcelStoreImplement:
    """Douyin Excel store entry point - hands out the shared ExcelStoreBase instance."""

    def __new__(cls, *args, **kwargs):
        # Imported at call time rather than at module import time.
        from store.excel_store_base import ExcelStoreBase

        # The object returned comes from ExcelStoreBase.get_instance, so
        # "constructing" this class yields the per-(platform, crawler type) store.
        active_crawler_type = crawler_type_var.get()
        return ExcelStoreBase.get_instance(platform="douyin", crawler_type=active_crawler_type)
|
||||
|
||||
@@ -2,10 +2,20 @@
|
||||
# Copyright (c) 2025 relakkes@gmail.com
|
||||
#
|
||||
# This file is part of MediaCrawler project.
|
||||
# Repository: https://github.com/NanmiCoder/MediaCrawler
|
||||
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/store/excel_store_base.py
|
||||
# GitHub: https://github.com/NanmiCoder
|
||||
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||
#
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
@@ -21,7 +31,7 @@ Excel Store Base Implementation
|
||||
Provides Excel export functionality for crawled data with formatted sheets
|
||||
"""
|
||||
|
||||
import os
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Any
|
||||
from pathlib import Path
|
||||
@@ -42,12 +52,50 @@ class ExcelStoreBase(AbstractStore):
|
||||
"""
|
||||
Base class for Excel storage implementation
|
||||
Provides formatted Excel export with multiple sheets for contents, comments, and creators
|
||||
Uses singleton pattern to maintain state across multiple store calls
|
||||
"""
|
||||
|
||||
|
||||
# Class-level singleton management
|
||||
_instances: Dict[str, "ExcelStoreBase"] = {}
|
||||
_lock = threading.Lock()
|
||||
|
||||
@classmethod
def get_instance(cls, platform: str, crawler_type: str) -> "ExcelStoreBase":
    """
    Return the cached store for a platform / crawler type pair, creating it on first use.

    Instances are cached in ``cls._instances`` under the key
    ``"{platform}_{crawler_type}"``; lookup and creation happen while holding ``cls._lock``.

    Args:
        platform: Platform name (xhs, dy, ks, etc.)
        crawler_type: Type of crawler (search, detail, creator)

    Returns:
        ExcelStoreBase instance
    """
    cache_key = f"{platform}_{crawler_type}"
    with cls._lock:
        cached = cls._instances.get(cache_key)
        if cached is None:
            # First request for this pair: build the store and remember it.
            cached = cls(platform, crawler_type)
            cls._instances[cache_key] = cached
        return cached
|
||||
|
||||
@classmethod
def flush_all(cls):
    """
    Flush every cached Excel store, then empty the cache.

    Each instance's ``flush()`` is attempted independently: a failure is logged
    and does not stop the remaining instances from being flushed. The whole
    operation runs while holding ``cls._lock``.

    Should be called at the end of crawler execution
    """
    with cls._lock:
        for name in cls._instances:
            store = cls._instances[name]
            try:
                store.flush()
                utils.logger.info(f"[ExcelStoreBase] Flushed instance: {name}")
            except Exception as err:
                utils.logger.error(f"[ExcelStoreBase] Error flushing {name}: {err}")
        # Cached instances are dropped whether or not their flush succeeded.
        cls._instances.clear()
|
||||
|
||||
def __init__(self, platform: str, crawler_type: str = "search"):
|
||||
"""
|
||||
Initialize Excel store
|
||||
|
||||
|
||||
Args:
|
||||
platform: Platform name (xhs, dy, ks, etc.)
|
||||
crawler_type: Type of crawler (search, detail, creator)
|
||||
@@ -57,39 +105,45 @@ class ExcelStoreBase(AbstractStore):
|
||||
"openpyxl is required for Excel export. "
|
||||
"Install it with: pip install openpyxl"
|
||||
)
|
||||
|
||||
|
||||
super().__init__()
|
||||
self.platform = platform
|
||||
self.crawler_type = crawler_type
|
||||
|
||||
|
||||
# Create data directory
|
||||
self.data_dir = Path("data") / platform
|
||||
self.data_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# Initialize workbook
|
||||
self.workbook = openpyxl.Workbook()
|
||||
self.workbook.remove(self.workbook.active) # Remove default sheet
|
||||
|
||||
|
||||
# Create sheets
|
||||
self.contents_sheet = self.workbook.create_sheet("Contents")
|
||||
self.comments_sheet = self.workbook.create_sheet("Comments")
|
||||
self.creators_sheet = self.workbook.create_sheet("Creators")
|
||||
|
||||
|
||||
# Track if headers are written
|
||||
self.contents_headers_written = False
|
||||
self.comments_headers_written = False
|
||||
self.creators_headers_written = False
|
||||
|
||||
self.contacts_headers_written = False
|
||||
self.dynamics_headers_written = False
|
||||
|
||||
# Optional sheets for platforms that need them (e.g., Bilibili)
|
||||
self.contacts_sheet = None
|
||||
self.dynamics_sheet = None
|
||||
|
||||
# Generate filename
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
self.filename = self.data_dir / f"{platform}_{crawler_type}_{timestamp}.xlsx"
|
||||
|
||||
|
||||
utils.logger.info(f"[ExcelStoreBase] Initialized Excel export to: {self.filename}")
|
||||
|
||||
|
||||
def _apply_header_style(self, sheet, row_num: int = 1):
|
||||
"""
|
||||
Apply formatting to header row
|
||||
|
||||
|
||||
Args:
|
||||
sheet: Worksheet object
|
||||
row_num: Row number for headers (default: 1)
|
||||
@@ -103,70 +157,70 @@ class ExcelStoreBase(AbstractStore):
|
||||
top=Side(style='thin'),
|
||||
bottom=Side(style='thin')
|
||||
)
|
||||
|
||||
|
||||
for cell in sheet[row_num]:
|
||||
cell.fill = header_fill
|
||||
cell.font = header_font
|
||||
cell.alignment = header_alignment
|
||||
cell.border = border
|
||||
|
||||
|
||||
def _auto_adjust_column_width(self, sheet):
    """
    Auto-adjust column widths based on content

    For every column, the width is set to the longest ``str(cell.value)`` plus
    2, clamped to the range 10..50.

    Args:
        sheet: Worksheet object
    """
    for column in sheet.columns:
        max_length = 0
        column_letter = get_column_letter(column[0].column)

        for cell in column:
            try:
                if cell.value:
                    max_length = max(max_length, len(str(cell.value)))
            except (TypeError, AttributeError):
                # Only the narrow handler is kept; a bare ``except:`` ahead of it
                # is invalid ordering and would hide unrelated errors.
                continue

        # Set width with min/max constraints
        adjusted_width = min(max(max_length + 2, 10), 50)
        sheet.column_dimensions[column_letter].width = adjusted_width
|
||||
|
||||
|
||||
def _write_headers(self, sheet, headers: List[str]):
|
||||
"""
|
||||
Write headers to sheet
|
||||
|
||||
|
||||
Args:
|
||||
sheet: Worksheet object
|
||||
headers: List of header names
|
||||
"""
|
||||
for col_num, header in enumerate(headers, 1):
|
||||
sheet.cell(row=1, column=col_num, value=header)
|
||||
|
||||
|
||||
self._apply_header_style(sheet)
|
||||
|
||||
|
||||
def _write_row(self, sheet, data: Dict[str, Any], headers: List[str]):
|
||||
"""
|
||||
Write data row to sheet
|
||||
|
||||
|
||||
Args:
|
||||
sheet: Worksheet object
|
||||
data: Data dictionary
|
||||
headers: List of header names (defines column order)
|
||||
"""
|
||||
row_num = sheet.max_row + 1
|
||||
|
||||
|
||||
for col_num, header in enumerate(headers, 1):
|
||||
value = data.get(header, "")
|
||||
|
||||
|
||||
# Handle different data types
|
||||
if isinstance(value, (list, dict)):
|
||||
value = str(value)
|
||||
elif value is None:
|
||||
value = ""
|
||||
|
||||
|
||||
cell = sheet.cell(row=row_num, column=col_num, value=value)
|
||||
|
||||
|
||||
# Apply basic formatting
|
||||
cell.alignment = Alignment(vertical="top", wrap_text=True)
|
||||
cell.border = Border(
|
||||
@@ -175,89 +229,152 @@ class ExcelStoreBase(AbstractStore):
|
||||
top=Side(style='thin'),
|
||||
bottom=Side(style='thin')
|
||||
)
|
||||
|
||||
|
||||
async def store_content(self, content_item: Dict):
    """
    Store content data to Excel

    Appends one row to the "Contents" sheet, writing the header row first if it
    has not been written yet. Nothing is written to disk here.

    Args:
        content_item: Content data dictionary
    """
    # Define headers (customize based on platform)
    # NOTE(review): headers are recomputed from every item, while the header row
    # comes from the first item only - items with different keys or key order
    # would not line up with it. Confirm callers always pass the same shape.
    headers = list(content_item.keys())

    # Write headers if first time
    if not self.contents_headers_written:
        self._write_headers(self.contents_sheet, headers)
        self.contents_headers_written = True

    # Write data row
    self._write_row(self.contents_sheet, content_item, headers)

    # Get ID from various possible field names; log once per stored item
    # (the older note_id-only log line is dropped so items are not logged twice).
    content_id = (
        content_item.get('note_id')
        or content_item.get('aweme_id')
        or content_item.get('video_id')
        or content_item.get('content_id')
        or 'N/A'
    )
    utils.logger.info(f"[ExcelStoreBase] Stored content to Excel: {content_id}")
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
    """
    Append one comment as a row of the "Comments" sheet.

    The header row is written from this item's keys the first time the method
    runs on the instance. Nothing is written to disk here.

    Args:
        comment_item: Comment data dictionary
    """
    sheet = self.comments_sheet
    columns = list(comment_item)

    # Header row goes in exactly once per store instance.
    if not self.comments_headers_written:
        self._write_headers(sheet, columns)
        self.comments_headers_written = True

    self._write_row(sheet, comment_item, columns)

    utils.logger.info(f"[ExcelStoreBase] Stored comment to Excel: {comment_item.get('comment_id', 'N/A')}")
|
||||
|
||||
async def store_creator(self, creator: Dict):
    """
    Store creator data to Excel

    Appends one row to the "Creators" sheet, writing the header row first if it
    has not been written yet. Nothing is written to disk here.

    Args:
        creator: Creator data dictionary
    """
    # Define headers
    headers = list(creator.keys())

    # Write headers if first time
    if not self.creators_headers_written:
        self._write_headers(self.creators_sheet, headers)
        self.creators_headers_written = True

    # Write data row (once - the stale ``creator_item`` variant is removed)
    self._write_row(self.creators_sheet, creator, headers)

    utils.logger.info(f"[ExcelStoreBase] Stored creator to Excel: {creator.get('user_id', 'N/A')}")
|
||||
|
||||
async def store_contact(self, contact_item: Dict):
    """
    Append one contact record as a row of the "Contacts" sheet (for platforms like Bilibili).

    The sheet is created in the workbook on first use, and the header row is
    written from this item's keys the first time the method runs.

    Args:
        contact_item: Contact data dictionary
    """
    # Lazily add the optional sheet to the workbook.
    sheet = self.contacts_sheet
    if sheet is None:
        sheet = self.contacts_sheet = self.workbook.create_sheet("Contacts")

    columns = list(contact_item)

    # Header row goes in exactly once per store instance.
    if not self.contacts_headers_written:
        self._write_headers(sheet, columns)
        self.contacts_headers_written = True

    self._write_row(sheet, contact_item, columns)

    utils.logger.info(f"[ExcelStoreBase] Stored contact to Excel: up_id={contact_item.get('up_id', 'N/A')}, fan_id={contact_item.get('fan_id', 'N/A')}")
|
||||
|
||||
async def store_dynamic(self, dynamic_item: Dict):
    """
    Append one dynamic record as a row of the "Dynamics" sheet (for platforms like Bilibili).

    The sheet is created in the workbook on first use, and the header row is
    written from this item's keys the first time the method runs.

    Args:
        dynamic_item: Dynamic data dictionary
    """
    # Lazily add the optional sheet to the workbook.
    sheet = self.dynamics_sheet
    if sheet is None:
        sheet = self.dynamics_sheet = self.workbook.create_sheet("Dynamics")

    columns = list(dynamic_item)

    # Header row goes in exactly once per store instance.
    if not self.dynamics_headers_written:
        self._write_headers(sheet, columns)
        self.dynamics_headers_written = True

    self._write_row(sheet, dynamic_item, columns)

    utils.logger.info(f"[ExcelStoreBase] Stored dynamic to Excel: {dynamic_item.get('dynamic_id', 'N/A')}")
|
||||
|
||||
def flush(self):
    """
    Finalize the workbook and save it to ``self.filename``.

    Column widths are adjusted on every sheet, sheets whose ``max_row`` is 1
    are removed from the workbook, and the file is saved only if at least one
    sheet remains. Any exception is logged and re-raised.
    """
    try:
        # Contacts / Dynamics exist only if the matching store_* method was used.
        optional_sheets = [
            extra for extra in (self.contacts_sheet, self.dynamics_sheet) if extra is not None
        ]
        all_sheets = [
            self.contents_sheet,
            self.comments_sheet,
            self.creators_sheet,
            *optional_sheets,
        ]

        # Pass 1: size the columns of every sheet.
        for sheet in all_sheets:
            self._auto_adjust_column_width(sheet)

        # Pass 2: drop sheets with nothing beyond the first row.
        for sheet in all_sheets:
            if sheet.max_row == 1:
                self.workbook.remove(sheet)

        # Nothing left to save: skip creating the file.
        if not self.workbook.sheetnames:
            utils.logger.info(f"[ExcelStoreBase] No data to save, skipping file creation: {self.filename}")
            return

        self.workbook.save(self.filename)
        utils.logger.info(f"[ExcelStoreBase] Excel file saved successfully: {self.filename}")

    except Exception as e:
        utils.logger.error(f"[ExcelStoreBase] Error saving Excel file: {e}")
        raise
|
||||
|
||||
@@ -37,6 +37,7 @@ class KuaishouStoreFactory:
|
||||
"json": KuaishouJsonStoreImplement,
|
||||
"sqlite": KuaishouSqliteStoreImplement,
|
||||
"mongodb": KuaishouMongoStoreImplement,
|
||||
"excel": KuaishouExcelStoreImplement,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
@@ -44,7 +45,7 @@ class KuaishouStoreFactory:
|
||||
store_class = KuaishouStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
|
||||
if not store_class:
|
||||
raise ValueError(
|
||||
"[KuaishouStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
|
||||
"[KuaishouStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...")
|
||||
return store_class()
|
||||
|
||||
|
||||
|
||||
@@ -226,3 +226,14 @@ class KuaishouMongoStoreImplement(AbstractStore):
|
||||
data=creator_item
|
||||
)
|
||||
utils.logger.info(f"[KuaishouMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
|
||||
|
||||
|
||||
class KuaishouExcelStoreImplement:
    """Kuaishou Excel store entry point - hands out the shared ExcelStoreBase instance."""

    def __new__(cls, *args, **kwargs):
        # Imported at call time rather than at module import time.
        from store.excel_store_base import ExcelStoreBase

        # The object returned comes from ExcelStoreBase.get_instance, so
        # "constructing" this class yields the per-(platform, crawler type) store.
        active_crawler_type = crawler_type_var.get()
        return ExcelStoreBase.get_instance(platform="kuaishou", crawler_type=active_crawler_type)
|
||||
|
||||
@@ -34,6 +34,7 @@ class TieBaStoreFactory:
|
||||
"json": TieBaJsonStoreImplement,
|
||||
"sqlite": TieBaSqliteStoreImplement,
|
||||
"mongodb": TieBaMongoStoreImplement,
|
||||
"excel": TieBaExcelStoreImplement,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
@@ -41,7 +42,7 @@ class TieBaStoreFactory:
|
||||
store_class = TieBaStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
|
||||
if not store_class:
|
||||
raise ValueError(
|
||||
"[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
|
||||
"[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...")
|
||||
return store_class()
|
||||
|
||||
|
||||
|
||||
@@ -258,3 +258,14 @@ class TieBaMongoStoreImplement(AbstractStore):
|
||||
data=creator_item
|
||||
)
|
||||
utils.logger.info(f"[TieBaMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
|
||||
|
||||
|
||||
class TieBaExcelStoreImplement:
    """Tieba Excel store entry point - hands out the shared ExcelStoreBase instance."""

    def __new__(cls, *args, **kwargs):
        # Imported at call time rather than at module import time.
        from store.excel_store_base import ExcelStoreBase

        # The object returned comes from ExcelStoreBase.get_instance, so
        # "constructing" this class yields the per-(platform, crawler type) store.
        active_crawler_type = crawler_type_var.get()
        return ExcelStoreBase.get_instance(platform="tieba", crawler_type=active_crawler_type)
|
||||
|
||||
@@ -38,13 +38,14 @@ class WeibostoreFactory:
|
||||
"json": WeiboJsonStoreImplement,
|
||||
"sqlite": WeiboSqliteStoreImplement,
|
||||
"mongodb": WeiboMongoStoreImplement,
|
||||
"excel": WeiboExcelStoreImplement,
|
||||
}
|
||||
|
||||
@staticmethod
def create_store() -> AbstractStore:
    """
    Build the Weibo store selected by ``config.SAVE_DATA_OPTION``.

    Returns:
        A new object created by calling the class registered in ``WeibostoreFactory.STORES``

    Raises:
        ValueError: If no store class is registered for the configured option
    """
    store_class = WeibostoreFactory.STORES.get(config.SAVE_DATA_OPTION)
    if not store_class:
        # Only one raise is kept (the older message without "excel" made the
        # updated one unreachable); the prefix now spells the class name correctly.
        raise ValueError("[WeibostoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...")
    return store_class()
|
||||
|
||||
|
||||
|
||||
@@ -280,3 +280,14 @@ class WeiboMongoStoreImplement(AbstractStore):
|
||||
data=creator_item
|
||||
)
|
||||
utils.logger.info(f"[WeiboMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
|
||||
|
||||
|
||||
class WeiboExcelStoreImplement:
    """Weibo Excel store entry point - hands out the shared ExcelStoreBase instance."""

    def __new__(cls, *args, **kwargs):
        # Imported at call time rather than at module import time.
        from store.excel_store_base import ExcelStoreBase

        # The object returned comes from ExcelStoreBase.get_instance, so
        # "constructing" this class yields the per-(platform, crawler type) store.
        active_crawler_type = crawler_type_var.get()
        return ExcelStoreBase.get_instance(platform="weibo", crawler_type=active_crawler_type)
|
||||
|
||||
@@ -339,9 +339,12 @@ class XhsMongoStoreImplement(AbstractStore):
|
||||
utils.logger.info(f"[XhsMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
|
||||
|
||||
|
||||
class XhsExcelStoreImplement:
    """Xiaohongshu Excel store entry point - hands out the shared ExcelStoreBase instance."""

    # The earlier subclass-of-ExcelStoreBase declaration and its __init__ are
    # removed: __new__ returns the object produced by ExcelStoreBase.get_instance,
    # which is not an instance of this class, so that __init__ never ran.
    def __new__(cls, *args, **kwargs):
        # Imported at call time rather than at module import time.
        from store.excel_store_base import ExcelStoreBase

        return ExcelStoreBase.get_instance(
            platform="xhs",
            crawler_type=crawler_type_var.get()
        )
|
||||
|
||||
@@ -28,7 +28,8 @@ from ._store_impl import (ZhihuCsvStoreImplement,
|
||||
ZhihuDbStoreImplement,
|
||||
ZhihuJsonStoreImplement,
|
||||
ZhihuSqliteStoreImplement,
|
||||
ZhihuMongoStoreImplement)
|
||||
ZhihuMongoStoreImplement,
|
||||
ZhihuExcelStoreImplement)
|
||||
from tools import utils
|
||||
from var import source_keyword_var
|
||||
|
||||
@@ -40,13 +41,14 @@ class ZhihuStoreFactory:
|
||||
"json": ZhihuJsonStoreImplement,
|
||||
"sqlite": ZhihuSqliteStoreImplement,
|
||||
"mongodb": ZhihuMongoStoreImplement,
|
||||
"excel": ZhihuExcelStoreImplement,
|
||||
}
|
||||
|
||||
@staticmethod
def create_store() -> AbstractStore:
    """
    Build the Zhihu store selected by ``config.SAVE_DATA_OPTION``.

    Returns:
        A new object created by calling the class registered in ``ZhihuStoreFactory.STORES``

    Raises:
        ValueError: If no store class is registered for the configured option
    """
    store_class = ZhihuStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
    if not store_class:
        # Only one raise is kept: the older message (without "excel") used to
        # come first and made the updated message unreachable.
        raise ValueError("[ZhihuStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb or excel ...")
    return store_class()
|
||||
|
||||
async def batch_update_zhihu_contents(contents: List[ZhihuContent]):
|
||||
|
||||
@@ -257,3 +257,14 @@ class ZhihuMongoStoreImplement(AbstractStore):
|
||||
data=creator_item
|
||||
)
|
||||
utils.logger.info(f"[ZhihuMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
|
||||
|
||||
|
||||
class ZhihuExcelStoreImplement:
    """Zhihu Excel store entry point - hands out the shared ExcelStoreBase instance."""

    def __new__(cls, *args, **kwargs):
        # Imported at call time rather than at module import time.
        from store.excel_store_base import ExcelStoreBase

        # The object returned comes from ExcelStoreBase.get_instance, so
        # "constructing" this class yields the per-(platform, crawler type) store.
        active_crawler_type = crawler_type_var.get()
        return ExcelStoreBase.get_instance(platform="zhihu", crawler_type=active_crawler_type)
|
||||
|
||||
Reference in New Issue
Block a user