Files
MediaCrawler/store/zhihu/_store_impl.py
程序员阿江(Relakkes) 0282e626c9 feat: 新增 JSONL 存储格式支持,默认存储格式改为 jsonl
JSONL(JSON Lines)每行一个 JSON 对象,采用 append 模式写入,
无需读取已有数据,大数据量下性能远优于 JSON 格式。

- 新增 AsyncFileWriter.write_to_jsonl() 核心方法
- 7 个平台新增 JsonlStoreImplement 类并注册到工厂
- 配置默认值从 json 改为 jsonl,CLI/API 枚举同步更新
- db_session.py 守卫条件加入 jsonl,避免误触 ValueError
- 词云生成支持读取 JSONL 文件,优先 jsonl 回退 json
- 原有 json 选项完全保留,向后兼容
- 更新相关文档和测试
2026-03-03 23:31:07 +08:00

295 lines
9.6 KiB
Python

# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/store/zhihu/_store_impl.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : persist1@126.com
# @Time : 2025/9/5 19:34
# @Desc : Zhihu storage implementation class
import asyncio
import csv
import json
import os
import pathlib
from typing import Dict
import aiofiles
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
import config
from base.base_crawler import AbstractStore
from database.db_session import get_session
from database.models import ZhihuContent, ZhihuComment, ZhihuCreator
from tools import utils, words
from var import crawler_type_var
from tools.async_file_writer import AsyncFileWriter
from database.mongodb_store_base import MongoDBStoreBase
def calculate_number_of_files(file_store_path: str) -> int:
"""Calculate the prefix sorting number for data save files, supporting writing to different files for each run
Args:
file_store_path;
Returns:
file nums
"""
if not os.path.exists(file_store_path):
return 1
try:
return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
except ValueError:
return 1
class ZhihuCsvStoreImplement(AbstractStore):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.writer = AsyncFileWriter(platform="zhihu", crawler_type=crawler_type_var.get())
async def store_content(self, content_item: Dict):
"""
Zhihu content CSV storage implementation
Args:
content_item: note item dict
Returns:
"""
await self.writer.write_to_csv(item_type="contents", item=content_item)
async def store_comment(self, comment_item: Dict):
"""
Zhihu comment CSV storage implementation
Args:
comment_item: comment item dict
Returns:
"""
await self.writer.write_to_csv(item_type="comments", item=comment_item)
async def store_creator(self, creator: Dict):
"""
Zhihu content CSV storage implementation
Args:
creator: creator dict
Returns:
"""
await self.writer.write_to_csv(item_type="creators", item=creator)
class ZhihuDbStoreImplement(AbstractStore):
async def store_content(self, content_item: Dict):
"""
Zhihu content DB storage implementation
Args:
content_item: content item dict
"""
content_id = content_item.get("content_id")
async with get_session() as session:
stmt = select(ZhihuContent).where(ZhihuContent.content_id == content_id)
result = await session.execute(stmt)
existing_content = result.scalars().first()
if existing_content:
for key, value in content_item.items():
if hasattr(existing_content, key):
setattr(existing_content, key, value)
else:
if "add_ts" not in content_item:
content_item["add_ts"] = utils.get_current_timestamp()
new_content = ZhihuContent(**content_item)
session.add(new_content)
await session.commit()
async def store_comment(self, comment_item: Dict):
"""
Zhihu content DB storage implementation
Args:
comment_item: comment item dict
"""
comment_id = comment_item.get("comment_id")
async with get_session() as session:
stmt = select(ZhihuComment).where(ZhihuComment.comment_id == comment_id)
result = await session.execute(stmt)
existing_comment = result.scalars().first()
if existing_comment:
for key, value in comment_item.items():
if hasattr(existing_comment, key):
setattr(existing_comment, key, value)
else:
if "add_ts" not in comment_item:
comment_item["add_ts"] = utils.get_current_timestamp()
new_comment = ZhihuComment(**comment_item)
session.add(new_comment)
await session.commit()
async def store_creator(self, creator: Dict):
"""
Zhihu content DB storage implementation
Args:
creator: creator dict
"""
user_id = creator.get("user_id")
async with get_session() as session:
stmt = select(ZhihuCreator).where(ZhihuCreator.user_id == user_id)
result = await session.execute(stmt)
existing_creator = result.scalars().first()
if existing_creator:
for key, value in creator.items():
if hasattr(existing_creator, key):
setattr(existing_creator, key, value)
else:
if "add_ts" not in creator:
creator["add_ts"] = utils.get_current_timestamp()
new_creator = ZhihuCreator(**creator)
session.add(new_creator)
await session.commit()
class ZhihuJsonStoreImplement(AbstractStore):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.writer = AsyncFileWriter(platform="zhihu", crawler_type=crawler_type_var.get())
async def store_content(self, content_item: Dict):
"""
content JSON storage implementation
Args:
content_item:
Returns:
"""
await self.writer.write_single_item_to_json(item_type="contents", item=content_item)
async def store_comment(self, comment_item: Dict):
"""
comment JSON storage implementation
Args:
comment_item:
Returns:
"""
await self.writer.write_single_item_to_json(item_type="comments", item=comment_item)
async def store_creator(self, creator: Dict):
"""
Zhihu content JSON storage implementation
Args:
creator: creator dict
Returns:
"""
await self.writer.write_single_item_to_json(item_type="creators", item=creator)
class ZhihuJsonlStoreImplement(AbstractStore):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.writer = AsyncFileWriter(platform="zhihu", crawler_type=crawler_type_var.get())
async def store_content(self, content_item: Dict):
await self.writer.write_to_jsonl(item_type="contents", item=content_item)
async def store_comment(self, comment_item: Dict):
await self.writer.write_to_jsonl(item_type="comments", item=comment_item)
async def store_creator(self, creator: Dict):
await self.writer.write_to_jsonl(item_type="creators", item=creator)
class ZhihuSqliteStoreImplement(ZhihuDbStoreImplement):
"""
Zhihu content SQLite storage implementation
"""
pass
class ZhihuMongoStoreImplement(AbstractStore):
"""Zhihu MongoDB storage implementation"""
def __init__(self):
self.mongo_store = MongoDBStoreBase(collection_prefix="zhihu")
async def store_content(self, content_item: Dict):
"""
Store content to MongoDB
Args:
content_item: Content data
"""
note_id = content_item.get("note_id")
if not note_id:
return
await self.mongo_store.save_or_update(
collection_suffix="contents",
query={"note_id": note_id},
data=content_item
)
utils.logger.info(f"[ZhihuMongoStoreImplement.store_content] Saved note {note_id} to MongoDB")
async def store_comment(self, comment_item: Dict):
"""
Store comment to MongoDB
Args:
comment_item: Comment data
"""
comment_id = comment_item.get("comment_id")
if not comment_id:
return
await self.mongo_store.save_or_update(
collection_suffix="comments",
query={"comment_id": comment_id},
data=comment_item
)
utils.logger.info(f"[ZhihuMongoStoreImplement.store_comment] Saved comment {comment_id} to MongoDB")
async def store_creator(self, creator_item: Dict):
"""
Store creator information to MongoDB
Args:
creator_item: Creator data
"""
user_id = creator_item.get("user_id")
if not user_id:
return
await self.mongo_store.save_or_update(
collection_suffix="creators",
query={"user_id": user_id},
data=creator_item
)
utils.logger.info(f"[ZhihuMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")
class ZhihuExcelStoreImplement:
"""Zhihu Excel storage implementation - Global singleton"""
def __new__(cls, *args, **kwargs):
from store.excel_store_base import ExcelStoreBase
return ExcelStoreBase.get_instance(
platform="zhihu",
crawler_type=crawler_type_var.get()
)