feat: 新增 JSONL 存储格式支持,默认存储格式改为 jsonl

JSONL(JSON Lines)每行一个 JSON 对象,采用 append 模式写入,
无需读取已有数据,大数据量下性能远优于 JSON 格式。

- 新增 AsyncFileWriter.write_to_jsonl() 核心方法
- 7 个平台新增 JsonlStoreImplement 类并注册到工厂
- 配置默认值从 json 改为 jsonl,CLI/API 枚举同步更新
- db_session.py 守卫条件加入 jsonl,避免误触 ValueError
- 词云生成支持读取 JSONL 文件,优先 jsonl 回退 json
- 原有 json 选项完全保留,向后兼容
- 更新相关文档和测试
This commit is contained in:
程序员阿江(Relakkes)
2026-03-03 23:31:07 +08:00
parent 4331b91fe1
commit 0282e626c9
29 changed files with 245 additions and 47 deletions

View File

@@ -133,7 +133,7 @@ class BiliDbStoreImplement(AbstractStore):
content_item["user_id"] = int(content_item.get("user_id", 0) or 0)
content_item["liked_count"] = int(content_item.get("liked_count", 0) or 0)
content_item["create_time"] = int(content_item.get("create_time", 0) or 0)
async with get_session() as session:
result = await session.execute(select(BilibiliVideo).where(BilibiliVideo.video_id == video_id))
video_detail = result.scalar_one_or_none()
@@ -162,7 +162,7 @@ class BiliDbStoreImplement(AbstractStore):
comment_item["like_count"] = str(comment_item.get("like_count", "0"))
comment_item["sub_comment_count"] = str(comment_item.get("sub_comment_count", "0"))
comment_item["parent_comment_id"] = str(comment_item.get("parent_comment_id", "0"))
async with get_session() as session:
result = await session.execute(select(BilibiliVideoComment).where(BilibiliVideoComment.comment_id == comment_id))
comment_detail = result.scalar_one_or_none()
@@ -242,7 +242,7 @@ class BiliDbStoreImplement(AbstractStore):
"""
dynamic_id = int(dynamic_item.get("dynamic_id"))
dynamic_item["dynamic_id"] = dynamic_id
async with get_session() as session:
result = await session.execute(select(BilibiliUpDynamic).where(BilibiliUpDynamic.dynamic_id == dynamic_id))
dynamic_detail = result.scalar_one_or_none()
@@ -338,6 +338,44 @@ class BiliJsonStoreImplement(AbstractStore):
class BiliJsonlStoreImplement(AbstractStore):
    """Bilibili store that persists every record through the JSONL writer.

    Each ``store_*`` method forwards its dict to
    ``AsyncFileWriter.write_to_jsonl`` under a fixed ``item_type`` label.
    Per the accompanying change description, JSONL output is one JSON
    object per line written in append mode.
    """

    def __init__(self):
        # The crawler type is read once, at construction time, from the
        # crawler_type_var context holder.
        current_crawler_type = crawler_type_var.get()
        self.file_writer = AsyncFileWriter(
            crawler_type=current_crawler_type,
            platform="bili",
        )

    async def _write(self, item_type, item):
        """Forward one record to the JSONL writer under ``item_type``."""
        await self.file_writer.write_to_jsonl(item=item, item_type=item_type)

    async def store_content(self, content_item: Dict):
        """Write a content record with item type ``"contents"``."""
        await self._write("contents", content_item)

    async def store_comment(self, comment_item: Dict):
        """Write a comment record with item type ``"comments"``."""
        await self._write("comments", comment_item)

    async def store_creator(self, creator: Dict):
        """Write a creator record with item type ``"creators"``."""
        await self._write("creators", creator)

    async def store_contact(self, contact_item: Dict):
        """Write a contact record with item type ``"contacts"``."""
        await self._write("contacts", contact_item)

    async def store_dynamic(self, dynamic_item: Dict):
        """Write a dynamic record with item type ``"dynamics"``."""
        await self._write("dynamics", dynamic_item)
class BiliSqliteStoreImplement(BiliDbStoreImplement):
    """Store variant that inherits everything from BiliDbStoreImplement.

    Nothing is overridden here; the class only provides a distinct name.
    NOTE(review): presumably registered for the SQLite backend in the store
    factory -- confirm against the factory mapping.
    """