feat: 新增 JSONL 存储格式支持,默认存储格式改为 jsonl

JSONL(JSON Lines)每行一个 JSON 对象,采用 append 模式写入,
无需读取已有数据,大数据量下性能远优于 JSON 格式。

- 新增 AsyncFileWriter.write_to_jsonl() 核心方法
- 7 个平台新增 JsonlStoreImplement 类并注册到工厂
- 配置默认值从 json 改为 jsonl,CLI/API 枚举同步更新
- db_session.py 守卫条件加入 jsonl,避免误触 ValueError
- 词云生成支持读取 JSONL 文件,优先 jsonl 回退 json
- 原有 json 选项完全保留,向后兼容
- 更新相关文档和测试
This commit is contained in:
程序员阿江(Relakkes)
2026-03-03 23:31:07 +08:00
parent 4331b91fe1
commit 0282e626c9
29 changed files with 245 additions and 47 deletions

View File

@@ -53,6 +53,12 @@ class AsyncFileWriter:
await writer.writeheader()
await writer.writerow(item)
async def write_to_jsonl(self, item: Dict, item_type: str):
    """Append a single record to the JSONL file for ``item_type``.

    The record is serialized as one JSON document followed by a newline and
    written in append mode, so existing file content is never read back or
    rewritten.

    Args:
        item: The record to serialize with ``json.dumps``.
        item_type: Passed to ``self._get_file_path`` together with the
            ``'jsonl'`` format tag to resolve the target path.
    """
    target_path = self._get_file_path('jsonl', item_type)
    # Hold the writer lock for the whole open/serialize/write sequence so
    # each record is written as one uninterrupted line.
    async with self.lock, aiofiles.open(target_path, 'a', encoding='utf-8') as handle:
        # ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX escapes.
        serialized = json.dumps(item, ensure_ascii=False)
        await handle.write(serialized + '\n')
async def write_single_item_to_json(self, item: Dict, item_type: str):
file_path = self._get_file_path('json', item_type)
async with self.lock:
@@ -85,22 +91,32 @@ class AsyncFileWriter:
return
try:
# Read comments from JSON file
comments_file_path = self._get_file_path('json', 'comments')
if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
# Read comments from JSON or JSONL file
comments_data = []
jsonl_file_path = self._get_file_path('jsonl', 'comments')
json_file_path = self._get_file_path('json', 'comments')
if os.path.exists(jsonl_file_path) and os.path.getsize(jsonl_file_path) > 0:
async with aiofiles.open(jsonl_file_path, 'r', encoding='utf-8') as f:
async for line in f:
line = line.strip()
if line:
try:
comments_data.append(json.loads(line))
except json.JSONDecodeError:
continue
elif os.path.exists(json_file_path) and os.path.getsize(json_file_path) > 0:
async with aiofiles.open(json_file_path, 'r', encoding='utf-8') as f:
content = await f.read()
if content:
comments_data = json.loads(content)
if not isinstance(comments_data, list):
comments_data = [comments_data]
if not comments_data:
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments data found")
return
async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
content = await f.read()
if not content:
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
return
comments_data = json.loads(content)
if not isinstance(comments_data, list):
comments_data = [comments_data]
# Filter comments data to only include 'content' field
# Handle different comment data structures across platforms
filtered_data = []