feat: 新增 JSONL 存储格式支持，默认存储格式改为 jsonl

JSONL（JSON Lines）每行一个 JSON 对象，采用 append 模式写入，无需读取已有数据，大数据量下性能远优于 JSON 格式。

- 新增 AsyncFileWriter.write_to_jsonl() 核心方法
- 7 个平台新增 JsonlStoreImplement 类并注册到工厂
- 配置默认值从 json 改为 jsonl，CLI/API 枚举同步更新
- db_session.py 守卫条件加入 jsonl，避免误触 ValueError
- 词云生成支持读取 JSONL 文件：优先读取 jsonl，文件不存在或为空时回退到 json
- 原有 json 选项完全保留，向后兼容
- 更新相关文档和测试
2026-06-06 01:47:26 +08:00 · 2026-03-03 23:31:07 +08:00
parent 4331b91fe1
commit 0282e626c9
29 changed files with 245 additions and 47 deletions
--- a/tools/async_file_writer.py
+++ b/tools/async_file_writer.py
@@ -53,6 +53,12 @@ class AsyncFileWriter:
                    await writer.writeheader()
                await writer.writerow(item)

+    async def write_to_jsonl(self, item: Dict, item_type: str):  # Append `item` as one JSON object on its own line of the 'jsonl' file for `item_type`.
+        file_path = self._get_file_path('jsonl', item_type)  # path comes from a helper not shown in this hunk
+        async with self.lock:  # hold the writer lock for the whole open + write
+            async with aiofiles.open(file_path, 'a', encoding='utf-8') as f:  # 'a' = append; existing content is never read back
+                await f.write(json.dumps(item, ensure_ascii=False) + '\n')  # ensure_ascii=False keeps non-ASCII text unescaped; '\n' terminates the record
+
    async def write_single_item_to_json(self, item: Dict, item_type: str):
        file_path = self._get_file_path('json', item_type)
        async with self.lock:
@@ -85,22 +91,32 @@ class AsyncFileWriter:
            return

        try:
-            # Read comments from JSON file
-            comments_file_path = self._get_file_path('json', 'comments')
-            if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
-                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
+            # Read comments from JSON or JSONL file
+            comments_data = []
+            jsonl_file_path = self._get_file_path('jsonl', 'comments')
+            json_file_path = self._get_file_path('json', 'comments')
+
+            if os.path.exists(jsonl_file_path) and os.path.getsize(jsonl_file_path) > 0:
+                async with aiofiles.open(jsonl_file_path, 'r', encoding='utf-8') as f:
+                    async for line in f:
+                        line = line.strip()
+                        if line:
+                            try:
+                                comments_data.append(json.loads(line))
+                            except json.JSONDecodeError:
+                                continue
+            elif os.path.exists(json_file_path) and os.path.getsize(json_file_path) > 0:
+                async with aiofiles.open(json_file_path, 'r', encoding='utf-8') as f:
+                    content = await f.read()
+                    if content:
+                        comments_data = json.loads(content)
+                        if not isinstance(comments_data, list):
+                            comments_data = [comments_data]
+
+            if not comments_data:
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments data found")
                return

-            async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
-                content = await f.read()
-                if not content:
-                    utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
-                    return
-
-                comments_data = json.loads(content)
-                if not isinstance(comments_data, list):
-                    comments_data = [comments_data]
-
            # Filter comments data to only include 'content' field
            # Handle different comment data structures across platforms
            filtered_data = []