mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-06 01:47:26 +08:00
feat: 新增 JSONL 存储格式支持,默认存储格式改为 jsonl
JSONL(JSON Lines)每行一个 JSON 对象,采用 append 模式写入,无需读取已有数据,大数据量下性能远优于 JSON 格式。

- 新增 AsyncFileWriter.write_to_jsonl() 核心方法
- 7 个平台新增 JsonlStoreImplement 类并注册到工厂
- 配置默认值从 json 改为 jsonl,CLI/API 枚举同步更新
- db_session.py 守卫条件加入 jsonl,避免误触 ValueError
- 词云生成支持读取 JSONL 文件,优先 jsonl 回退 json
- 原有 json 选项完全保留,向后兼容
- 更新相关文档和测试
This commit is contained in:
@@ -53,6 +53,12 @@ class AsyncFileWriter:
|
||||
await writer.writeheader()
|
||||
await writer.writerow(item)
|
||||
|
||||
async def write_to_jsonl(self, item: Dict, item_type: str):
    """Append ``item`` as a single JSON line to the JSONL file for ``item_type``.

    The destination path is whatever ``self._get_file_path('jsonl', item_type)``
    returns. The file is opened in append mode, so existing content is never
    read or rewritten.

    Args:
        item: Mapping to serialize; it must be JSON-serializable.
        item_type: Passed through to ``self._get_file_path`` to select the file.
    """
    target_path = self._get_file_path('jsonl', item_type)
    # Hold self.lock for the whole open/serialize/write sequence, as the
    # nested context managers did before.
    async with self.lock, aiofiles.open(target_path, 'a', encoding='utf-8') as out_file:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        record_line = json.dumps(item, ensure_ascii=False) + '\n'
        await out_file.write(record_line)
|
||||
|
||||
async def write_single_item_to_json(self, item: Dict, item_type: str):
|
||||
file_path = self._get_file_path('json', item_type)
|
||||
async with self.lock:
|
||||
@@ -85,22 +91,32 @@ class AsyncFileWriter:
|
||||
return
|
||||
|
||||
try:
|
||||
# Read comments from JSON file
|
||||
comments_file_path = self._get_file_path('json', 'comments')
|
||||
if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
|
||||
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
|
||||
# Read comments from JSON or JSONL file
|
||||
comments_data = []
|
||||
jsonl_file_path = self._get_file_path('jsonl', 'comments')
|
||||
json_file_path = self._get_file_path('json', 'comments')
|
||||
|
||||
if os.path.exists(jsonl_file_path) and os.path.getsize(jsonl_file_path) > 0:
|
||||
async with aiofiles.open(jsonl_file_path, 'r', encoding='utf-8') as f:
|
||||
async for line in f:
|
||||
line = line.strip()
|
||||
if line:
|
||||
try:
|
||||
comments_data.append(json.loads(line))
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
elif os.path.exists(json_file_path) and os.path.getsize(json_file_path) > 0:
|
||||
async with aiofiles.open(json_file_path, 'r', encoding='utf-8') as f:
|
||||
content = await f.read()
|
||||
if content:
|
||||
comments_data = json.loads(content)
|
||||
if not isinstance(comments_data, list):
|
||||
comments_data = [comments_data]
|
||||
|
||||
if not comments_data:
|
||||
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments data found")
|
||||
return
|
||||
|
||||
async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
|
||||
content = await f.read()
|
||||
if not content:
|
||||
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
|
||||
return
|
||||
|
||||
comments_data = json.loads(content)
|
||||
if not isinstance(comments_data, list):
|
||||
comments_data = [comments_data]
|
||||
|
||||
# Filter comments data to only include 'content' field
|
||||
# Handle different comment data structures across platforms
|
||||
filtered_data = []
|
||||
|
||||
Reference in New Issue
Block a user