mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-08 10:57:26 +08:00
feat: 新增 JSONL 存储格式支持,默认存储格式改为 jsonl
JSONL(JSON Lines)每行一个 JSON 对象,采用 append 模式写入,无需读取已有数据,大数据量下性能远优于 JSON 格式。

- 新增 AsyncFileWriter.write_to_jsonl() 核心方法
- 7 个平台新增 JsonlStoreImplement 类并注册到工厂
- 配置默认值从 json 改为 jsonl,CLI/API 枚举同步更新
- db_session.py 守卫条件加入 jsonl,避免误触 ValueError
- 词云生成支持读取 JSONL 文件,优先 jsonl 回退 json
- 原有 json 选项完全保留,向后兼容
- 更新相关文档和测试
This commit is contained in:
@@ -37,6 +37,7 @@ class BiliStoreFactory:
|
||||
"db": BiliDbStoreImplement,
|
||||
"postgres": BiliDbStoreImplement,
|
||||
"json": BiliJsonStoreImplement,
|
||||
"jsonl": BiliJsonlStoreImplement,
|
||||
"sqlite": BiliSqliteStoreImplement,
|
||||
"mongodb": BiliMongoStoreImplement,
|
||||
"excel": BiliExcelStoreImplement,
|
||||
|
||||
@@ -133,7 +133,7 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
content_item["user_id"] = int(content_item.get("user_id", 0) or 0)
|
||||
content_item["liked_count"] = int(content_item.get("liked_count", 0) or 0)
|
||||
content_item["create_time"] = int(content_item.get("create_time", 0) or 0)
|
||||
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliVideo).where(BilibiliVideo.video_id == video_id))
|
||||
video_detail = result.scalar_one_or_none()
|
||||
@@ -162,7 +162,7 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
comment_item["like_count"] = str(comment_item.get("like_count", "0"))
|
||||
comment_item["sub_comment_count"] = str(comment_item.get("sub_comment_count", "0"))
|
||||
comment_item["parent_comment_id"] = str(comment_item.get("parent_comment_id", "0"))
|
||||
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliVideoComment).where(BilibiliVideoComment.comment_id == comment_id))
|
||||
comment_detail = result.scalar_one_or_none()
|
||||
@@ -242,7 +242,7 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
"""
|
||||
dynamic_id = int(dynamic_item.get("dynamic_id"))
|
||||
dynamic_item["dynamic_id"] = dynamic_id
|
||||
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliUpDynamic).where(BilibiliUpDynamic.dynamic_id == dynamic_id))
|
||||
dynamic_detail = result.scalar_one_or_none()
|
||||
@@ -338,6 +338,44 @@ class BiliJsonStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
|
||||
class BiliJsonlStoreImplement(AbstractStore):
    """Bilibili store backend that writes records in JSONL form.

    Each ``store_*`` coroutine forwards its record to
    ``AsyncFileWriter.write_to_jsonl`` under a fixed ``item_type`` label.
    """

    def __init__(self):
        # The crawler type is read once, when the store is constructed.
        current_crawler_type = crawler_type_var.get()
        self.file_writer = AsyncFileWriter(platform="bili", crawler_type=current_crawler_type)

    async def store_content(self, content_item: Dict):
        """Write one content record under the ``contents`` item type."""
        await self.file_writer.write_to_jsonl(item_type="contents", item=content_item)

    async def store_comment(self, comment_item: Dict):
        """Write one comment record under the ``comments`` item type."""
        await self.file_writer.write_to_jsonl(item_type="comments", item=comment_item)

    async def store_creator(self, creator: Dict):
        """Write one creator record under the ``creators`` item type."""
        await self.file_writer.write_to_jsonl(item_type="creators", item=creator)

    async def store_contact(self, contact_item: Dict):
        """Write one contact record under the ``contacts`` item type."""
        await self.file_writer.write_to_jsonl(item_type="contacts", item=contact_item)

    async def store_dynamic(self, dynamic_item: Dict):
        """Write one dynamic record under the ``dynamics`` item type."""
        await self.file_writer.write_to_jsonl(item_type="dynamics", item=dynamic_item)
|
||||
|
||||
|
||||
class BiliSqliteStoreImplement(BiliDbStoreImplement):
    """Bilibili store registered under the ``sqlite`` key.

    Adds nothing of its own; all behavior is inherited from
    ``BiliDbStoreImplement``.
    """
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ class DouyinStoreFactory:
|
||||
"db": DouyinDbStoreImplement,
|
||||
"postgres": DouyinDbStoreImplement,
|
||||
"json": DouyinJsonStoreImplement,
|
||||
"jsonl": DouyinJsonlStoreImplement,
|
||||
"sqlite": DouyinSqliteStoreImplement,
|
||||
"mongodb": DouyinMongoStoreImplement,
|
||||
"excel": DouyinExcelStoreImplement,
|
||||
|
||||
@@ -204,6 +204,32 @@ class DouyinJsonStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
|
||||
class DouyinJsonlStoreImplement(AbstractStore):
    """Douyin store backend that writes records in JSONL form.

    Each ``store_*`` coroutine forwards its record to
    ``AsyncFileWriter.write_to_jsonl`` under a fixed ``item_type`` label.
    """

    def __init__(self):
        # The crawler type is read once, when the store is constructed.
        current_crawler_type = crawler_type_var.get()
        self.file_writer = AsyncFileWriter(platform="douyin", crawler_type=current_crawler_type)

    async def store_content(self, content_item: Dict):
        """Write one content record under the ``contents`` item type."""
        await self.file_writer.write_to_jsonl(item_type="contents", item=content_item)

    async def store_comment(self, comment_item: Dict):
        """Write one comment record under the ``comments`` item type."""
        await self.file_writer.write_to_jsonl(item_type="comments", item=comment_item)

    async def store_creator(self, creator: Dict):
        """Write one creator record under the ``creators`` item type."""
        await self.file_writer.write_to_jsonl(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class DouyinSqliteStoreImplement(DouyinDbStoreImplement):
    """Douyin store registered under the ``sqlite`` key.

    Adds nothing of its own; all behavior is inherited from
    ``DouyinDbStoreImplement``.
    """
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ class KuaishouStoreFactory:
|
||||
"db": KuaishouDbStoreImplement,
|
||||
"postgres": KuaishouDbStoreImplement,
|
||||
"json": KuaishouJsonStoreImplement,
|
||||
"jsonl": KuaishouJsonlStoreImplement,
|
||||
"sqlite": KuaishouSqliteStoreImplement,
|
||||
"mongodb": KuaishouMongoStoreImplement,
|
||||
"excel": KuaishouExcelStoreImplement,
|
||||
|
||||
@@ -167,6 +167,21 @@ class KuaishouJsonStoreImplement(AbstractStore):
|
||||
pass
|
||||
|
||||
|
||||
class KuaishouJsonlStoreImplement(AbstractStore):
    """Kuaishou store backend that writes records in JSONL form.

    Content and comment records are forwarded to
    ``AsyncFileWriter.write_to_jsonl``; creator records are not written.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The crawler type is read once, when the store is constructed.
        self.writer = AsyncFileWriter(
            crawler_type=crawler_type_var.get(),
            platform="kuaishou",
        )

    async def store_content(self, content_item: Dict):
        """Write one content record under the ``contents`` item type."""
        await self.writer.write_to_jsonl(
            item=content_item,
            item_type="contents",
        )

    async def store_comment(self, comment_item: Dict):
        """Write one comment record under the ``comments`` item type."""
        await self.writer.write_to_jsonl(
            item=comment_item,
            item_type="comments",
        )

    async def store_creator(self, creator: Dict):
        """No-op: this store does not write creator records."""
|
||||
|
||||
|
||||
class KuaishouSqliteStoreImplement(KuaishouDbStoreImplement):
|
||||
async def store_creator(self, creator: Dict):
|
||||
pass
|
||||
|
||||
@@ -33,6 +33,7 @@ class TieBaStoreFactory:
|
||||
"db": TieBaDbStoreImplement,
|
||||
"postgres": TieBaDbStoreImplement,
|
||||
"json": TieBaJsonStoreImplement,
|
||||
"jsonl": TieBaJsonlStoreImplement,
|
||||
"sqlite": TieBaSqliteStoreImplement,
|
||||
"mongodb": TieBaMongoStoreImplement,
|
||||
"excel": TieBaExcelStoreImplement,
|
||||
|
||||
@@ -195,6 +195,21 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
await self.writer.write_single_item_to_json(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class TieBaJsonlStoreImplement(AbstractStore):
    """Tieba store backend that writes records in JSONL form.

    Each ``store_*`` coroutine forwards its record to
    ``AsyncFileWriter.write_to_jsonl`` under a fixed ``item_type`` label.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The crawler type is read once, when the store is constructed.
        self.writer = AsyncFileWriter(
            crawler_type=crawler_type_var.get(),
            platform="tieba",
        )

    async def store_content(self, content_item: Dict):
        """Write one content record under the ``contents`` item type."""
        await self.writer.write_to_jsonl(
            item=content_item,
            item_type="contents",
        )

    async def store_comment(self, comment_item: Dict):
        """Write one comment record under the ``comments`` item type."""
        await self.writer.write_to_jsonl(
            item=comment_item,
            item_type="comments",
        )

    async def store_creator(self, creator: Dict):
        """Write one creator record under the ``creators`` item type."""
        await self.writer.write_to_jsonl(
            item=creator,
            item_type="creators",
        )
|
||||
|
||||
|
||||
class TieBaSqliteStoreImplement(TieBaDbStoreImplement):
|
||||
"""
|
||||
Tieba sqlite store implement
|
||||
|
||||
@@ -37,6 +37,7 @@ class WeibostoreFactory:
|
||||
"db": WeiboDbStoreImplement,
|
||||
"postgres": WeiboDbStoreImplement,
|
||||
"json": WeiboJsonStoreImplement,
|
||||
"jsonl": WeiboJsonlStoreImplement,
|
||||
"sqlite": WeiboSqliteStoreImplement,
|
||||
"mongodb": WeiboMongoStoreImplement,
|
||||
"excel": WeiboExcelStoreImplement,
|
||||
|
||||
@@ -226,6 +226,21 @@ class WeiboJsonStoreImplement(AbstractStore):
|
||||
await self.writer.write_single_item_to_json(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class WeiboJsonlStoreImplement(AbstractStore):
    """Weibo store backend that writes records in JSONL form.

    Each ``store_*`` coroutine forwards its record to
    ``AsyncFileWriter.write_to_jsonl`` under a fixed ``item_type`` label.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The crawler type is read once, when the store is constructed.
        self.writer = AsyncFileWriter(
            crawler_type=crawler_type_var.get(),
            platform="weibo",
        )

    async def store_content(self, content_item: Dict):
        """Write one content record under the ``contents`` item type."""
        await self.writer.write_to_jsonl(
            item=content_item,
            item_type="contents",
        )

    async def store_comment(self, comment_item: Dict):
        """Write one comment record under the ``comments`` item type."""
        await self.writer.write_to_jsonl(
            item=comment_item,
            item_type="comments",
        )

    async def store_creator(self, creator: Dict):
        """Write one creator record under the ``creators`` item type."""
        await self.writer.write_to_jsonl(
            item=creator,
            item_type="creators",
        )
|
||||
|
||||
|
||||
class WeiboSqliteStoreImplement(WeiboDbStoreImplement):
|
||||
"""
|
||||
Weibo content SQLite storage implementation
|
||||
|
||||
@@ -36,6 +36,7 @@ class XhsStoreFactory:
|
||||
"db": XhsDbStoreImplement,
|
||||
"postgres": XhsDbStoreImplement,
|
||||
"json": XhsJsonStoreImplement,
|
||||
"jsonl": XhsJsonlStoreImplement,
|
||||
"sqlite": XhsSqliteStoreImplement,
|
||||
"mongodb": XhsMongoStoreImplement,
|
||||
"excel": XhsExcelStoreImplement,
|
||||
|
||||
@@ -101,6 +101,24 @@ class XhsJsonStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
|
||||
class XhsJsonlStoreImplement(AbstractStore):
    """Xhs store backend that writes records in JSONL form.

    Content and comment records are forwarded to
    ``AsyncFileWriter.write_to_jsonl``; ``store_creator`` and ``flush``
    do nothing.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The crawler type is read once, when the store is constructed.
        self.writer = AsyncFileWriter(
            crawler_type=crawler_type_var.get(),
            platform="xhs",
        )

    async def store_content(self, content_item: Dict):
        """Write one content record under the ``contents`` item type."""
        await self.writer.write_to_jsonl(
            item=content_item,
            item_type="contents",
        )

    async def store_comment(self, comment_item: Dict):
        """Write one comment record under the ``comments`` item type."""
        await self.writer.write_to_jsonl(
            item=comment_item,
            item_type="comments",
        )

    async def store_creator(self, creator_item: Dict):
        """No-op: this store does not write creator records."""

    def flush(self):
        """No-op: this store has nothing to flush."""
|
||||
|
||||
|
||||
class XhsDbStoreImplement(AbstractStore):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -27,6 +27,7 @@ from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
|
||||
from ._store_impl import (ZhihuCsvStoreImplement,
|
||||
ZhihuDbStoreImplement,
|
||||
ZhihuJsonStoreImplement,
|
||||
ZhihuJsonlStoreImplement,
|
||||
ZhihuSqliteStoreImplement,
|
||||
ZhihuMongoStoreImplement,
|
||||
ZhihuExcelStoreImplement)
|
||||
@@ -40,6 +41,7 @@ class ZhihuStoreFactory:
|
||||
"db": ZhihuDbStoreImplement,
|
||||
"postgres": ZhihuDbStoreImplement,
|
||||
"json": ZhihuJsonStoreImplement,
|
||||
"jsonl": ZhihuJsonlStoreImplement,
|
||||
"sqlite": ZhihuSqliteStoreImplement,
|
||||
"mongodb": ZhihuMongoStoreImplement,
|
||||
"excel": ZhihuExcelStoreImplement,
|
||||
|
||||
@@ -203,6 +203,21 @@ class ZhihuJsonStoreImplement(AbstractStore):
|
||||
await self.writer.write_single_item_to_json(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class ZhihuJsonlStoreImplement(AbstractStore):
    """Zhihu store backend that writes records in JSONL form.

    Each ``store_*`` coroutine forwards its record to
    ``AsyncFileWriter.write_to_jsonl`` under a fixed ``item_type`` label.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The crawler type is read once, when the store is constructed.
        self.writer = AsyncFileWriter(
            crawler_type=crawler_type_var.get(),
            platform="zhihu",
        )

    async def store_content(self, content_item: Dict):
        """Write one content record under the ``contents`` item type."""
        await self.writer.write_to_jsonl(
            item=content_item,
            item_type="contents",
        )

    async def store_comment(self, comment_item: Dict):
        """Write one comment record under the ``comments`` item type."""
        await self.writer.write_to_jsonl(
            item=comment_item,
            item_type="comments",
        )

    async def store_creator(self, creator: Dict):
        """Write one creator record under the ``creators`` item type."""
        await self.writer.write_to_jsonl(
            item=creator,
            item_type="creators",
        )
|
||||
|
||||
|
||||
class ZhihuSqliteStoreImplement(ZhihuDbStoreImplement):
|
||||
"""
|
||||
Zhihu content SQLite storage implementation
|
||||
|
||||
Reference in New Issue
Block a user