From 0282e626c94a8c0e42548dedf4475d90fdd3ec61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?= Date: Tue, 3 Mar 2026 23:31:07 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=20JSONL=20=E5=AD=98?= =?UTF-8?q?=E5=82=A8=E6=A0=BC=E5=BC=8F=E6=94=AF=E6=8C=81=EF=BC=8C=E9=BB=98?= =?UTF-8?q?=E8=AE=A4=E5=AD=98=E5=82=A8=E6=A0=BC=E5=BC=8F=E6=94=B9=E4=B8=BA?= =?UTF-8?q?=20jsonl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JSONL(JSON Lines)每行一个 JSON 对象,采用 append 模式写入, 无需读取已有数据,大数据量下性能远优于 JSON 格式。 - 新增 AsyncFileWriter.write_to_jsonl() 核心方法 - 7 个平台新增 JsonlStoreImplement 类并注册到工厂 - 配置默认值从 json 改为 jsonl,CLI/API 枚举同步更新 - db_session.py 守卫条件加入 jsonl,避免误触 ValueError - 词云生成支持读取 JSONL 文件,优先 jsonl 回退 json - 原有 json 选项完全保留,向后兼容 - 更新相关文档和测试 --- README.md | 2 +- README_en.md | 2 +- README_es.md | 2 +- api/main.py | 1 + api/schemas/crawler.py | 3 ++- cmd_arg/arg.py | 5 ++-- config/base_config.py | 4 +-- database/db_session.py | 2 +- docs/data_storage_guide.md | 4 +++ docs/excel_export_guide.md | 2 +- docs/词云图使用配置.md | 10 ++++---- docs/项目架构文档.md | 5 ++-- main.py | 2 +- store/bilibili/__init__.py | 1 + store/bilibili/_store_impl.py | 44 +++++++++++++++++++++++++++++--- store/douyin/__init__.py | 1 + store/douyin/_store_impl.py | 26 +++++++++++++++++++ store/kuaishou/__init__.py | 1 + store/kuaishou/_store_impl.py | 15 +++++++++++ store/tieba/__init__.py | 1 + store/tieba/_store_impl.py | 15 +++++++++++ store/weibo/__init__.py | 1 + store/weibo/_store_impl.py | 15 +++++++++++ store/xhs/__init__.py | 1 + store/xhs/_store_impl.py | 18 +++++++++++++ store/zhihu/__init__.py | 2 ++ store/zhihu/_store_impl.py | 15 +++++++++++ tests/test_store_factory.py | 48 ++++++++++++++++++++++++++--------- tools/async_file_writer.py | 44 ++++++++++++++++++++++---------- 29 files changed, 245 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index d937551..462617e 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,7 @@ python main.py --help ## 💾 数据保存 -MediaCrawler 支持多种数据存储方式,包括 CSV、JSON、Excel、SQLite 和 MySQL 数据库。 +MediaCrawler 支持多种数据存储方式,包括 CSV、JSON、JSONL、Excel、SQLite 和 MySQL 数据库。 📖 **详细使用说明请查看:[数据存储指南](docs/data_storage_guide.md)** diff --git a/README_en.md b/README_en.md index 55b19fb..38e6511 100644 --- a/README_en.md +++ b/README_en.md @@ -221,7 +221,7 @@ python main.py --help ## 💾 Data Storage -MediaCrawler supports multiple data storage methods, including CSV, JSON, Excel, SQLite, and MySQL databases. +MediaCrawler supports multiple data storage methods, including CSV, JSON, JSONL, Excel, SQLite, and MySQL databases. 📖 **For detailed usage instructions, please see: [Data Storage Guide](docs/data_storage_guide.md)** diff --git a/README_es.md b/README_es.md index 98d59ec..5b4260a 100644 --- a/README_es.md +++ b/README_es.md @@ -221,7 +221,7 @@ python main.py --help ## 💾 Almacenamiento de Datos -MediaCrawler soporta múltiples métodos de almacenamiento de datos, incluyendo CSV, JSON, Excel, SQLite y bases de datos MySQL. +MediaCrawler soporta múltiples métodos de almacenamiento de datos, incluyendo CSV, JSON, JSONL, Excel, SQLite y bases de datos MySQL. 📖 **Para instrucciones de uso detalladas, por favor vea: [Guía de Almacenamiento de Datos](docs/data_storage_guide.md)** diff --git a/api/main.py b/api/main.py index 539b4cf..23e90f2 100644 --- a/api/main.py +++ b/api/main.py @@ -159,6 +159,7 @@ async def get_config_options(): {"value": "creator", "label": "Creator Mode"}, ], "save_options": [ + {"value": "jsonl", "label": "JSONL File"}, {"value": "json", "label": "JSON File"}, {"value": "csv", "label": "CSV File"}, {"value": "excel", "label": "Excel File"}, diff --git a/api/schemas/crawler.py b/api/schemas/crawler.py index 76538fb..f31ef82 100644 --- a/api/schemas/crawler.py +++ b/api/schemas/crawler.py @@ -51,6 +51,7 @@ class SaveDataOptionEnum(str, Enum): CSV = "csv" DB = "db" JSON = "json" + JSONL = "jsonl" SQLITE = "sqlite" MONGODB = "mongodb" EXCEL = "excel" @@ -67,7 +68,7 @@ class CrawlerStartRequest(BaseModel): start_page: int = 1 enable_comments: bool = True enable_sub_comments: bool = False - save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSON + save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSONL cookies: str = "" headless: bool = False diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index e27d45a..cbbcc03 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -70,6 +70,7 @@ class SaveDataOptionEnum(str, Enum): CSV = "csv" DB = "db" JSON = "json" + JSONL = "jsonl" SQLITE = "sqlite" MONGODB = "mongodb" EXCEL = "excel" @@ -212,11 +213,11 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): SaveDataOptionEnum, typer.Option( "--save_data_option", - help="Data save option (csv=CSV file | db=MySQL database | json=JSON file | sqlite=SQLite database | mongodb=MongoDB database | excel=Excel file | postgres=PostgreSQL database)", + help="Data save option (csv=CSV file | db=MySQL database | json=JSON file | jsonl=JSONL file | sqlite=SQLite database | mongodb=MongoDB database | excel=Excel file | postgres=PostgreSQL database)", rich_help_panel="Storage Configuration", ), ] = _coerce_enum( - SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSON + SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSONL ), init_db: Annotated[ Optional[InitDbOptionEnum], diff --git a/config/base_config.py b/config/base_config.py index c05657b..26ed383 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -70,8 +70,8 @@ BROWSER_LAUNCH_TIMEOUT = 60 # Set to False to keep the browser running for easy debugging AUTO_CLOSE_BROWSER = True -# Data saving type option configuration, supports six types: csv, db, json, sqlite, excel, postgres. It is best to save to DB, with deduplication function. -SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres +# Data saving type option configuration, supports: csv, db, json, jsonl, sqlite, excel, postgres. It is best to save to DB, with deduplication function. +SAVE_DATA_OPTION = "jsonl" # csv or db or json or jsonl or sqlite or excel or postgres # Data saving path, if not specified by default, it will be saved to the data folder. SAVE_DATA_PATH = "" diff --git a/database/db_session.py b/database/db_session.py index 2a8073c..d6c7360 100644 --- a/database/db_session.py +++ b/database/db_session.py @@ -57,7 +57,7 @@ def get_async_engine(db_type: str = None): if db_type in _engines: return _engines[db_type] - if db_type in ["json", "csv"]: + if db_type in ["json", "jsonl", "csv"]: return None if db_type == "sqlite": diff --git a/docs/data_storage_guide.md b/docs/data_storage_guide.md index f1accef..cab06f6 100644 --- a/docs/data_storage_guide.md +++ b/docs/data_storage_guide.md @@ -9,6 +9,7 @@ MediaCrawler 支持多种数据存储方式,您可以根据需求选择最适 - **CSV 文件**:支持保存到 CSV 中(`data/` 目录下) - **JSON 文件**:支持保存到 JSON 中(`data/` 目录下) +- **JSONL 文件**:支持保存到 JSONL 中(`data/` 目录下)— 默认格式,每行一个 JSON 对象,追加写入性能好 - **Excel 文件**:支持保存到格式化的 Excel 文件(`data/` 目录下)✨ 新功能 - 多工作表支持(内容、评论、创作者) - 专业格式化(标题样式、自动列宽、边框) @@ -57,6 +58,9 @@ uv run main.py --platform xhs --lt qrcode --type search --save_data_option csv # 使用 JSON 存储数据 uv run main.py --platform xhs --lt qrcode --type search --save_data_option json + +# 使用 JSONL 存储数据(默认格式,无需指定) +uv run main.py --platform xhs --lt qrcode --type search --save_data_option jsonl ``` #### 详细文档 diff --git a/docs/excel_export_guide.md b/docs/excel_export_guide.md index ce1e0c1..57e1433 100644 --- a/docs/excel_export_guide.md +++ b/docs/excel_export_guide.md @@ -34,7 +34,7 @@ pip install openpyxl 1. **Configure Excel export** in `config/base_config.py`: ```python -SAVE_DATA_OPTION = "excel" # Change from json/csv/db to excel +SAVE_DATA_OPTION = "excel" # Change from jsonl/json/csv/db to excel ``` 2. **Run the crawler**: diff --git a/docs/词云图使用配置.md b/docs/词云图使用配置.md index a0f84b0..733fd07 100644 --- a/docs/词云图使用配置.md +++ b/docs/词云图使用配置.md @@ -1,14 +1,14 @@ # 关于词云图相关操作 ## 1.如何正确调用词云图 -> ps:目前只有保存格式为json文件时,才会生成词云图。其他存储方式添加词云图将在近期添加。 +> ps:保存格式为json或jsonl文件时,才会生成词云图。其他存储方式添加词云图将在近期添加。 需要修改的配置项(./config/base_config.py): ```python -# 数据保存类型选项配置,支持三种类型:csv、db、json -#此处需要为json格式保存,原因如上 -SAVE_DATA_OPTION = "json" # csv or db or json +# 数据保存类型选项配置,支持多种类型:csv、db、json、jsonl等 +#此处需要为json或jsonl格式保存,原因如上 +SAVE_DATA_OPTION = "jsonl" # csv or db or json or jsonl ``` ```python @@ -54,4 +54,4 @@ FONT_PATH= "./docs/STZHONGS.TTF" ![image-20240627204928601](https://rosyrain.oss-cn-hangzhou.aliyuncs.com/img2/202406272049662.png) -如图,在data文件下的`words文件夹`下,其中json为词频统计文件,png为词云图。原本的评论内容在`json文件夹`下。 \ No newline at end of file +如图,在data文件下的`words文件夹`下,其中json为词频统计文件,png为词云图。原本的评论内容在`jsonl文件夹`(或`json文件夹`)下。 \ No newline at end of file diff --git a/docs/项目架构文档.md b/docs/项目架构文档.md index 661fac8..4a1c048 100644 --- a/docs/项目架构文档.md +++ b/docs/项目架构文档.md @@ -22,7 +22,7 @@ MediaCrawler 是一个多平台自媒体爬虫框架,采用 Python 异步编 - **多平台支持**:统一的爬虫接口,支持 7 大主流平台 - **多种登录方式**:二维码、手机号、Cookie 三种登录方式 -- **多种存储方式**:CSV、JSON、SQLite、MySQL、MongoDB、Excel +- **多种存储方式**:CSV、JSON、JSONL、SQLite、MySQL、MongoDB、Excel - **反爬虫对策**:CDP 模式、代理 IP 池、请求签名 - **异步高并发**:基于 asyncio 的异步架构,高效并发爬取 - **词云生成**:自动生成评论词云图 @@ -441,6 +441,7 @@ class DouyinStoreFactory: |---------|--------|-----|---------| | CSV | `csv` | 简单、通用 | 小规模数据、快速查看 | | JSON | `json` | 结构完整、易解析 | API对接、数据交换 | +| JSONL | `jsonl` | 追加写入、性能好 | 大规模数据、增量爬取(默认) | | SQLite | `sqlite` | 轻量、无需服务 | 本地开发、小型项目 | | MySQL | `db` | 性能好、支持并发 | 生产环境、大规模数据 | | MongoDB | `mongodb` | 灵活、易扩展 | 非结构化数据、快速迭代 | @@ -693,7 +694,7 @@ IP_PROXY_PROVIDER = "kuaidaili" IP_PROXY_POOL_COUNT = 2 # 存储配置 -SAVE_DATA_OPTION = "json" # csv, db, json, sqlite, mongodb, excel +SAVE_DATA_OPTION = "jsonl" # csv, db, json, jsonl, sqlite, mongodb, excel, postgres ``` ### 8.2 数据库配置 diff --git a/main.py b/main.py index 61e0593..a4a6f35 100644 --- a/main.py +++ b/main.py @@ -84,7 +84,7 @@ def _flush_excel_if_needed() -> None: async def _generate_wordcloud_if_needed() -> None: - if config.SAVE_DATA_OPTION != "json" or not config.ENABLE_GET_WORDCLOUD: + if config.SAVE_DATA_OPTION not in ("json", "jsonl") or not config.ENABLE_GET_WORDCLOUD: return try: diff --git a/store/bilibili/__init__.py b/store/bilibili/__init__.py index 2e6cc56..5419177 100644 --- a/store/bilibili/__init__.py +++ b/store/bilibili/__init__.py @@ -37,6 +37,7 @@ class BiliStoreFactory: "db": BiliDbStoreImplement, "postgres": BiliDbStoreImplement, "json": BiliJsonStoreImplement, + "jsonl": BiliJsonlStoreImplement, "sqlite": BiliSqliteStoreImplement, "mongodb": BiliMongoStoreImplement, "excel": BiliExcelStoreImplement, diff --git a/store/bilibili/_store_impl.py b/store/bilibili/_store_impl.py index 5ec8837..ed220b0 100644 --- a/store/bilibili/_store_impl.py +++ b/store/bilibili/_store_impl.py @@ -133,7 +133,7 @@ class BiliDbStoreImplement(AbstractStore): content_item["user_id"] = int(content_item.get("user_id", 0) or 0) content_item["liked_count"] = int(content_item.get("liked_count", 0) or 0) content_item["create_time"] = int(content_item.get("create_time", 0) or 0) - + async with get_session() as session: result = await session.execute(select(BilibiliVideo).where(BilibiliVideo.video_id == video_id)) video_detail = result.scalar_one_or_none() @@ -162,7 +162,7 @@ class BiliDbStoreImplement(AbstractStore): comment_item["like_count"] = str(comment_item.get("like_count", "0")) comment_item["sub_comment_count"] = str(comment_item.get("sub_comment_count", "0")) comment_item["parent_comment_id"] = str(comment_item.get("parent_comment_id", "0")) - + async with get_session() as session: result = await session.execute(select(BilibiliVideoComment).where(BilibiliVideoComment.comment_id == comment_id)) comment_detail = result.scalar_one_or_none() @@ -242,7 +242,7 @@ class BiliDbStoreImplement(AbstractStore): """ dynamic_id = int(dynamic_item.get("dynamic_id")) dynamic_item["dynamic_id"] = dynamic_id - + async with get_session() as session: result = await session.execute(select(BilibiliUpDynamic).where(BilibiliUpDynamic.dynamic_id == dynamic_id)) dynamic_detail = result.scalar_one_or_none() @@ -338,6 +338,44 @@ class BiliJsonStoreImplement(AbstractStore): +class BiliJsonlStoreImplement(AbstractStore): + def __init__(self): + self.file_writer = AsyncFileWriter( + crawler_type=crawler_type_var.get(), + platform="bili" + ) + + async def store_content(self, content_item: Dict): + await self.file_writer.write_to_jsonl( + item=content_item, + item_type="contents" + ) + + async def store_comment(self, comment_item: Dict): + await self.file_writer.write_to_jsonl( + item=comment_item, + item_type="comments" + ) + + async def store_creator(self, creator: Dict): + await self.file_writer.write_to_jsonl( + item=creator, + item_type="creators" + ) + + async def store_contact(self, contact_item: Dict): + await self.file_writer.write_to_jsonl( + item=contact_item, + item_type="contacts" + ) + + async def store_dynamic(self, dynamic_item: Dict): + await self.file_writer.write_to_jsonl( + item=dynamic_item, + item_type="dynamics" + ) + + class BiliSqliteStoreImplement(BiliDbStoreImplement): pass diff --git a/store/douyin/__init__.py b/store/douyin/__init__.py index cb8f774..9e4c21c 100644 --- a/store/douyin/__init__.py +++ b/store/douyin/__init__.py @@ -36,6 +36,7 @@ class DouyinStoreFactory: "db": DouyinDbStoreImplement, "postgres": DouyinDbStoreImplement, "json": DouyinJsonStoreImplement, + "jsonl": DouyinJsonlStoreImplement, "sqlite": DouyinSqliteStoreImplement, "mongodb": DouyinMongoStoreImplement, "excel": DouyinExcelStoreImplement, diff --git a/store/douyin/_store_impl.py b/store/douyin/_store_impl.py index 19022b0..93f7fa8 100644 --- a/store/douyin/_store_impl.py +++ b/store/douyin/_store_impl.py @@ -204,6 +204,32 @@ class DouyinJsonStoreImplement(AbstractStore): +class DouyinJsonlStoreImplement(AbstractStore): + def __init__(self): + self.file_writer = AsyncFileWriter( + crawler_type=crawler_type_var.get(), + platform="douyin" + ) + + async def store_content(self, content_item: Dict): + await self.file_writer.write_to_jsonl( + item=content_item, + item_type="contents" + ) + + async def store_comment(self, comment_item: Dict): + await self.file_writer.write_to_jsonl( + item=comment_item, + item_type="comments" + ) + + async def store_creator(self, creator: Dict): + await self.file_writer.write_to_jsonl( + item=creator, + item_type="creators" + ) + + class DouyinSqliteStoreImplement(DouyinDbStoreImplement): pass diff --git a/store/kuaishou/__init__.py b/store/kuaishou/__init__.py index e280eb3..bbd0981 100644 --- a/store/kuaishou/__init__.py +++ b/store/kuaishou/__init__.py @@ -36,6 +36,7 @@ class KuaishouStoreFactory: "db": KuaishouDbStoreImplement, "postgres": KuaishouDbStoreImplement, "json": KuaishouJsonStoreImplement, + "jsonl": KuaishouJsonlStoreImplement, "sqlite": KuaishouSqliteStoreImplement, "mongodb": KuaishouMongoStoreImplement, "excel": KuaishouExcelStoreImplement, diff --git a/store/kuaishou/_store_impl.py b/store/kuaishou/_store_impl.py index 9b5209b..8e44a01 100644 --- a/store/kuaishou/_store_impl.py +++ b/store/kuaishou/_store_impl.py @@ -167,6 +167,21 @@ class KuaishouJsonStoreImplement(AbstractStore): pass +class KuaishouJsonlStoreImplement(AbstractStore): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.writer = AsyncFileWriter(platform="kuaishou", crawler_type=crawler_type_var.get()) + + async def store_content(self, content_item: Dict): + await self.writer.write_to_jsonl(item_type="contents", item=content_item) + + async def store_comment(self, comment_item: Dict): + await self.writer.write_to_jsonl(item_type="comments", item=comment_item) + + async def store_creator(self, creator: Dict): + pass + + class KuaishouSqliteStoreImplement(KuaishouDbStoreImplement): async def store_creator(self, creator: Dict): pass diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py index 7850287..ca10320 100644 --- a/store/tieba/__init__.py +++ b/store/tieba/__init__.py @@ -33,6 +33,7 @@ class TieBaStoreFactory: "db": TieBaDbStoreImplement, "postgres": TieBaDbStoreImplement, "json": TieBaJsonStoreImplement, + "jsonl": TieBaJsonlStoreImplement, "sqlite": TieBaSqliteStoreImplement, "mongodb": TieBaMongoStoreImplement, "excel": TieBaExcelStoreImplement, diff --git a/store/tieba/_store_impl.py b/store/tieba/_store_impl.py index b4437ae..a259334 100644 --- a/store/tieba/_store_impl.py +++ b/store/tieba/_store_impl.py @@ -195,6 +195,21 @@ class TieBaJsonStoreImplement(AbstractStore): await self.writer.write_single_item_to_json(item_type="creators", item=creator) +class TieBaJsonlStoreImplement(AbstractStore): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.writer = AsyncFileWriter(platform="tieba", crawler_type=crawler_type_var.get()) + + async def store_content(self, content_item: Dict): + await self.writer.write_to_jsonl(item_type="contents", item=content_item) + + async def store_comment(self, comment_item: Dict): + await self.writer.write_to_jsonl(item_type="comments", item=comment_item) + + async def store_creator(self, creator: Dict): + await self.writer.write_to_jsonl(item_type="creators", item=creator) + + class TieBaSqliteStoreImplement(TieBaDbStoreImplement): """ Tieba sqlite store implement diff --git a/store/weibo/__init__.py b/store/weibo/__init__.py index 3eb40af..bef0fe5 100644 --- a/store/weibo/__init__.py +++ b/store/weibo/__init__.py @@ -37,6 +37,7 @@ class WeibostoreFactory: "db": WeiboDbStoreImplement, "postgres": WeiboDbStoreImplement, "json": WeiboJsonStoreImplement, + "jsonl": WeiboJsonlStoreImplement, "sqlite": WeiboSqliteStoreImplement, "mongodb": WeiboMongoStoreImplement, "excel": WeiboExcelStoreImplement, diff --git a/store/weibo/_store_impl.py b/store/weibo/_store_impl.py index 44a0754..db75bf0 100644 --- a/store/weibo/_store_impl.py +++ b/store/weibo/_store_impl.py @@ -226,6 +226,21 @@ class WeiboJsonStoreImplement(AbstractStore): await self.writer.write_single_item_to_json(item_type="creators", item=creator) +class WeiboJsonlStoreImplement(AbstractStore): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.writer = AsyncFileWriter(platform="weibo", crawler_type=crawler_type_var.get()) + + async def store_content(self, content_item: Dict): + await self.writer.write_to_jsonl(item_type="contents", item=content_item) + + async def store_comment(self, comment_item: Dict): + await self.writer.write_to_jsonl(item_type="comments", item=comment_item) + + async def store_creator(self, creator: Dict): + await self.writer.write_to_jsonl(item_type="creators", item=creator) + + class WeiboSqliteStoreImplement(WeiboDbStoreImplement): """ Weibo content SQLite storage implementation diff --git a/store/xhs/__init__.py b/store/xhs/__init__.py index 32f4eaf..8796af4 100644 --- a/store/xhs/__init__.py +++ b/store/xhs/__init__.py @@ -36,6 +36,7 @@ class XhsStoreFactory: "db": XhsDbStoreImplement, "postgres": XhsDbStoreImplement, "json": XhsJsonStoreImplement, + "jsonl": XhsJsonlStoreImplement, "sqlite": XhsSqliteStoreImplement, "mongodb": XhsMongoStoreImplement, "excel": XhsExcelStoreImplement, diff --git a/store/xhs/_store_impl.py b/store/xhs/_store_impl.py index 5eac9af..ade031c 100644 --- a/store/xhs/_store_impl.py +++ b/store/xhs/_store_impl.py @@ -101,6 +101,24 @@ class XhsJsonStoreImplement(AbstractStore): +class XhsJsonlStoreImplement(AbstractStore): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.writer = AsyncFileWriter(platform="xhs", crawler_type=crawler_type_var.get()) + + async def store_content(self, content_item: Dict): + await self.writer.write_to_jsonl(item_type="contents", item=content_item) + + async def store_comment(self, comment_item: Dict): + await self.writer.write_to_jsonl(item_type="comments", item=comment_item) + + async def store_creator(self, creator_item: Dict): + pass + + def flush(self): + pass + + class XhsDbStoreImplement(AbstractStore): def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/store/zhihu/__init__.py b/store/zhihu/__init__.py index 593085d..5b74963 100644 --- a/store/zhihu/__init__.py +++ b/store/zhihu/__init__.py @@ -27,6 +27,7 @@ from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator from ._store_impl import (ZhihuCsvStoreImplement, ZhihuDbStoreImplement, ZhihuJsonStoreImplement, + ZhihuJsonlStoreImplement, ZhihuSqliteStoreImplement, ZhihuMongoStoreImplement, ZhihuExcelStoreImplement) @@ -40,6 +41,7 @@ class ZhihuStoreFactory: "db": ZhihuDbStoreImplement, "postgres": ZhihuDbStoreImplement, "json": ZhihuJsonStoreImplement, + "jsonl": ZhihuJsonlStoreImplement, "sqlite": ZhihuSqliteStoreImplement, "mongodb": ZhihuMongoStoreImplement, "excel": ZhihuExcelStoreImplement, diff --git a/store/zhihu/_store_impl.py b/store/zhihu/_store_impl.py index 6d029a4..a2ff628 100644 --- a/store/zhihu/_store_impl.py +++ b/store/zhihu/_store_impl.py @@ -203,6 +203,21 @@ class ZhihuJsonStoreImplement(AbstractStore): await self.writer.write_single_item_to_json(item_type="creators", item=creator) +class ZhihuJsonlStoreImplement(AbstractStore): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.writer = AsyncFileWriter(platform="zhihu", crawler_type=crawler_type_var.get()) + + async def store_content(self, content_item: Dict): + await self.writer.write_to_jsonl(item_type="contents", item=content_item) + + async def store_comment(self, comment_item: Dict): + await self.writer.write_to_jsonl(item_type="comments", item=comment_item) + + async def store_creator(self, creator: Dict): + await self.writer.write_to_jsonl(item_type="creators", item=creator) + + class ZhihuSqliteStoreImplement(ZhihuDbStoreImplement): """ Zhihu content SQLite storage implementation diff --git a/tests/test_store_factory.py b/tests/test_store_factory.py index ada123b..4c53b3e 100644 --- a/tests/test_store_factory.py +++ b/tests/test_store_factory.py @@ -1,4 +1,21 @@ # -*- coding: utf-8 -*- +# Copyright (c) 2025 relakkes@gmail.com +# +# This file is part of MediaCrawler project. +# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/tests/test_store_factory.py +# GitHub: https://github.com/NanmiCoder +# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1 +# +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + """ Unit tests for Store Factory functionality """ @@ -10,6 +27,7 @@ from store.xhs import XhsStoreFactory from store.xhs._store_impl import ( XhsCsvStoreImplement, XhsJsonStoreImplement, + XhsJsonlStoreImplement, XhsDbStoreImplement, XhsSqliteStoreImplement, XhsMongoStoreImplement, @@ -19,57 +37,63 @@ from store.xhs._store_impl import ( class TestXhsStoreFactory: """Test cases for XhsStoreFactory""" - + @patch('config.SAVE_DATA_OPTION', 'csv') def test_create_csv_store(self): """Test creating CSV store""" store = XhsStoreFactory.create_store() assert isinstance(store, XhsCsvStoreImplement) - + @patch('config.SAVE_DATA_OPTION', 'json') def test_create_json_store(self): """Test creating JSON store""" store = XhsStoreFactory.create_store() assert isinstance(store, XhsJsonStoreImplement) - + @patch('config.SAVE_DATA_OPTION', 'db') def test_create_db_store(self): """Test creating database store""" store = XhsStoreFactory.create_store() assert isinstance(store, XhsDbStoreImplement) - + @patch('config.SAVE_DATA_OPTION', 'sqlite') def test_create_sqlite_store(self): """Test creating SQLite store""" store = XhsStoreFactory.create_store() assert isinstance(store, XhsSqliteStoreImplement) - + @patch('config.SAVE_DATA_OPTION', 'mongodb') def test_create_mongodb_store(self): """Test creating MongoDB store""" store = XhsStoreFactory.create_store() assert isinstance(store, XhsMongoStoreImplement) - + @patch('config.SAVE_DATA_OPTION', 'excel') def test_create_excel_store(self): """Test creating Excel store""" # ContextVar cannot be mocked, so we test with actual value store = XhsStoreFactory.create_store() assert isinstance(store, XhsExcelStoreImplement) - + + @patch('config.SAVE_DATA_OPTION', 'jsonl') + def test_create_jsonl_store(self): + """Test creating JSONL store""" + store = XhsStoreFactory.create_store() + assert isinstance(store, XhsJsonlStoreImplement) + @patch('config.SAVE_DATA_OPTION', 'invalid') def test_invalid_store_option(self): """Test that invalid store option raises ValueError""" with pytest.raises(ValueError) as exc_info: XhsStoreFactory.create_store() - + assert "Invalid save option" in str(exc_info.value) - + def test_all_stores_registered(self): """Test that all store types are registered""" - expected_stores = ['csv', 'json', 'db', 'sqlite', 'mongodb', 'excel'] - + expected_stores = ['csv', 'json', 'jsonl', 'db', 'postgres', 'sqlite', 'mongodb', 'excel'] + for store_type in expected_stores: assert store_type in XhsStoreFactory.STORES - + assert len(XhsStoreFactory.STORES) == len(expected_stores) diff --git a/tools/async_file_writer.py b/tools/async_file_writer.py index a52e284..d5397ad 100644 --- a/tools/async_file_writer.py +++ b/tools/async_file_writer.py @@ -53,6 +53,12 @@ class AsyncFileWriter: await writer.writeheader() await writer.writerow(item) + async def write_to_jsonl(self, item: Dict, item_type: str): + file_path = self._get_file_path('jsonl', item_type) + async with self.lock: + async with aiofiles.open(file_path, 'a', encoding='utf-8') as f: + await f.write(json.dumps(item, ensure_ascii=False) + '\n') + async def write_single_item_to_json(self, item: Dict, item_type: str): file_path = self._get_file_path('json', item_type) async with self.lock: @@ -85,22 +91,32 @@ class AsyncFileWriter: return try: - # Read comments from JSON file - comments_file_path = self._get_file_path('json', 'comments') - if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0: - utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}") + # Read comments from JSON or JSONL file + comments_data = [] + jsonl_file_path = self._get_file_path('jsonl', 'comments') + json_file_path = self._get_file_path('json', 'comments') + + if os.path.exists(jsonl_file_path) and os.path.getsize(jsonl_file_path) > 0: + async with aiofiles.open(jsonl_file_path, 'r', encoding='utf-8') as f: + async for line in f: + line = line.strip() + if line: + try: + comments_data.append(json.loads(line)) + except json.JSONDecodeError: + continue + elif os.path.exists(json_file_path) and os.path.getsize(json_file_path) > 0: + async with aiofiles.open(json_file_path, 'r', encoding='utf-8') as f: + content = await f.read() + if content: + comments_data = json.loads(content) + if not isinstance(comments_data, list): + comments_data = [comments_data] + + if not comments_data: + utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments data found") return - async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f: - content = await f.read() - if not content: - utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty") - return - - comments_data = json.loads(content) - if not isinstance(comments_data, list): - comments_data = [comments_data] - # Filter comments data to only include 'content' field # Handle different comment data structures across platforms filtered_data = []