mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-03-04 05:00:47 +08:00
feat: 新增 JSONL 存储格式支持,默认存储格式改为 jsonl
JSONL(JSON Lines)每行一个 JSON 对象,采用 append 模式写入, 无需读取已有数据,大数据量下性能远优于 JSON 格式。 - 新增 AsyncFileWriter.write_to_jsonl() 核心方法 - 7 个平台新增 JsonlStoreImplement 类并注册到工厂 - 配置默认值从 json 改为 jsonl,CLI/API 枚举同步更新 - db_session.py 守卫条件加入 jsonl,避免误触 ValueError - 词云生成支持读取 JSONL 文件,优先 jsonl 回退 json - 原有 json 选项完全保留,向后兼容 - 更新相关文档和测试
This commit is contained in:
@@ -222,7 +222,7 @@ python main.py --help
|
||||
|
||||
## 💾 数据保存
|
||||
|
||||
MediaCrawler 支持多种数据存储方式,包括 CSV、JSON、Excel、SQLite 和 MySQL 数据库。
|
||||
MediaCrawler 支持多种数据存储方式,包括 CSV、JSON、JSONL、Excel、SQLite 和 MySQL 数据库。
|
||||
|
||||
📖 **详细使用说明请查看:[数据存储指南](docs/data_storage_guide.md)**
|
||||
|
||||
|
||||
@@ -221,7 +221,7 @@ python main.py --help
|
||||
|
||||
## 💾 Data Storage
|
||||
|
||||
MediaCrawler supports multiple data storage methods, including CSV, JSON, Excel, SQLite, and MySQL databases.
|
||||
MediaCrawler supports multiple data storage methods, including CSV, JSON, JSONL, Excel, SQLite, and MySQL databases.
|
||||
|
||||
📖 **For detailed usage instructions, please see: [Data Storage Guide](docs/data_storage_guide.md)**
|
||||
|
||||
|
||||
@@ -221,7 +221,7 @@ python main.py --help
|
||||
|
||||
## 💾 Almacenamiento de Datos
|
||||
|
||||
MediaCrawler soporta múltiples métodos de almacenamiento de datos, incluyendo CSV, JSON, Excel, SQLite y bases de datos MySQL.
|
||||
MediaCrawler soporta múltiples métodos de almacenamiento de datos, incluyendo CSV, JSON, JSONL, Excel, SQLite y bases de datos MySQL.
|
||||
|
||||
📖 **Para instrucciones de uso detalladas, por favor vea: [Guía de Almacenamiento de Datos](docs/data_storage_guide.md)**
|
||||
|
||||
|
||||
@@ -159,6 +159,7 @@ async def get_config_options():
|
||||
{"value": "creator", "label": "Creator Mode"},
|
||||
],
|
||||
"save_options": [
|
||||
{"value": "jsonl", "label": "JSONL File"},
|
||||
{"value": "json", "label": "JSON File"},
|
||||
{"value": "csv", "label": "CSV File"},
|
||||
{"value": "excel", "label": "Excel File"},
|
||||
|
||||
@@ -51,6 +51,7 @@ class SaveDataOptionEnum(str, Enum):
|
||||
CSV = "csv"
|
||||
DB = "db"
|
||||
JSON = "json"
|
||||
JSONL = "jsonl"
|
||||
SQLITE = "sqlite"
|
||||
MONGODB = "mongodb"
|
||||
EXCEL = "excel"
|
||||
@@ -67,7 +68,7 @@ class CrawlerStartRequest(BaseModel):
|
||||
start_page: int = 1
|
||||
enable_comments: bool = True
|
||||
enable_sub_comments: bool = False
|
||||
save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSON
|
||||
save_option: SaveDataOptionEnum = SaveDataOptionEnum.JSONL
|
||||
cookies: str = ""
|
||||
headless: bool = False
|
||||
|
||||
|
||||
@@ -70,6 +70,7 @@ class SaveDataOptionEnum(str, Enum):
|
||||
CSV = "csv"
|
||||
DB = "db"
|
||||
JSON = "json"
|
||||
JSONL = "jsonl"
|
||||
SQLITE = "sqlite"
|
||||
MONGODB = "mongodb"
|
||||
EXCEL = "excel"
|
||||
@@ -212,11 +213,11 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
SaveDataOptionEnum,
|
||||
typer.Option(
|
||||
"--save_data_option",
|
||||
help="Data save option (csv=CSV file | db=MySQL database | json=JSON file | sqlite=SQLite database | mongodb=MongoDB database | excel=Excel file | postgres=PostgreSQL database)",
|
||||
help="Data save option (csv=CSV file | db=MySQL database | json=JSON file | jsonl=JSONL file | sqlite=SQLite database | mongodb=MongoDB database | excel=Excel file | postgres=PostgreSQL database)",
|
||||
rich_help_panel="Storage Configuration",
|
||||
),
|
||||
] = _coerce_enum(
|
||||
SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSON
|
||||
SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSONL
|
||||
),
|
||||
init_db: Annotated[
|
||||
Optional[InitDbOptionEnum],
|
||||
|
||||
@@ -70,8 +70,8 @@ BROWSER_LAUNCH_TIMEOUT = 60
|
||||
# Set to False to keep the browser running for easy debugging
|
||||
AUTO_CLOSE_BROWSER = True
|
||||
|
||||
# Data saving type option configuration, supports six types: csv, db, json, sqlite, excel, postgres. It is best to save to DB, with deduplication function.
|
||||
SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres
|
||||
# Data saving type option configuration, supports: csv, db, json, jsonl, sqlite, excel, postgres. It is best to save to DB, with deduplication function.
|
||||
SAVE_DATA_OPTION = "jsonl" # csv or db or json or jsonl or sqlite or excel or postgres
|
||||
|
||||
# Data saving path, if not specified by default, it will be saved to the data folder.
|
||||
SAVE_DATA_PATH = ""
|
||||
|
||||
@@ -57,7 +57,7 @@ def get_async_engine(db_type: str = None):
|
||||
if db_type in _engines:
|
||||
return _engines[db_type]
|
||||
|
||||
if db_type in ["json", "csv"]:
|
||||
if db_type in ["json", "jsonl", "csv"]:
|
||||
return None
|
||||
|
||||
if db_type == "sqlite":
|
||||
|
||||
@@ -9,6 +9,7 @@ MediaCrawler 支持多种数据存储方式,您可以根据需求选择最适
|
||||
|
||||
- **CSV 文件**:支持保存到 CSV 中(`data/` 目录下)
|
||||
- **JSON 文件**:支持保存到 JSON 中(`data/` 目录下)
|
||||
- **JSONL 文件**:支持保存到 JSONL 中(`data/` 目录下)— 默认格式,每行一个 JSON 对象,追加写入性能好
|
||||
- **Excel 文件**:支持保存到格式化的 Excel 文件(`data/` 目录下)✨ 新功能
|
||||
- 多工作表支持(内容、评论、创作者)
|
||||
- 专业格式化(标题样式、自动列宽、边框)
|
||||
@@ -57,6 +58,9 @@ uv run main.py --platform xhs --lt qrcode --type search --save_data_option csv
|
||||
|
||||
# 使用 JSON 存储数据
|
||||
uv run main.py --platform xhs --lt qrcode --type search --save_data_option json
|
||||
|
||||
# 使用 JSONL 存储数据(默认格式,无需指定)
|
||||
uv run main.py --platform xhs --lt qrcode --type search --save_data_option jsonl
|
||||
```
|
||||
|
||||
#### 详细文档
|
||||
|
||||
@@ -34,7 +34,7 @@ pip install openpyxl
|
||||
1. **Configure Excel export** in `config/base_config.py`:
|
||||
|
||||
```python
|
||||
SAVE_DATA_OPTION = "excel" # Change from json/csv/db to excel
|
||||
SAVE_DATA_OPTION = "excel" # Change from jsonl/json/csv/db to excel
|
||||
```
|
||||
|
||||
2. **Run the crawler**:
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
# 关于词云图相关操作
|
||||
|
||||
## 1.如何正确调用词云图
|
||||
> ps:目前只有保存格式为json文件时,才会生成词云图。其他存储方式添加词云图将在近期添加。
|
||||
> ps:保存格式为json或jsonl文件时,才会生成词云图。其他存储方式添加词云图将在近期添加。
|
||||
|
||||
需要修改的配置项(./config/base_config.py):
|
||||
|
||||
```python
|
||||
# 数据保存类型选项配置,支持三种类型:csv、db、json
|
||||
#此处需要为json格式保存,原因如上
|
||||
SAVE_DATA_OPTION = "json" # csv or db or json
|
||||
# 数据保存类型选项配置,支持多种类型:csv、db、json、jsonl等
|
||||
#此处需要为json或jsonl格式保存,原因如上
|
||||
SAVE_DATA_OPTION = "jsonl" # csv or db or json or jsonl
|
||||
```
|
||||
|
||||
```python
|
||||
@@ -54,4 +54,4 @@ FONT_PATH= "./docs/STZHONGS.TTF"
|
||||
|
||||

|
||||
|
||||
如图,在data文件下的`words文件夹`下,其中json为词频统计文件,png为词云图。原本的评论内容在`json文件夹`下。
|
||||
如图,在data文件下的`words文件夹`下,其中json为词频统计文件,png为词云图。原本的评论内容在`jsonl文件夹`(或`json文件夹`)下。
|
||||
@@ -22,7 +22,7 @@ MediaCrawler 是一个多平台自媒体爬虫框架,采用 Python 异步编
|
||||
|
||||
- **多平台支持**:统一的爬虫接口,支持 7 大主流平台
|
||||
- **多种登录方式**:二维码、手机号、Cookie 三种登录方式
|
||||
- **多种存储方式**:CSV、JSON、SQLite、MySQL、MongoDB、Excel
|
||||
- **多种存储方式**:CSV、JSON、JSONL、SQLite、MySQL、MongoDB、Excel
|
||||
- **反爬虫对策**:CDP 模式、代理 IP 池、请求签名
|
||||
- **异步高并发**:基于 asyncio 的异步架构,高效并发爬取
|
||||
- **词云生成**:自动生成评论词云图
|
||||
@@ -441,6 +441,7 @@ class DouyinStoreFactory:
|
||||
|---------|--------|-----|---------|
|
||||
| CSV | `csv` | 简单、通用 | 小规模数据、快速查看 |
|
||||
| JSON | `json` | 结构完整、易解析 | API对接、数据交换 |
|
||||
| JSONL | `jsonl` | 追加写入、性能好 | 大规模数据、增量爬取(默认) |
|
||||
| SQLite | `sqlite` | 轻量、无需服务 | 本地开发、小型项目 |
|
||||
| MySQL | `db` | 性能好、支持并发 | 生产环境、大规模数据 |
|
||||
| MongoDB | `mongodb` | 灵活、易扩展 | 非结构化数据、快速迭代 |
|
||||
@@ -693,7 +694,7 @@ IP_PROXY_PROVIDER = "kuaidaili"
|
||||
IP_PROXY_POOL_COUNT = 2
|
||||
|
||||
# 存储配置
|
||||
SAVE_DATA_OPTION = "json" # csv, db, json, sqlite, mongodb, excel
|
||||
SAVE_DATA_OPTION = "jsonl" # csv, db, json, jsonl, sqlite, mongodb, excel, postgres
|
||||
```
|
||||
|
||||
### 8.2 数据库配置
|
||||
|
||||
2
main.py
2
main.py
@@ -84,7 +84,7 @@ def _flush_excel_if_needed() -> None:
|
||||
|
||||
|
||||
async def _generate_wordcloud_if_needed() -> None:
|
||||
if config.SAVE_DATA_OPTION != "json" or not config.ENABLE_GET_WORDCLOUD:
|
||||
if config.SAVE_DATA_OPTION not in ("json", "jsonl") or not config.ENABLE_GET_WORDCLOUD:
|
||||
return
|
||||
|
||||
try:
|
||||
|
||||
@@ -37,6 +37,7 @@ class BiliStoreFactory:
|
||||
"db": BiliDbStoreImplement,
|
||||
"postgres": BiliDbStoreImplement,
|
||||
"json": BiliJsonStoreImplement,
|
||||
"jsonl": BiliJsonlStoreImplement,
|
||||
"sqlite": BiliSqliteStoreImplement,
|
||||
"mongodb": BiliMongoStoreImplement,
|
||||
"excel": BiliExcelStoreImplement,
|
||||
|
||||
@@ -133,7 +133,7 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
content_item["user_id"] = int(content_item.get("user_id", 0) or 0)
|
||||
content_item["liked_count"] = int(content_item.get("liked_count", 0) or 0)
|
||||
content_item["create_time"] = int(content_item.get("create_time", 0) or 0)
|
||||
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliVideo).where(BilibiliVideo.video_id == video_id))
|
||||
video_detail = result.scalar_one_or_none()
|
||||
@@ -162,7 +162,7 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
comment_item["like_count"] = str(comment_item.get("like_count", "0"))
|
||||
comment_item["sub_comment_count"] = str(comment_item.get("sub_comment_count", "0"))
|
||||
comment_item["parent_comment_id"] = str(comment_item.get("parent_comment_id", "0"))
|
||||
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliVideoComment).where(BilibiliVideoComment.comment_id == comment_id))
|
||||
comment_detail = result.scalar_one_or_none()
|
||||
@@ -242,7 +242,7 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
"""
|
||||
dynamic_id = int(dynamic_item.get("dynamic_id"))
|
||||
dynamic_item["dynamic_id"] = dynamic_id
|
||||
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliUpDynamic).where(BilibiliUpDynamic.dynamic_id == dynamic_id))
|
||||
dynamic_detail = result.scalar_one_or_none()
|
||||
@@ -338,6 +338,44 @@ class BiliJsonStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
|
||||
class BiliJsonlStoreImplement(AbstractStore):
|
||||
def __init__(self):
|
||||
self.file_writer = AsyncFileWriter(
|
||||
crawler_type=crawler_type_var.get(),
|
||||
platform="bili"
|
||||
)
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
await self.file_writer.write_to_jsonl(
|
||||
item=content_item,
|
||||
item_type="contents"
|
||||
)
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
await self.file_writer.write_to_jsonl(
|
||||
item=comment_item,
|
||||
item_type="comments"
|
||||
)
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
await self.file_writer.write_to_jsonl(
|
||||
item=creator,
|
||||
item_type="creators"
|
||||
)
|
||||
|
||||
async def store_contact(self, contact_item: Dict):
|
||||
await self.file_writer.write_to_jsonl(
|
||||
item=contact_item,
|
||||
item_type="contacts"
|
||||
)
|
||||
|
||||
async def store_dynamic(self, dynamic_item: Dict):
|
||||
await self.file_writer.write_to_jsonl(
|
||||
item=dynamic_item,
|
||||
item_type="dynamics"
|
||||
)
|
||||
|
||||
|
||||
class BiliSqliteStoreImplement(BiliDbStoreImplement):
|
||||
pass
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ class DouyinStoreFactory:
|
||||
"db": DouyinDbStoreImplement,
|
||||
"postgres": DouyinDbStoreImplement,
|
||||
"json": DouyinJsonStoreImplement,
|
||||
"jsonl": DouyinJsonlStoreImplement,
|
||||
"sqlite": DouyinSqliteStoreImplement,
|
||||
"mongodb": DouyinMongoStoreImplement,
|
||||
"excel": DouyinExcelStoreImplement,
|
||||
|
||||
@@ -204,6 +204,32 @@ class DouyinJsonStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
|
||||
class DouyinJsonlStoreImplement(AbstractStore):
|
||||
def __init__(self):
|
||||
self.file_writer = AsyncFileWriter(
|
||||
crawler_type=crawler_type_var.get(),
|
||||
platform="douyin"
|
||||
)
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
await self.file_writer.write_to_jsonl(
|
||||
item=content_item,
|
||||
item_type="contents"
|
||||
)
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
await self.file_writer.write_to_jsonl(
|
||||
item=comment_item,
|
||||
item_type="comments"
|
||||
)
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
await self.file_writer.write_to_jsonl(
|
||||
item=creator,
|
||||
item_type="creators"
|
||||
)
|
||||
|
||||
|
||||
class DouyinSqliteStoreImplement(DouyinDbStoreImplement):
|
||||
pass
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ class KuaishouStoreFactory:
|
||||
"db": KuaishouDbStoreImplement,
|
||||
"postgres": KuaishouDbStoreImplement,
|
||||
"json": KuaishouJsonStoreImplement,
|
||||
"jsonl": KuaishouJsonlStoreImplement,
|
||||
"sqlite": KuaishouSqliteStoreImplement,
|
||||
"mongodb": KuaishouMongoStoreImplement,
|
||||
"excel": KuaishouExcelStoreImplement,
|
||||
|
||||
@@ -167,6 +167,21 @@ class KuaishouJsonStoreImplement(AbstractStore):
|
||||
pass
|
||||
|
||||
|
||||
class KuaishouJsonlStoreImplement(AbstractStore):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.writer = AsyncFileWriter(platform="kuaishou", crawler_type=crawler_type_var.get())
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="contents", item=content_item)
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="comments", item=comment_item)
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
pass
|
||||
|
||||
|
||||
class KuaishouSqliteStoreImplement(KuaishouDbStoreImplement):
|
||||
async def store_creator(self, creator: Dict):
|
||||
pass
|
||||
|
||||
@@ -33,6 +33,7 @@ class TieBaStoreFactory:
|
||||
"db": TieBaDbStoreImplement,
|
||||
"postgres": TieBaDbStoreImplement,
|
||||
"json": TieBaJsonStoreImplement,
|
||||
"jsonl": TieBaJsonlStoreImplement,
|
||||
"sqlite": TieBaSqliteStoreImplement,
|
||||
"mongodb": TieBaMongoStoreImplement,
|
||||
"excel": TieBaExcelStoreImplement,
|
||||
|
||||
@@ -195,6 +195,21 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
await self.writer.write_single_item_to_json(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class TieBaJsonlStoreImplement(AbstractStore):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.writer = AsyncFileWriter(platform="tieba", crawler_type=crawler_type_var.get())
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="contents", item=content_item)
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="comments", item=comment_item)
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class TieBaSqliteStoreImplement(TieBaDbStoreImplement):
|
||||
"""
|
||||
Tieba sqlite store implement
|
||||
|
||||
@@ -37,6 +37,7 @@ class WeibostoreFactory:
|
||||
"db": WeiboDbStoreImplement,
|
||||
"postgres": WeiboDbStoreImplement,
|
||||
"json": WeiboJsonStoreImplement,
|
||||
"jsonl": WeiboJsonlStoreImplement,
|
||||
"sqlite": WeiboSqliteStoreImplement,
|
||||
"mongodb": WeiboMongoStoreImplement,
|
||||
"excel": WeiboExcelStoreImplement,
|
||||
|
||||
@@ -226,6 +226,21 @@ class WeiboJsonStoreImplement(AbstractStore):
|
||||
await self.writer.write_single_item_to_json(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class WeiboJsonlStoreImplement(AbstractStore):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.writer = AsyncFileWriter(platform="weibo", crawler_type=crawler_type_var.get())
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="contents", item=content_item)
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="comments", item=comment_item)
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class WeiboSqliteStoreImplement(WeiboDbStoreImplement):
|
||||
"""
|
||||
Weibo content SQLite storage implementation
|
||||
|
||||
@@ -36,6 +36,7 @@ class XhsStoreFactory:
|
||||
"db": XhsDbStoreImplement,
|
||||
"postgres": XhsDbStoreImplement,
|
||||
"json": XhsJsonStoreImplement,
|
||||
"jsonl": XhsJsonlStoreImplement,
|
||||
"sqlite": XhsSqliteStoreImplement,
|
||||
"mongodb": XhsMongoStoreImplement,
|
||||
"excel": XhsExcelStoreImplement,
|
||||
|
||||
@@ -101,6 +101,24 @@ class XhsJsonStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
|
||||
class XhsJsonlStoreImplement(AbstractStore):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.writer = AsyncFileWriter(platform="xhs", crawler_type=crawler_type_var.get())
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="contents", item=content_item)
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="comments", item=comment_item)
|
||||
|
||||
async def store_creator(self, creator_item: Dict):
|
||||
pass
|
||||
|
||||
def flush(self):
|
||||
pass
|
||||
|
||||
|
||||
class XhsDbStoreImplement(AbstractStore):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -27,6 +27,7 @@ from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
|
||||
from ._store_impl import (ZhihuCsvStoreImplement,
|
||||
ZhihuDbStoreImplement,
|
||||
ZhihuJsonStoreImplement,
|
||||
ZhihuJsonlStoreImplement,
|
||||
ZhihuSqliteStoreImplement,
|
||||
ZhihuMongoStoreImplement,
|
||||
ZhihuExcelStoreImplement)
|
||||
@@ -40,6 +41,7 @@ class ZhihuStoreFactory:
|
||||
"db": ZhihuDbStoreImplement,
|
||||
"postgres": ZhihuDbStoreImplement,
|
||||
"json": ZhihuJsonStoreImplement,
|
||||
"jsonl": ZhihuJsonlStoreImplement,
|
||||
"sqlite": ZhihuSqliteStoreImplement,
|
||||
"mongodb": ZhihuMongoStoreImplement,
|
||||
"excel": ZhihuExcelStoreImplement,
|
||||
|
||||
@@ -203,6 +203,21 @@ class ZhihuJsonStoreImplement(AbstractStore):
|
||||
await self.writer.write_single_item_to_json(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class ZhihuJsonlStoreImplement(AbstractStore):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.writer = AsyncFileWriter(platform="zhihu", crawler_type=crawler_type_var.get())
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="contents", item=content_item)
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="comments", item=comment_item)
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
await self.writer.write_to_jsonl(item_type="creators", item=creator)
|
||||
|
||||
|
||||
class ZhihuSqliteStoreImplement(ZhihuDbStoreImplement):
|
||||
"""
|
||||
Zhihu content SQLite storage implementation
|
||||
|
||||
@@ -1,4 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (c) 2025 relakkes@gmail.com
|
||||
#
|
||||
# This file is part of MediaCrawler project.
|
||||
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/tests/test_store_factory.py
|
||||
# GitHub: https://github.com/NanmiCoder
|
||||
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||||
#
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
"""
|
||||
Unit tests for Store Factory functionality
|
||||
"""
|
||||
@@ -10,6 +27,7 @@ from store.xhs import XhsStoreFactory
|
||||
from store.xhs._store_impl import (
|
||||
XhsCsvStoreImplement,
|
||||
XhsJsonStoreImplement,
|
||||
XhsJsonlStoreImplement,
|
||||
XhsDbStoreImplement,
|
||||
XhsSqliteStoreImplement,
|
||||
XhsMongoStoreImplement,
|
||||
@@ -19,57 +37,63 @@ from store.xhs._store_impl import (
|
||||
|
||||
class TestXhsStoreFactory:
|
||||
"""Test cases for XhsStoreFactory"""
|
||||
|
||||
|
||||
@patch('config.SAVE_DATA_OPTION', 'csv')
|
||||
def test_create_csv_store(self):
|
||||
"""Test creating CSV store"""
|
||||
store = XhsStoreFactory.create_store()
|
||||
assert isinstance(store, XhsCsvStoreImplement)
|
||||
|
||||
|
||||
@patch('config.SAVE_DATA_OPTION', 'json')
|
||||
def test_create_json_store(self):
|
||||
"""Test creating JSON store"""
|
||||
store = XhsStoreFactory.create_store()
|
||||
assert isinstance(store, XhsJsonStoreImplement)
|
||||
|
||||
|
||||
@patch('config.SAVE_DATA_OPTION', 'db')
|
||||
def test_create_db_store(self):
|
||||
"""Test creating database store"""
|
||||
store = XhsStoreFactory.create_store()
|
||||
assert isinstance(store, XhsDbStoreImplement)
|
||||
|
||||
|
||||
@patch('config.SAVE_DATA_OPTION', 'sqlite')
|
||||
def test_create_sqlite_store(self):
|
||||
"""Test creating SQLite store"""
|
||||
store = XhsStoreFactory.create_store()
|
||||
assert isinstance(store, XhsSqliteStoreImplement)
|
||||
|
||||
|
||||
@patch('config.SAVE_DATA_OPTION', 'mongodb')
|
||||
def test_create_mongodb_store(self):
|
||||
"""Test creating MongoDB store"""
|
||||
store = XhsStoreFactory.create_store()
|
||||
assert isinstance(store, XhsMongoStoreImplement)
|
||||
|
||||
|
||||
@patch('config.SAVE_DATA_OPTION', 'excel')
|
||||
def test_create_excel_store(self):
|
||||
"""Test creating Excel store"""
|
||||
# ContextVar cannot be mocked, so we test with actual value
|
||||
store = XhsStoreFactory.create_store()
|
||||
assert isinstance(store, XhsExcelStoreImplement)
|
||||
|
||||
|
||||
@patch('config.SAVE_DATA_OPTION', 'jsonl')
|
||||
def test_create_jsonl_store(self):
|
||||
"""Test creating JSONL store"""
|
||||
store = XhsStoreFactory.create_store()
|
||||
assert isinstance(store, XhsJsonlStoreImplement)
|
||||
|
||||
@patch('config.SAVE_DATA_OPTION', 'invalid')
|
||||
def test_invalid_store_option(self):
|
||||
"""Test that invalid store option raises ValueError"""
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
XhsStoreFactory.create_store()
|
||||
|
||||
|
||||
assert "Invalid save option" in str(exc_info.value)
|
||||
|
||||
|
||||
def test_all_stores_registered(self):
|
||||
"""Test that all store types are registered"""
|
||||
expected_stores = ['csv', 'json', 'db', 'sqlite', 'mongodb', 'excel']
|
||||
|
||||
expected_stores = ['csv', 'json', 'jsonl', 'db', 'postgres', 'sqlite', 'mongodb', 'excel']
|
||||
|
||||
for store_type in expected_stores:
|
||||
assert store_type in XhsStoreFactory.STORES
|
||||
|
||||
|
||||
assert len(XhsStoreFactory.STORES) == len(expected_stores)
|
||||
|
||||
@@ -53,6 +53,12 @@ class AsyncFileWriter:
|
||||
await writer.writeheader()
|
||||
await writer.writerow(item)
|
||||
|
||||
async def write_to_jsonl(self, item: Dict, item_type: str):
|
||||
file_path = self._get_file_path('jsonl', item_type)
|
||||
async with self.lock:
|
||||
async with aiofiles.open(file_path, 'a', encoding='utf-8') as f:
|
||||
await f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||||
|
||||
async def write_single_item_to_json(self, item: Dict, item_type: str):
|
||||
file_path = self._get_file_path('json', item_type)
|
||||
async with self.lock:
|
||||
@@ -85,22 +91,32 @@ class AsyncFileWriter:
|
||||
return
|
||||
|
||||
try:
|
||||
# Read comments from JSON file
|
||||
comments_file_path = self._get_file_path('json', 'comments')
|
||||
if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
|
||||
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
|
||||
# Read comments from JSON or JSONL file
|
||||
comments_data = []
|
||||
jsonl_file_path = self._get_file_path('jsonl', 'comments')
|
||||
json_file_path = self._get_file_path('json', 'comments')
|
||||
|
||||
if os.path.exists(jsonl_file_path) and os.path.getsize(jsonl_file_path) > 0:
|
||||
async with aiofiles.open(jsonl_file_path, 'r', encoding='utf-8') as f:
|
||||
async for line in f:
|
||||
line = line.strip()
|
||||
if line:
|
||||
try:
|
||||
comments_data.append(json.loads(line))
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
elif os.path.exists(json_file_path) and os.path.getsize(json_file_path) > 0:
|
||||
async with aiofiles.open(json_file_path, 'r', encoding='utf-8') as f:
|
||||
content = await f.read()
|
||||
if content:
|
||||
comments_data = json.loads(content)
|
||||
if not isinstance(comments_data, list):
|
||||
comments_data = [comments_data]
|
||||
|
||||
if not comments_data:
|
||||
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments data found")
|
||||
return
|
||||
|
||||
async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
|
||||
content = await f.read()
|
||||
if not content:
|
||||
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
|
||||
return
|
||||
|
||||
comments_data = json.loads(content)
|
||||
if not isinstance(comments_data, list):
|
||||
comments_data = [comments_data]
|
||||
|
||||
# Filter comments data to only include 'content' field
|
||||
# Handle different comment data structures across platforms
|
||||
filtered_data = []
|
||||
|
||||
Reference in New Issue
Block a user