mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-02-06 07:01:02 +08:00
feat(database): add PostgreSQL support and fix Windows subprocess encoding
This commit is contained in:
46
.env.example
Normal file
46
.env.example
Normal file
@@ -0,0 +1,46 @@
|
||||
# MySQL Configuration
|
||||
MYSQL_DB_PWD=123456
|
||||
MYSQL_DB_USER=root
|
||||
MYSQL_DB_HOST=localhost
|
||||
MYSQL_DB_PORT=3306
|
||||
MYSQL_DB_NAME=media_crawler
|
||||
|
||||
# Redis Configuration
|
||||
REDIS_DB_HOST=127.0.0.1
|
||||
REDIS_DB_PWD=123456
|
||||
REDIS_DB_PORT=6379
|
||||
REDIS_DB_NUM=0
|
||||
|
||||
# MongoDB Configuration
|
||||
MONGODB_HOST=localhost
|
||||
MONGODB_PORT=27017
|
||||
MONGODB_USER=
|
||||
MONGODB_PWD=
|
||||
MONGODB_DB_NAME=media_crawler
|
||||
|
||||
# PostgreSQL Configuration
|
||||
POSTGRES_DB_PWD=123456
|
||||
POSTGRES_DB_USER=postgres
|
||||
POSTGRES_DB_HOST=localhost
|
||||
POSTGRES_DB_PORT=5432
|
||||
POSTGRES_DB_NAME=media_crawler
|
||||
|
||||
# Proxy Configuration (Wandou HTTP)
|
||||
# your_wandou_http_app_key
|
||||
WANDOU_APP_KEY=
|
||||
|
||||
# Proxy Configuration (Kuaidaili)
|
||||
# your_kuaidaili_secret_id
|
||||
KDL_SECERT_ID=
|
||||
# your_kuaidaili_signature
|
||||
KDL_SIGNATURE=
|
||||
# your_kuaidaili_username
|
||||
KDL_USER_NAME=
|
||||
# your_kuaidaili_password
|
||||
KDL_USER_PWD=
|
||||
|
||||
# Proxy Configuration (Jisu HTTP)
|
||||
# JiSu HTTP IP-extraction API key
|
||||
jisu_key=
|
||||
# JiSu HTTP IP-extraction encryption signature
|
||||
jisu_crypto=
|
||||
@@ -124,6 +124,7 @@ class CrawlerManager:
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
encoding='utf-8',
|
||||
bufsize=1,
|
||||
cwd=str(self._project_root),
|
||||
env={**os.environ, "PYTHONUNBUFFERED": "1"}
|
||||
|
||||
@@ -73,6 +73,7 @@ class SaveDataOptionEnum(str, Enum):
|
||||
SQLITE = "sqlite"
|
||||
MONGODB = "mongodb"
|
||||
EXCEL = "excel"
|
||||
POSTGRES = "postgres"
|
||||
|
||||
|
||||
class InitDbOptionEnum(str, Enum):
|
||||
@@ -80,6 +81,7 @@ class InitDbOptionEnum(str, Enum):
|
||||
|
||||
SQLITE = "sqlite"
|
||||
MYSQL = "mysql"
|
||||
POSTGRES = "postgres"
|
||||
|
||||
|
||||
def _to_bool(value: bool | str) -> bool:
|
||||
@@ -210,7 +212,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
SaveDataOptionEnum,
|
||||
typer.Option(
|
||||
"--save_data_option",
|
||||
help="Data save option (csv=CSV file | db=MySQL database | json=JSON file | sqlite=SQLite database | mongodb=MongoDB database | excel=Excel file)",
|
||||
help="Data save option (csv=CSV file | db=MySQL database | json=JSON file | sqlite=SQLite database | mongodb=MongoDB database | excel=Excel file | postgres=PostgreSQL database)",
|
||||
rich_help_panel="Storage Configuration",
|
||||
),
|
||||
] = _coerce_enum(
|
||||
@@ -220,7 +222,7 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
Optional[InitDbOptionEnum],
|
||||
typer.Option(
|
||||
"--init_db",
|
||||
help="Initialize database table structure (sqlite | mysql)",
|
||||
help="Initialize database table structure (sqlite | mysql | postgres)",
|
||||
rich_help_panel="Storage Configuration",
|
||||
),
|
||||
] = None,
|
||||
|
||||
@@ -70,8 +70,8 @@ BROWSER_LAUNCH_TIMEOUT = 60
|
||||
# 设置为False可以保持浏览器运行,便于调试
|
||||
AUTO_CLOSE_BROWSER = True
|
||||
|
||||
# 数据保存类型选项配置,支持五种类型:csv、db、json、sqlite、excel, 最好保存到DB,有排重的功能。
|
||||
SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel
|
||||
# 数据保存类型选项配置,支持七种类型:csv、db、json、sqlite、mongodb、excel、postgres, 最好保存到DB,有排重的功能。
|
||||
SAVE_DATA_OPTION = "json" # csv or db or json or sqlite or excel or postgres
|
||||
|
||||
# 用户浏览器缓存的浏览器文件配置
|
||||
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||
|
||||
@@ -37,7 +37,7 @@ mysql_db_config = {
|
||||
|
||||
|
||||
# redis config
|
||||
REDIS_DB_HOST = "127.0.0.1" # your redis host
|
||||
REDIS_DB_HOST = os.getenv("REDIS_DB_HOST", "127.0.0.1") # your redis host
|
||||
REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456") # your redis password
|
||||
REDIS_DB_PORT = os.getenv("REDIS_DB_PORT", 6379) # your redis port
|
||||
REDIS_DB_NUM = os.getenv("REDIS_DB_NUM", 0) # your redis db num
|
||||
@@ -67,3 +67,18 @@ mongodb_config = {
|
||||
"password": MONGODB_PWD,
|
||||
"db_name": MONGODB_DB_NAME,
|
||||
}
|
||||
|
||||
# postgres config
|
||||
POSTGRES_DB_PWD = os.getenv("POSTGRES_DB_PWD", "123456")
|
||||
POSTGRES_DB_USER = os.getenv("POSTGRES_DB_USER", "postgres")
|
||||
POSTGRES_DB_HOST = os.getenv("POSTGRES_DB_HOST", "localhost")
|
||||
POSTGRES_DB_PORT = os.getenv("POSTGRES_DB_PORT", 5432)
|
||||
POSTGRES_DB_NAME = os.getenv("POSTGRES_DB_NAME", "media_crawler")
|
||||
|
||||
postgres_db_config = {
|
||||
"user": POSTGRES_DB_USER,
|
||||
"password": POSTGRES_DB_PWD,
|
||||
"host": POSTGRES_DB_HOST,
|
||||
"port": POSTGRES_DB_PORT,
|
||||
"db_name": POSTGRES_DB_NAME,
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ from sqlalchemy.orm import sessionmaker
|
||||
from contextlib import asynccontextmanager
|
||||
from .models import Base
|
||||
import config
|
||||
from config.db_config import mysql_db_config, sqlite_db_config
|
||||
from config.db_config import mysql_db_config, sqlite_db_config, postgres_db_config
|
||||
|
||||
# Keep a cache of engines
|
||||
_engines = {}
|
||||
@@ -36,6 +36,18 @@ async def create_database_if_not_exists(db_type: str):
|
||||
async with engine.connect() as conn:
|
||||
await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {mysql_db_config['db_name']}"))
|
||||
await engine.dispose()
|
||||
elif db_type == "postgres":
|
||||
# Connect to the default 'postgres' database
|
||||
server_url = f"postgresql+asyncpg://{postgres_db_config['user']}:{postgres_db_config['password']}@{postgres_db_config['host']}:{postgres_db_config['port']}/postgres"
|
||||
print(f"[init_db] Connecting to Postgres: host={postgres_db_config['host']}, port={postgres_db_config['port']}, user={postgres_db_config['user']}, dbname=postgres")
|
||||
# Isolation level AUTOCOMMIT is required for CREATE DATABASE
|
||||
engine = create_async_engine(server_url, echo=False, isolation_level="AUTOCOMMIT")
|
||||
async with engine.connect() as conn:
|
||||
# Check if database exists
|
||||
result = await conn.execute(text(f"SELECT 1 FROM pg_database WHERE datname = '{postgres_db_config['db_name']}'"))
|
||||
if not result.scalar():
|
||||
await conn.execute(text(f"CREATE DATABASE {postgres_db_config['db_name']}"))
|
||||
await engine.dispose()
|
||||
|
||||
|
||||
def get_async_engine(db_type: str = None):
|
||||
@@ -52,6 +64,8 @@ def get_async_engine(db_type: str = None):
|
||||
db_url = f"sqlite+aiosqlite:///{sqlite_db_config['db_path']}"
|
||||
elif db_type == "mysql" or db_type == "db":
|
||||
db_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}"
|
||||
elif db_type == "postgres":
|
||||
db_url = f"postgresql+asyncpg://{postgres_db_config['user']}:{postgres_db_config['password']}@{postgres_db_config['host']}:{postgres_db_config['port']}/{postgres_db_config['db_name']}"
|
||||
else:
|
||||
raise ValueError(f"Unsupported database type: {db_type}")
|
||||
|
||||
|
||||
@@ -21,6 +21,9 @@ MediaCrawler 支持多种数据存储方式,您可以根据需求选择最适
|
||||
- **MySQL 数据库**:支持保存到关系型数据库 MySQL 中(需要提前创建数据库)
|
||||
1. 初始化:`--init_db mysql`
|
||||
2. 数据存储:`--save_data_option db`(db 参数为兼容历史更新保留)
|
||||
- **PostgreSQL 数据库**:支持保存到高级关系型数据库 PostgreSQL 中(推荐生产环境使用)
|
||||
1. 初始化:`--init_db postgres`
|
||||
2. 数据存储:`--save_data_option postgres`
|
||||
|
||||
#### 使用示例
|
||||
|
||||
@@ -41,6 +44,13 @@ uv run main.py --init_db mysql
|
||||
uv run main.py --platform xhs --lt qrcode --type search --save_data_option db
|
||||
```
|
||||
|
||||
```shell
|
||||
# 初始化 PostgreSQL 数据库
|
||||
uv run main.py --init_db postgres
|
||||
# 使用 PostgreSQL 存储数据
|
||||
uv run main.py --platform xhs --lt qrcode --type search --save_data_option postgres
|
||||
```
|
||||
|
||||
```shell
|
||||
# 使用 CSV 存储数据
|
||||
uv run main.py --platform xhs --lt qrcode --type search --save_data_option csv
|
||||
|
||||
@@ -38,6 +38,7 @@ dependencies = [
|
||||
"pytest>=7.4.0",
|
||||
"pytest-asyncio>=0.21.0",
|
||||
"websockets>=15.0.1",
|
||||
"asyncpg>=0.31.0",
|
||||
]
|
||||
|
||||
[[tool.uv.index]]
|
||||
|
||||
@@ -35,6 +35,7 @@ class BiliStoreFactory:
|
||||
STORES = {
|
||||
"csv": BiliCsvStoreImplement,
|
||||
"db": BiliDbStoreImplement,
|
||||
"postgres": BiliDbStoreImplement,
|
||||
"json": BiliJsonStoreImplement,
|
||||
"sqlite": BiliSqliteStoreImplement,
|
||||
"mongodb": BiliMongoStoreImplement,
|
||||
|
||||
@@ -128,16 +128,23 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
Args:
|
||||
content_item: content item dict
|
||||
"""
|
||||
video_id = content_item.get("video_id")
|
||||
video_id = int(content_item.get("video_id"))
|
||||
content_item["video_id"] = video_id
|
||||
content_item["user_id"] = int(content_item.get("user_id", 0) or 0)
|
||||
content_item["liked_count"] = int(content_item.get("liked_count", 0) or 0)
|
||||
content_item["create_time"] = int(content_item.get("create_time", 0) or 0)
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliVideo).where(BilibiliVideo.video_id == video_id))
|
||||
video_detail = result.scalar_one_or_none()
|
||||
|
||||
if not video_detail:
|
||||
content_item["add_ts"] = utils.get_current_timestamp()
|
||||
content_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
new_content = BilibiliVideo(**content_item)
|
||||
session.add(new_content)
|
||||
else:
|
||||
content_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
for key, value in content_item.items():
|
||||
setattr(video_detail, key, value)
|
||||
await session.commit()
|
||||
@@ -148,16 +155,25 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
Args:
|
||||
comment_item: comment item dict
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
comment_id = int(comment_item.get("comment_id"))
|
||||
comment_item["comment_id"] = comment_id
|
||||
comment_item["video_id"] = int(comment_item.get("video_id", 0) or 0)
|
||||
comment_item["create_time"] = int(comment_item.get("create_time", 0) or 0)
|
||||
comment_item["like_count"] = str(comment_item.get("like_count", "0"))
|
||||
comment_item["sub_comment_count"] = str(comment_item.get("sub_comment_count", "0"))
|
||||
comment_item["parent_comment_id"] = str(comment_item.get("parent_comment_id", "0"))
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliVideoComment).where(BilibiliVideoComment.comment_id == comment_id))
|
||||
comment_detail = result.scalar_one_or_none()
|
||||
|
||||
if not comment_detail:
|
||||
comment_item["add_ts"] = utils.get_current_timestamp()
|
||||
comment_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
new_comment = BilibiliVideoComment(**comment_item)
|
||||
session.add(new_comment)
|
||||
else:
|
||||
comment_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
for key, value in comment_item.items():
|
||||
setattr(comment_detail, key, value)
|
||||
await session.commit()
|
||||
@@ -168,16 +184,24 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
Args:
|
||||
creator: creator item dict
|
||||
"""
|
||||
creator_id = creator.get("user_id")
|
||||
creator_id = int(creator.get("user_id"))
|
||||
creator["user_id"] = creator_id
|
||||
creator["total_fans"] = int(creator.get("total_fans", 0) or 0)
|
||||
creator["total_liked"] = int(creator.get("total_liked", 0) or 0)
|
||||
creator["user_rank"] = int(creator.get("user_rank", 0) or 0)
|
||||
creator["is_official"] = int(creator.get("is_official", 0) or 0)
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliUpInfo).where(BilibiliUpInfo.user_id == creator_id))
|
||||
creator_detail = result.scalar_one_or_none()
|
||||
|
||||
if not creator_detail:
|
||||
creator["add_ts"] = utils.get_current_timestamp()
|
||||
creator["last_modify_ts"] = utils.get_current_timestamp()
|
||||
new_creator = BilibiliUpInfo(**creator)
|
||||
session.add(new_creator)
|
||||
else:
|
||||
creator["last_modify_ts"] = utils.get_current_timestamp()
|
||||
for key, value in creator.items():
|
||||
setattr(creator_detail, key, value)
|
||||
await session.commit()
|
||||
@@ -188,8 +212,11 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
Args:
|
||||
contact_item: contact item dict
|
||||
"""
|
||||
up_id = contact_item.get("up_id")
|
||||
fan_id = contact_item.get("fan_id")
|
||||
up_id = int(contact_item.get("up_id"))
|
||||
fan_id = int(contact_item.get("fan_id"))
|
||||
contact_item["up_id"] = up_id
|
||||
contact_item["fan_id"] = fan_id
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(
|
||||
select(BilibiliContactInfo).where(BilibiliContactInfo.up_id == up_id, BilibiliContactInfo.fan_id == fan_id)
|
||||
@@ -198,9 +225,11 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
|
||||
if not contact_detail:
|
||||
contact_item["add_ts"] = utils.get_current_timestamp()
|
||||
contact_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
new_contact = BilibiliContactInfo(**contact_item)
|
||||
session.add(new_contact)
|
||||
else:
|
||||
contact_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
for key, value in contact_item.items():
|
||||
setattr(contact_detail, key, value)
|
||||
await session.commit()
|
||||
@@ -211,16 +240,20 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
Args:
|
||||
dynamic_item: dynamic item dict
|
||||
"""
|
||||
dynamic_id = dynamic_item.get("dynamic_id")
|
||||
dynamic_id = int(dynamic_item.get("dynamic_id"))
|
||||
dynamic_item["dynamic_id"] = dynamic_id
|
||||
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(BilibiliUpDynamic).where(BilibiliUpDynamic.dynamic_id == dynamic_id))
|
||||
dynamic_detail = result.scalar_one_or_none()
|
||||
|
||||
if not dynamic_detail:
|
||||
dynamic_item["add_ts"] = utils.get_current_timestamp()
|
||||
dynamic_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
new_dynamic = BilibiliUpDynamic(**dynamic_item)
|
||||
session.add(new_dynamic)
|
||||
else:
|
||||
dynamic_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
for key, value in dynamic_item.items():
|
||||
setattr(dynamic_detail, key, value)
|
||||
await session.commit()
|
||||
|
||||
@@ -34,6 +34,7 @@ class DouyinStoreFactory:
|
||||
STORES = {
|
||||
"csv": DouyinCsvStoreImplement,
|
||||
"db": DouyinDbStoreImplement,
|
||||
"postgres": DouyinDbStoreImplement,
|
||||
"json": DouyinJsonStoreImplement,
|
||||
"sqlite": DouyinSqliteStoreImplement,
|
||||
"mongodb": DouyinMongoStoreImplement,
|
||||
|
||||
@@ -97,7 +97,7 @@ class DouyinDbStoreImplement(AbstractStore):
|
||||
Args:
|
||||
content_item: content item dict
|
||||
"""
|
||||
aweme_id = content_item.get("aweme_id")
|
||||
aweme_id = int(content_item.get("aweme_id"))
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(DouyinAweme).where(DouyinAweme.aweme_id == aweme_id))
|
||||
aweme_detail = result.scalar_one_or_none()
|
||||
@@ -118,7 +118,7 @@ class DouyinDbStoreImplement(AbstractStore):
|
||||
Args:
|
||||
comment_item: comment item dict
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
comment_id = int(comment_item.get("comment_id"))
|
||||
async with get_session() as session:
|
||||
result = await session.execute(select(DouyinAwemeComment).where(DouyinAwemeComment.comment_id == comment_id))
|
||||
comment_detail = result.scalar_one_or_none()
|
||||
|
||||
@@ -34,6 +34,7 @@ class KuaishouStoreFactory:
|
||||
STORES = {
|
||||
"csv": KuaishouCsvStoreImplement,
|
||||
"db": KuaishouDbStoreImplement,
|
||||
"postgres": KuaishouDbStoreImplement,
|
||||
"json": KuaishouJsonStoreImplement,
|
||||
"sqlite": KuaishouSqliteStoreImplement,
|
||||
"mongodb": KuaishouMongoStoreImplement,
|
||||
|
||||
@@ -109,7 +109,8 @@ class KuaishouDbStoreImplement(AbstractStore):
|
||||
session.add(new_content)
|
||||
else:
|
||||
for key, value in content_item.items():
|
||||
setattr(video_detail, key, value)
|
||||
if hasattr(video_detail, key):
|
||||
setattr(video_detail, key, value)
|
||||
await session.commit()
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
@@ -130,7 +131,8 @@ class KuaishouDbStoreImplement(AbstractStore):
|
||||
session.add(new_comment)
|
||||
else:
|
||||
for key, value in comment_item.items():
|
||||
setattr(comment_detail, key, value)
|
||||
if hasattr(comment_detail, key):
|
||||
setattr(comment_detail, key, value)
|
||||
await session.commit()
|
||||
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@ class TieBaStoreFactory:
|
||||
STORES = {
|
||||
"csv": TieBaCsvStoreImplement,
|
||||
"db": TieBaDbStoreImplement,
|
||||
"postgres": TieBaDbStoreImplement,
|
||||
"json": TieBaJsonStoreImplement,
|
||||
"sqlite": TieBaSqliteStoreImplement,
|
||||
"mongodb": TieBaMongoStoreImplement,
|
||||
|
||||
@@ -35,6 +35,7 @@ class WeibostoreFactory:
|
||||
STORES = {
|
||||
"csv": WeiboCsvStoreImplement,
|
||||
"db": WeiboDbStoreImplement,
|
||||
"postgres": WeiboDbStoreImplement,
|
||||
"json": WeiboJsonStoreImplement,
|
||||
"sqlite": WeiboSqliteStoreImplement,
|
||||
"mongodb": WeiboMongoStoreImplement,
|
||||
|
||||
@@ -108,7 +108,8 @@ class WeiboDbStoreImplement(AbstractStore):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
note_id = content_item.get("note_id")
|
||||
note_id = int(content_item.get("note_id"))
|
||||
content_item["note_id"] = note_id
|
||||
async with get_session() as session:
|
||||
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
|
||||
res = await session.execute(stmt)
|
||||
@@ -134,7 +135,14 @@ class WeiboDbStoreImplement(AbstractStore):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
comment_id = comment_item.get("comment_id")
|
||||
comment_id = int(comment_item.get("comment_id"))
|
||||
comment_item["comment_id"] = comment_id
|
||||
comment_item["note_id"] = int(comment_item.get("note_id", 0) or 0)
|
||||
comment_item["create_time"] = int(comment_item.get("create_time", 0) or 0)
|
||||
comment_item["comment_like_count"] = str(comment_item.get("comment_like_count", "0"))
|
||||
comment_item["sub_comment_count"] = str(comment_item.get("sub_comment_count", "0"))
|
||||
comment_item["parent_comment_id"] = str(comment_item.get("parent_comment_id", "0"))
|
||||
|
||||
async with get_session() as session:
|
||||
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
|
||||
res = await session.execute(stmt)
|
||||
@@ -160,7 +168,8 @@ class WeiboDbStoreImplement(AbstractStore):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
user_id = creator.get("user_id")
|
||||
user_id = int(creator.get("user_id"))
|
||||
creator["user_id"] = user_id
|
||||
async with get_session() as session:
|
||||
stmt = select(WeiboCreator).where(WeiboCreator.user_id == user_id)
|
||||
res = await session.execute(stmt)
|
||||
|
||||
@@ -34,6 +34,7 @@ class XhsStoreFactory:
|
||||
STORES = {
|
||||
"csv": XhsCsvStoreImplement,
|
||||
"db": XhsDbStoreImplement,
|
||||
"postgres": XhsDbStoreImplement,
|
||||
"json": XhsJsonStoreImplement,
|
||||
"sqlite": XhsSqliteStoreImplement,
|
||||
"mongodb": XhsMongoStoreImplement,
|
||||
|
||||
@@ -189,9 +189,9 @@ class XhsDbStoreImplement(AbstractStore):
|
||||
create_time=comment_item.get("create_time"),
|
||||
note_id=comment_item.get("note_id"),
|
||||
content=comment_item.get("content"),
|
||||
sub_comment_count=comment_item.get("sub_comment_count"),
|
||||
sub_comment_count=int(comment_item.get("sub_comment_count", 0) or 0),
|
||||
pictures=json.dumps(comment_item.get("pictures")),
|
||||
parent_comment_id=comment_item.get("parent_comment_id"),
|
||||
parent_comment_id=str(comment_item.get("parent_comment_id", "")),
|
||||
like_count=str(comment_item.get("like_count"))
|
||||
)
|
||||
session.add(comment)
|
||||
@@ -202,7 +202,7 @@ class XhsDbStoreImplement(AbstractStore):
|
||||
update_data = {
|
||||
"last_modify_ts": last_modify_ts,
|
||||
"like_count": str(comment_item.get("like_count")),
|
||||
"sub_comment_count": comment_item.get("sub_comment_count"),
|
||||
"sub_comment_count": int(comment_item.get("sub_comment_count", 0) or 0),
|
||||
}
|
||||
stmt = update(XhsNoteComment).where(XhsNoteComment.comment_id == comment_id).values(**update_data)
|
||||
await session.execute(stmt)
|
||||
|
||||
@@ -38,6 +38,7 @@ class ZhihuStoreFactory:
|
||||
STORES = {
|
||||
"csv": ZhihuCsvStoreImplement,
|
||||
"db": ZhihuDbStoreImplement,
|
||||
"postgres": ZhihuDbStoreImplement,
|
||||
"json": ZhihuJsonStoreImplement,
|
||||
"sqlite": ZhihuSqliteStoreImplement,
|
||||
"mongodb": ZhihuMongoStoreImplement,
|
||||
|
||||
@@ -110,7 +110,8 @@ class ZhihuDbStoreImplement(AbstractStore):
|
||||
existing_content = result.scalars().first()
|
||||
if existing_content:
|
||||
for key, value in content_item.items():
|
||||
setattr(existing_content, key, value)
|
||||
if hasattr(existing_content, key):
|
||||
setattr(existing_content, key, value)
|
||||
else:
|
||||
new_content = ZhihuContent(**content_item)
|
||||
session.add(new_content)
|
||||
@@ -129,7 +130,8 @@ class ZhihuDbStoreImplement(AbstractStore):
|
||||
existing_comment = result.scalars().first()
|
||||
if existing_comment:
|
||||
for key, value in comment_item.items():
|
||||
setattr(existing_comment, key, value)
|
||||
if hasattr(existing_comment, key):
|
||||
setattr(existing_comment, key, value)
|
||||
else:
|
||||
new_comment = ZhihuComment(**comment_item)
|
||||
session.add(new_comment)
|
||||
@@ -148,7 +150,8 @@ class ZhihuDbStoreImplement(AbstractStore):
|
||||
existing_creator = result.scalars().first()
|
||||
if existing_creator:
|
||||
for key, value in creator.items():
|
||||
setattr(existing_creator, key, value)
|
||||
if hasattr(existing_creator, key):
|
||||
setattr(existing_creator, key, value)
|
||||
else:
|
||||
new_creator = ZhihuCreator(**creator)
|
||||
session.add(new_creator)
|
||||
|
||||
@@ -228,7 +228,7 @@ class BrowserLauncher:
|
||||
# Try to get version info
|
||||
try:
|
||||
result = subprocess.run([browser_path, "--version"],
|
||||
capture_output=True, text=True, timeout=5)
|
||||
capture_output=True, text=True, encoding='utf-8', errors='ignore', timeout=5)
|
||||
version = result.stdout.strip() if result.stdout else "Unknown Version"
|
||||
except:
|
||||
version = "Unknown Version"
|
||||
@@ -266,6 +266,8 @@ class BrowserLauncher:
|
||||
["taskkill", "/F", "/T", "/PID", str(process.pid)],
|
||||
capture_output=True,
|
||||
check=False,
|
||||
encoding='utf-8',
|
||||
errors='ignore'
|
||||
)
|
||||
process.wait(timeout=5)
|
||||
else:
|
||||
|
||||
51
uv.lock
generated
51
uv.lock
generated
@@ -1,5 +1,4 @@
|
||||
version = 1
|
||||
revision = 1
|
||||
requires-python = ">=3.11"
|
||||
resolution-markers = [
|
||||
"python_full_version >= '3.12' and sys_platform == 'darwin'",
|
||||
@@ -113,6 +112,54 @@ wheels = [
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/94/08/7de4f4a17196c355e4706ceba0ab60627541c78011881a7c69f41c6414c5/asyncmy-0.2.10-cp312-cp312-win_amd64.whl", hash = "sha256:4c6674073be97ffb7ac7f909e803008b23e50281131fef4e30b7b2162141a574", size = 1679064 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "asyncpg"
|
||||
version = "0.31.0"
|
||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fe/cc/d18065ce2380d80b1bcce927c24a2642efd38918e33fd724bc4bca904877/asyncpg-0.31.0.tar.gz", hash = "sha256:c989386c83940bfbd787180f2b1519415e2d3d6277a70d9d0f0145ac73500735", size = 993667 }
|
||||
wheels = [
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/08/17/cc02bc49bc350623d050fa139e34ea512cd6e020562f2a7312a7bcae4bc9/asyncpg-0.31.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:eee690960e8ab85063ba93af2ce128c0f52fd655fdff9fdb1a28df01329f031d", size = 643159 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/a4/62/4ded7d400a7b651adf06f49ea8f73100cca07c6df012119594d1e3447aa6/asyncpg-0.31.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2657204552b75f8288de08ca60faf4a99a65deef3a71d1467454123205a88fab", size = 638157 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/d6/5b/4179538a9a72166a0bf60ad783b1ef16efb7960e4d7b9afe9f77a5551680/asyncpg-0.31.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a429e842a3a4b4ea240ea52d7fe3f82d5149853249306f7ff166cb9948faa46c", size = 2918051 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/e6/35/c27719ae0536c5b6e61e4701391ffe435ef59539e9360959240d6e47c8c8/asyncpg-0.31.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0807be46c32c963ae40d329b3a686356e417f674c976c07fa49f1b30303f109", size = 2972640 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/43/f4/01ebb9207f29e645a64699b9ce0eefeff8e7a33494e1d29bb53736f7766b/asyncpg-0.31.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e5d5098f63beeae93512ee513d4c0c53dc12e9aa2b7a1af5a81cddf93fe4e4da", size = 2851050 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/3e/f4/03ff1426acc87be0f4e8d40fa2bff5c3952bef0080062af9efc2212e3be8/asyncpg-0.31.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37fc6c00a814e18eef51833545d1891cac9aa69140598bb076b4cd29b3e010b9", size = 2962574 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/c7/39/cc788dfca3d4060f9d93e67be396ceec458dfc429e26139059e58c2c244d/asyncpg-0.31.0-cp311-cp311-win32.whl", hash = "sha256:5a4af56edf82a701aece93190cc4e094d2df7d33f6e915c222fb09efbb5afc24", size = 521076 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/28/fc/735af5384c029eb7f1ca60ccb8fa95521dbdaeef788edf4cecfc604c3cab/asyncpg-0.31.0-cp311-cp311-win_amd64.whl", hash = "sha256:480c4befbdf079c14c9ca43c8c5e1fe8b6296c96f1f927158d4f1e750aacc047", size = 584980 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/2a/a6/59d0a146e61d20e18db7396583242e32e0f120693b67a8de43f1557033e2/asyncpg-0.31.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b44c31e1efc1c15188ef183f287c728e2046abb1d26af4d20858215d50d91fad", size = 662042 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/36/01/ffaa189dcb63a2471720615e60185c3f6327716fdc0fc04334436fbb7c65/asyncpg-0.31.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0c89ccf741c067614c9b5fc7f1fc6f3b61ab05ae4aaa966e6fd6b93097c7d20d", size = 638504 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/9f/62/3f699ba45d8bd24c5d65392190d19656d74ff0185f42e19d0bbd973bb371/asyncpg-0.31.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:12b3b2e39dc5470abd5e98c8d3373e4b1d1234d9fbdedf538798b2c13c64460a", size = 3426241 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/d1/a867c2150f9c6e7af6462637f613ba67f78a314b00db220cd26ff559d532/asyncpg-0.31.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:aad7a33913fb8bcb5454313377cc330fbb19a0cd5faa7272407d8a0c4257b671", size = 3520321 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/1a/cce4c3f246805ecd285a3591222a2611141f1669d002163abef999b60f98/asyncpg-0.31.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3df118d94f46d85b2e434fd62c84cb66d5834d5a890725fe625f498e72e4d5ec", size = 3316685 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/40/ae/0fc961179e78cc579e138fad6eb580448ecae64908f95b8cb8ee2f241f67/asyncpg-0.31.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bd5b6efff3c17c3202d4b37189969acf8927438a238c6257f66be3c426beba20", size = 3471858 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/52/b2/b20e09670be031afa4cbfabd645caece7f85ec62d69c312239de568e058e/asyncpg-0.31.0-cp312-cp312-win32.whl", hash = "sha256:027eaa61361ec735926566f995d959ade4796f6a49d3bde17e5134b9964f9ba8", size = 527852 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/b5/f0/f2ed1de154e15b107dc692262395b3c17fc34eafe2a78fc2115931561730/asyncpg-0.31.0-cp312-cp312-win_amd64.whl", hash = "sha256:72d6bdcbc93d608a1158f17932de2321f68b1a967a13e014998db87a72ed3186", size = 597175 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/95/11/97b5c2af72a5d0b9bc3fa30cd4b9ce22284a9a943a150fdc768763caf035/asyncpg-0.31.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c204fab1b91e08b0f47e90a75d1b3c62174dab21f670ad6c5d0f243a228f015b", size = 661111 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/1b/71/157d611c791a5e2d0423f09f027bd499935f0906e0c2a416ce712ba51ef3/asyncpg-0.31.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:54a64f91839ba59008eccf7aad2e93d6e3de688d796f35803235ea1c4898ae1e", size = 636928 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/2e/fc/9e3486fb2bbe69d4a867c0b76d68542650a7ff1574ca40e84c3111bb0c6e/asyncpg-0.31.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c0e0822b1038dc7253b337b0f3f676cadc4ac31b126c5d42691c39691962e403", size = 3424067 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/c6/8c9d076f73f07f995013c791e018a1cd5f31823c2a3187fc8581706aa00f/asyncpg-0.31.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bef056aa502ee34204c161c72ca1f3c274917596877f825968368b2c33f585f4", size = 3518156 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/3b/60683a0baf50fbc546499cfb53132cb6835b92b529a05f6a81471ab60d0c/asyncpg-0.31.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0bfbcc5b7ffcd9b75ab1558f00db2ae07db9c80637ad1b2469c43df79d7a5ae2", size = 3319636 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/dc/8487df0f69bd398a61e1792b3cba0e47477f214eff085ba0efa7eac9ce87/asyncpg-0.31.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:22bc525ebbdc24d1261ecbf6f504998244d4e3be1721784b5f64664d61fbe602", size = 3472079 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/13/a1/c5bbeeb8531c05c89135cb8b28575ac2fac618bcb60119ee9696c3faf71c/asyncpg-0.31.0-cp313-cp313-win32.whl", hash = "sha256:f890de5e1e4f7e14023619399a471ce4b71f5418cd67a51853b9910fdfa73696", size = 527606 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/91/66/b25ccb84a246b470eb943b0107c07edcae51804912b824054b3413995a10/asyncpg-0.31.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc5f2fa9916f292e5c5c8b2ac2813763bcd7f58e130055b4ad8a0531314201ab", size = 596569 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/36/e9450d62e84a13aea6580c83a47a437f26c7ca6fa0f0fd40b6670793ea30/asyncpg-0.31.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f6b56b91bb0ffc328c4e3ed113136cddd9deefdf5f79ab448598b9772831df44", size = 660867 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/82/4b/1d0a2b33b3102d210439338e1beea616a6122267c0df459ff0265cd5807a/asyncpg-0.31.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:334dec28cf20d7f5bb9e45b39546ddf247f8042a690bff9b9573d00086e69cb5", size = 638349 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/41/aa/e7f7ac9a7974f08eff9183e392b2d62516f90412686532d27e196c0f0eeb/asyncpg-0.31.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:98cc158c53f46de7bb677fd20c417e264fc02b36d901cc2a43bd6cb0dc6dbfd2", size = 3410428 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/6f/de/bf1b60de3dede5c2731e6788617a512bc0ebd9693eac297ee74086f101d7/asyncpg-0.31.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9322b563e2661a52e3cdbc93eed3be7748b289f792e0011cb2720d278b366ce2", size = 3471678 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/46/78/fc3ade003e22d8bd53aaf8f75f4be48f0b460fa73738f0391b9c856a9147/asyncpg-0.31.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19857a358fc811d82227449b7ca40afb46e75b33eb8897240c3839dd8b744218", size = 3313505 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/e9/73eb8a6789e927816f4705291be21f2225687bfa97321e40cd23055e903a/asyncpg-0.31.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ba5f8886e850882ff2c2ace5732300e99193823e8107e2c53ef01c1ebfa1e85d", size = 3434744 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/08/4b/f10b880534413c65c5b5862f79b8e81553a8f364e5238832ad4c0af71b7f/asyncpg-0.31.0-cp314-cp314-win32.whl", hash = "sha256:cea3a0b2a14f95834cee29432e4ddc399b95700eb1d51bbc5bfee8f31fa07b2b", size = 532251 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/d3/2d/7aa40750b7a19efa5d66e67fc06008ca0f27ba1bd082e457ad82f59aba49/asyncpg-0.31.0-cp314-cp314-win_amd64.whl", hash = "sha256:04d19392716af6b029411a0264d92093b6e5e8285ae97a39957b9a9c14ea72be", size = 604901 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ce/fe/b9dfe349b83b9dee28cc42360d2c86b2cdce4cb551a2c2d27e156bcac84d/asyncpg-0.31.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bdb957706da132e982cc6856bb2f7b740603472b54c3ebc77fe60ea3e57e1bd2", size = 702280 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/6a/81/e6be6e37e560bd91e6c23ea8a6138a04fd057b08cf63d3c5055c98e81c1d/asyncpg-0.31.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6d11b198111a72f47154fa03b85799f9be63701e068b43f84ac25da0bda9cb31", size = 682931 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/a6/45/6009040da85a1648dd5bc75b3b0a062081c483e75a1a29041ae63a0bf0dc/asyncpg-0.31.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18c83b03bc0d1b23e6230f5bf8d4f217dc9bc08644ce0502a9d91dc9e634a9c7", size = 3581608 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/06/2e3d4d7608b0b2b3adbee0d0bd6a2d29ca0fc4d8a78f8277df04e2d1fd7b/asyncpg-0.31.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e009abc333464ff18b8f6fd146addffd9aaf63e79aa3bb40ab7a4c332d0c5e9e", size = 3498738 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/7d/aa/7d75ede780033141c51d83577ea23236ba7d3a23593929b32b49db8ed36e/asyncpg-0.31.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3b1fbcb0e396a5ca435a8826a87e5c2c2cc0c8c68eb6fadf82168056b0e53a8c", size = 3401026 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/7a/15e37d45e7f7c94facc1e9148c0e455e8f33c08f0b8a0b1deb2c5171771b/asyncpg-0.31.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8df714dba348efcc162d2adf02d213e5fab1bd9f557e1305633e851a61814a7a", size = 3429426 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/13/d5/71437c5f6ae5f307828710efbe62163974e71237d5d46ebd2869ea052d10/asyncpg-0.31.0-cp314-cp314t-win32.whl", hash = "sha256:1b41f1afb1033f2b44f3234993b15096ddc9cd71b21a42dbd87fc6a57b43d65d", size = 614495 },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/d7/8fb3044eaef08a310acfe23dae9a8e2e07d305edc29a53497e52bc76eca7/asyncpg-0.31.0-cp314-cp314t-win_amd64.whl", hash = "sha256:bd4107bb7cdd0e9e65fae66a62afd3a249663b844fa34d479f6d5b3bef9c04c3", size = 706062 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2025.6.15"
|
||||
@@ -788,6 +835,7 @@ dependencies = [
|
||||
{ name = "aiosqlite" },
|
||||
{ name = "alembic" },
|
||||
{ name = "asyncmy" },
|
||||
{ name = "asyncpg" },
|
||||
{ name = "cryptography" },
|
||||
{ name = "fastapi" },
|
||||
{ name = "httpx" },
|
||||
@@ -824,6 +872,7 @@ requires-dist = [
|
||||
{ name = "aiosqlite", specifier = ">=0.21.0" },
|
||||
{ name = "alembic", specifier = ">=1.16.5" },
|
||||
{ name = "asyncmy", specifier = ">=0.2.10" },
|
||||
{ name = "asyncpg", specifier = ">=0.31.0" },
|
||||
{ name = "cryptography", specifier = ">=45.0.7" },
|
||||
{ name = "fastapi", specifier = "==0.110.2" },
|
||||
{ name = "httpx", specifier = "==0.28.1" },
|
||||
|
||||
Reference in New Issue
Block a user