finish_all

This commit is contained in:
Bowenwin
2025-05-22 22:06:06 +08:00
parent 44e3d370ff
commit 59619fff0a
5 changed files with 74 additions and 18 deletions

View File

@@ -46,7 +46,7 @@ HEADLESS = False
SAVE_LOGIN_STATE = True
# 数据保存类型选项配置,支持三种类型csv、db、json, 最好保存到DB有排重的功能。
SAVE_DATA_OPTION = "csv" # csv or db or json
SAVE_DATA_OPTION = "json" # csv or db or json
# 用户浏览器缓存的浏览器文件配置
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
@@ -72,12 +72,6 @@ ENABLE_GET_COMMENTS = True
# 爬取一级评论的数量控制(单视频/帖子)
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10
# 爬取作者粉丝和关注列表数量控制(单作者)
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
# 爬取作者动态粉丝和关注列表数量控制(单作者)
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50
# 是否开启爬二级评论模式, 默认不开启爬二级评论
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
ENABLE_GET_SUB_COMMENTS = False
@@ -206,4 +200,10 @@ END_DAY = '2024-01-01'
# 是否开启按每一天进行爬取的选项,仅支持 bilibili 关键字搜索
# 若为 False则忽略 START_DAY 与 END_DAY 设置的值
# 若为 True则按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下的所有视频
ALL_DAY = False
ALL_DAY = False
# 爬取作者粉丝和关注列表数量控制(单作者)
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
# Maximum number of dynamics (posts) to crawl per creator
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50

View File

@@ -512,8 +512,8 @@ class BilibiliCrawler(AbstractCrawler):
"sign": creator_unhandled_info.get("sign"),
"avatar": creator_unhandled_info.get("face"),
}
# await self.get_fans(creator_info, semaphore)
# await self.get_followings(creator_info, semaphore)
await self.get_fans(creator_info, semaphore)
await self.get_followings(creator_info, semaphore)
await self.get_dynamics(creator_info, semaphore)
async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):

View File

@@ -179,9 +179,9 @@ async def batch_update_bilibili_creator_dynamics(creator_info: Dict, dynamics_li
"text": dynamic_text,
"type": dynamic_type,
"pub_ts": dynamic_pub_ts,
"comment": dynamic_comment,
"forward": dynamic_forward,
"like": dynamic_like,
"total_comments": dynamic_comment,
"total_forwards": dynamic_forward,
"total_liked": dynamic_like,
}
await update_bilibili_creator_dynamic(creator_info=creator_info, dynamic_info=dynamic_info)
@@ -210,9 +210,9 @@ async def update_bilibili_creator_dynamic(creator_info: Dict, dynamic_info: Dict
"text": dynamic_info["text"],
"type": dynamic_info["type"],
"pub_ts": dynamic_info["pub_ts"],
"comment": dynamic_info["comment"],
"forward": dynamic_info["forward"],
"like": dynamic_info["like"],
"total_comments": dynamic_info["total_comments"],
"total_forwards": dynamic_info["total_forwards"],
"total_liked": dynamic_info["total_liked"],
"last_modify_ts": utils.get_current_timestamp(),
}

View File

@@ -196,7 +196,7 @@ class BiliDbStoreImplement(AbstractStore):
creator["add_ts"] = utils.get_current_timestamp()
await add_new_creator(creator)
else:
await update_creator_by_creator_id(creator_id,creator_item=creator)
await update_creator_by_creator_id(creator_id, creator_item=creator)
async def store_contact(self, contact_item: Dict):
"""
@@ -232,6 +232,17 @@ class BiliDbStoreImplement(AbstractStore):
"""
from .bilibili_store_sql import (add_new_dynamic,
query_dynamic_by_dynamic_id,
update_dynamic_by_dynamic_id)
dynamic_id = dynamic_item.get("dynamic_id")
dynamic_detail = await query_dynamic_by_dynamic_id(dynamic_id=dynamic_id)
if not dynamic_detail:
dynamic_item["add_ts"] = utils.get_current_timestamp()
await add_new_dynamic(dynamic_item)
else:
await update_dynamic_by_dynamic_id(dynamic_id, dynamic_item=dynamic_item)
class BiliJsonStoreImplement(AbstractStore):

View File

@@ -66,7 +66,6 @@ async def update_content_by_content_id(content_id: str, content_item: Dict) -> i
return effect_row
async def query_comment_by_comment_id(comment_id: str) -> Dict:
"""
查询一条评论内容
@@ -158,6 +157,7 @@ async def update_creator_by_creator_id(creator_id: str, creator_item: Dict) -> i
effect_row: int = await async_db_conn.update_table("bilibili_up_info", creator_item, "user_id", creator_id)
return effect_row
async def query_contact_by_up_and_fan(up_id: str, fan_id: str) -> Dict:
"""
查询一条关联关系
@@ -204,3 +204,48 @@ async def update_contact_by_id(id: str, contact_item: Dict) -> int:
effect_row: int = await async_db_conn.update_table("bilibili_contact_info", contact_item, "id", id)
return effect_row
async def query_dynamic_by_dynamic_id(dynamic_id: str) -> Dict:
    """
    Look up a single dynamic record by its dynamic id.

    Args:
        dynamic_id: value matched against the ``dynamic_id`` column of the
            ``bilibili_up_dynamic`` table.

    Returns:
        The first matching row, or an empty dict when no row matches.
    """
    async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    # Bind dynamic_id as a query parameter instead of interpolating it into
    # the SQL text: the id is caller-supplied data, so quoting/escaping must
    # be left to the database driver.
    # NOTE(review): assumes AsyncMysqlDB.query forwards extra positional
    # arguments to the cursor as bind parameters -- confirm against its
    # definition.
    sql: str = "select * from bilibili_up_dynamic where dynamic_id = %s"
    rows: List[Dict] = await async_db_conn.query(sql, dynamic_id)
    return rows[0] if rows else {}
async def add_new_dynamic(dynamic_item: Dict) -> int:
    """
    Insert a new dynamic record into the ``bilibili_up_dynamic`` table.

    Args:
        dynamic_item: the record to insert; handed unchanged to
            ``item_to_table``.

    Returns:
        Whatever ``item_to_table`` returns (presumably the id of the
        inserted row -- verify against the DB helper).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    return await db.item_to_table("bilibili_up_dynamic", dynamic_item)
async def update_dynamic_by_dynamic_id(dynamic_id: str, dynamic_item: Dict) -> int:
    """
    Update the dynamic record identified by ``dynamic_id``.

    Args:
        dynamic_id: value of the ``dynamic_id`` column selecting the row(s)
            to update.
        dynamic_item: the new field values; handed unchanged to
            ``update_table``.

    Returns:
        Whatever ``update_table`` returns (presumably the number of affected
        rows -- verify against the DB helper).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    return await db.update_table(
        "bilibili_up_dynamic",
        dynamic_item,
        "dynamic_id",
        dynamic_id,
    )