From 59619fff0acfe0382033db084bedc86a182b76f4 Mon Sep 17 00:00:00 2001 From: Bowenwin Date: Thu, 22 May 2025 22:06:06 +0800 Subject: [PATCH] finish_all --- config/base_config.py | 16 ++++----- media_platform/bilibili/core.py | 4 +-- store/bilibili/__init__.py | 12 +++---- store/bilibili/bilibili_store_impl.py | 13 +++++++- store/bilibili/bilibili_store_sql.py | 47 ++++++++++++++++++++++++++- 5 files changed, 74 insertions(+), 18 deletions(-) diff --git a/config/base_config.py b/config/base_config.py index 34539c9..d05347a 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -46,7 +46,7 @@ HEADLESS = False SAVE_LOGIN_STATE = True # 数据保存类型选项配置,支持三种类型:csv、db、json, 最好保存到DB,有排重的功能。 -SAVE_DATA_OPTION = "csv" # csv or db or json +SAVE_DATA_OPTION = "json" # csv or db or json # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name @@ -72,12 +72,6 @@ ENABLE_GET_COMMENTS = True # 爬取一级评论的数量控制(单视频/帖子) CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10 -# 爬取作者粉丝和关注列表数量控制(单作者) -CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100 - -# 爬取作者动态粉丝和关注列表数量控制(单作者) -CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50 - # 是否开启爬二级评论模式, 默认不开启爬二级评论 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 ENABLE_GET_SUB_COMMENTS = False @@ -206,4 +200,10 @@ END_DAY = '2024-01-01' # 是否开启按每一天进行爬取的选项,仅支持 bilibili 关键字搜索 # 若为 False,则忽略 START_DAY 与 END_DAY 设置的值 # 若为 True,则按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下的所有视频 -ALL_DAY = False \ No newline at end of file +ALL_DAY = False + +# 爬取作者粉丝和关注列表数量控制(单作者) +CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100 + +# 爬取作者动态粉丝和关注列表数量控制(单作者) +CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50 \ No newline at end of file diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index f47519b..8c66747 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -512,8 +512,8 @@ class BilibiliCrawler(AbstractCrawler): "sign": creator_unhandled_info.get("sign"), "avatar": creator_unhandled_info.get("face"), } - # await self.get_fans(creator_info, semaphore) - # await self.get_followings(creator_info, semaphore) + await self.get_fans(creator_info, semaphore) + await self.get_followings(creator_info, semaphore) await self.get_dynamics(creator_info, semaphore) async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore): diff --git a/store/bilibili/__init__.py b/store/bilibili/__init__.py index bd208a8..3183358 100644 --- a/store/bilibili/__init__.py +++ b/store/bilibili/__init__.py @@ -179,9 +179,9 @@ async def batch_update_bilibili_creator_dynamics(creator_info: Dict, dynamics_li "text": dynamic_text, "type": dynamic_type, "pub_ts": dynamic_pub_ts, - "comment": dynamic_comment, - "forward": dynamic_forward, - "like": dynamic_like, + "total_comments": dynamic_comment, + "total_forwards": dynamic_forward, + "total_liked": dynamic_like, } await update_bilibili_creator_dynamic(creator_info=creator_info, dynamic_info=dynamic_info) @@ -210,9 +210,9 @@ async def update_bilibili_creator_dynamic(creator_info: Dict, dynamic_info: Dict "text": dynamic_info["text"], "type": dynamic_info["type"], "pub_ts": dynamic_info["pub_ts"], - "comment": dynamic_info["comment"], - "forward": dynamic_info["forward"], - "like": dynamic_info["like"], + "total_comments": dynamic_info["total_comments"], + "total_forwards": dynamic_info["total_forwards"], + "total_liked": dynamic_info["total_liked"], "last_modify_ts": utils.get_current_timestamp(), } diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py index c888ff5..00ccd79 100644 --- a/store/bilibili/bilibili_store_impl.py +++ b/store/bilibili/bilibili_store_impl.py @@ -196,7 +196,7 @@ class BiliDbStoreImplement(AbstractStore): creator["add_ts"] = utils.get_current_timestamp() await add_new_creator(creator) else: - await update_creator_by_creator_id(creator_id,creator_item=creator) + await update_creator_by_creator_id(creator_id, creator_item=creator) async def store_contact(self, contact_item: Dict): """ @@ -232,6 +232,17 @@ class BiliDbStoreImplement(AbstractStore): """ + from .bilibili_store_sql import (add_new_dynamic, + query_dynamic_by_dynamic_id, + update_dynamic_by_dynamic_id) + + dynamic_id = dynamic_item.get("dynamic_id") + dynamic_detail = await query_dynamic_by_dynamic_id(dynamic_id=dynamic_id) + if not dynamic_detail: + dynamic_item["add_ts"] = utils.get_current_timestamp() + await add_new_dynamic(dynamic_item) + else: + await update_dynamic_by_dynamic_id(dynamic_id, dynamic_item=dynamic_item) class BiliJsonStoreImplement(AbstractStore): diff --git a/store/bilibili/bilibili_store_sql.py b/store/bilibili/bilibili_store_sql.py index 513b679..6ee4048 100644 --- a/store/bilibili/bilibili_store_sql.py +++ b/store/bilibili/bilibili_store_sql.py @@ -66,7 +66,6 @@ async def update_content_by_content_id(content_id: str, content_item: Dict) -> i return effect_row - async def query_comment_by_comment_id(comment_id: str) -> Dict: """ 查询一条评论内容 @@ -158,6 +157,7 @@ async def update_creator_by_creator_id(creator_id: str, creator_item: Dict) -> i effect_row: int = await async_db_conn.update_table("bilibili_up_info", creator_item, "user_id", creator_id) return effect_row + async def query_contact_by_up_and_fan(up_id: str, fan_id: str) -> Dict: """ 查询一条关联关系 @@ -204,3 +204,48 @@ async def update_contact_by_id(id: str, contact_item: Dict) -> int: effect_row: int = await async_db_conn.update_table("bilibili_contact_info", contact_item, "id", id) return effect_row + +async def query_dynamic_by_dynamic_id(dynamic_id: str) -> Dict: + """ + 查询一条动态信息 + Args: + dynamic_id: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + sql: str = f"select * from bilibili_up_dynamic where dynamic_id = '{dynamic_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_dynamic(dynamic_item: Dict) -> int: + """ + 新增动态信息 + Args: + dynamic_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("bilibili_up_dynamic", dynamic_item) + return last_row_id + + +async def update_dynamic_by_dynamic_id(dynamic_id: str, dynamic_item: Dict) -> int: + """ + 更新动态信息 + Args: + dynamic_id: + dynamic_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("bilibili_up_dynamic", dynamic_item, "dynamic_id", dynamic_id) + return effect_row