From a356358c211c267d37ce1565fdd8ed82b4f6efa1 Mon Sep 17 00:00:00 2001 From: Bowenwin Date: Mon, 19 May 2025 19:57:36 +0800 Subject: [PATCH] get_fans_and_get_followings --- config/base_config.py | 18 +++- media_platform/bilibili/client.py | 125 +++++++++++++++++++++++++- media_platform/bilibili/core.py | 96 +++++++++++++++++++- store/bilibili/__init__.py | 63 ++++++++++--- store/bilibili/bilibili_store_impl.py | 28 +++++- 5 files changed, 308 insertions(+), 22 deletions(-) diff --git a/config/base_config.py b/config/base_config.py index 2f94975..75154cd 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -10,16 +10,16 @@ # 基础配置 -PLATFORM = "xhs" +PLATFORM = "bili" KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔 -LOGIN_TYPE = "qrcode" # qrcode or phone or cookie +LOGIN_TYPE = "phone" # qrcode or phone or cookie COOKIES = "" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持抖音 PUBLISH_TIME_TYPE = 0 CRAWLER_TYPE = ( - "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据) + "creator" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据) ) # 自定义User Agent(暂时仅对XHS有效) UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0' @@ -54,6 +54,9 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name # 爬取开始页数 默认从第一页开始 START_PAGE = 1 +# 爬取粉丝列表开始页数 默认从第一页开始 +START_CONTACTS_PAGE = 1 + # 爬取视频/帖子的数量控制 CRAWLER_MAX_NOTES_COUNT = 200 @@ -69,6 +72,9 @@ ENABLE_GET_COMMENTS = True # 爬取一级评论的数量控制(单视频/帖子) CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10 +# 爬取作者粉丝和关注列表数量控制(单作者) +CRAWLER_MAX_FANS_COUNT_SINGLENOTES = 100 + # 是否开启爬二级评论模式, 默认不开启爬二级评论 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 ENABLE_GET_SUB_COMMENTS = False @@ -144,7 +150,11 @@ DY_CREATOR_ID_LIST = [ # 指定bili创作者ID列表(sec_id) BILI_CREATOR_ID_LIST = [ - "20813884", + # "20813884", + "520819684", + "472747194", + "519872016", + "372201438", # ........................ ] diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index 37c087b..7ec6b35 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -21,6 +21,7 @@ from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page +import config from base.base_crawler import AbstractApiClient from tools import utils @@ -223,7 +224,7 @@ class BilibiliClient(AbstractApiClient): async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False, callback: Optional[Callable] = None, - max_count: int = 10,): + max_count: int = 10, ): """ get video all comments include sub comments :param video_id: @@ -250,7 +251,7 @@ class BilibiliClient(AbstractApiClient): if (comment.get("rcount", 0) > 0): { await self.get_video_all_level_two_comments( - video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback) + video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback) } if len(result) + len(comment_list) > max_count: comment_list = comment_list[:max_count - len(result)] @@ -320,7 +321,8 @@ class BilibiliClient(AbstractApiClient): result = await self.get(uri, post_data) return result - async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict: + async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, + order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict: """get all videos for a creator :param creator_id: 创作者 ID :param pn: 页数 @@ -337,3 +339,120 @@ class BilibiliClient(AbstractApiClient): "order": order_mode, } return await self.get(uri, post_data) + + async def get_creator_info(self, creator_id: int) -> Dict: + """get creator info + :param creator_id: 作者 ID + """ + uri = "/x/space/wbi/acc/info" + post_data = { + "mid": creator_id, + } + return await self.get(uri, post_data) + + async def get_creator_fans(self, + creator_id: int, + pn: int, + ps: int = 24, + ) -> Dict: + """get video comments + :param creator_id: 创作者 ID + :param pn: 开始页数 + :param ps: 每页数量 + :return: + """ + uri = "/x/relation/fans" + post_data = { + 'vmid': creator_id, + "pn": pn, + "ps": ps, + "gaia_source": "main_web", + + } + return await self.get(uri, post_data) + + async def get_creator_followings(self, + creator_id: int, + pn: int, + ps: int = 24, + ) -> Dict: + """get video comments + :param creator_id: 创作者 ID + :param pn: 开始页数 + :param ps: 每页数量 + :return: + """ + uri = "/x/relation/followings" + post_data = { + "vmid": creator_id, + "pn": pn, + "ps": ps, + "gaia_source": "main_web", + } + return await self.get(uri, post_data) + + async def get_creator_all_fans(self, creator_info: Dict, crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + max_count: int = 100) -> List: + """ + get video all comments include sub comments + :param creator_info: + :param crawl_interval: + :param callback: + :param max_count: 一个up主爬取的最大粉丝数量 + + :return: up主粉丝数列表 + """ + creator_id = creator_info["id"] + result = [] + pn = config.START_CONTACTS_PAGE + while len(result) < max_count: + fans_res: Dict = await self.get_creator_fans(creator_id, pn=pn) + fans_list: List[Dict] = fans_res.get("list", []) + + pn += 1 + if len(result) + len(fans_list) > max_count: + fans_list = fans_list[:max_count - len(result)] + if callback: # 如果有回调函数,就执行回调函数 + await callback(creator_info, fans_list) + await asyncio.sleep(crawl_interval) + if not fans_list: + break + result.extend(fans_list) + utils.logger.info( + f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans successfully") + + return result + + async def get_creator_all_followings(self, creator_info: Dict, crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + max_count: int = 100) -> List: + """ + get video all comments include sub comments + :param creator_info: + :param crawl_interval: + :param callback: + :param max_count: 一个up主爬取的最大关注者数量 + + :return: up主关注者列表 + """ + creator_id = creator_info["id"] + result = [] + pn = config.START_CONTACTS_PAGE + while len(result) < max_count: + followings_res: Dict = await self.get_creator_followings(creator_id, pn=pn) + followings_list: List[Dict] = followings_res.get("list", []) + + pn += 1 + if len(result) + len(followings_list) > max_count: + followings_list = followings_list[:max_count - len(result)] + if callback: # 如果有回调函数,就执行回调函数 + await callback(creator_info, followings_list) + await asyncio.sleep(crawl_interval) + if not followings_list: + break + result.extend(followings_list) + utils.logger.info( + f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings successfully") + + return result diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 5c7949a..eb6c014 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -89,8 +89,9 @@ class BilibiliCrawler(AbstractCrawler): # Get the information and comments of the specified post await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST) elif config.CRAWLER_TYPE == "creator": - for creator_id in config.BILI_CREATOR_ID_LIST: - await self.get_creator_videos(int(creator_id)) + # for creator_id in config.BILI_CREATOR_ID_LIST: + # await self.get_creator_videos(int(creator_id)) + await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST) else: pass utils.logger.info( @@ -466,3 +467,94 @@ class BilibiliCrawler(AbstractCrawler): extension_file_name = f"video.mp4" await bilibili_store.store_video(aid, content, extension_file_name) + + async def get_all_creator_details(self, creator_id_list: List[int]): + """ + creator_id_list: get details for creator from creator_id_list + """ + utils.logger.info( + f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator") + utils.logger.info( + f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}") + + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list: List[Task] = [] + try: + for creator_id in creator_id_list: + task = asyncio.create_task(self.get_creator_details( + creator_id, semaphore), name=creator_id) + task_list.append(task) + except Exception as e: + utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}") + + await asyncio.gather(*task_list) + + async def get_creator_details(self, creator_id: int, semaphore: asyncio.Semaphore): + """ + get details for creator id + :param creator_id: + :param semaphore: + :return: + """ + async with semaphore: + creator_unhandled_info: Dict = await self.bili_client.get_creator_info(creator_id) + creator_info: Dict = { + "id": creator_id, + "name": creator_unhandled_info.get("name"), + "sign": creator_unhandled_info.get("sign"), + "avatar": creator_unhandled_info.get("face"), + } + await self.get_fans(creator_info, semaphore) + await self.get_followings(creator_info, semaphore) + + async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore): + """ + get fans for creator id + :param creator_info: + :param semaphore: + :return: + """ + creator_id = creator_info["id"] + async with semaphore: + try: + utils.logger.info( + f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...") + await self.bili_client.get_creator_all_fans( + creator_info=creator_info, + crawl_interval=random.random(), + callback=bilibili_store.batch_update_bilibili_creator_fans, + max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES, + ) + + except DataFetchError as ex: + utils.logger.error( + f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {ex}") + except Exception as e: + utils.logger.error( + f"[BilibiliCrawler.get_fans] may be been blocked, err:{e}") + + async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore): + """ + get followings for creator id + :param creator_info: + :param semaphore: + :return: + """ + creator_id = creator_info["id"] + async with semaphore: + try: + utils.logger.info( + f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...") + await self.bili_client.get_creator_all_followings( + creator_info=creator_info, + crawl_interval=random.random(), + callback=bilibili_store.batch_update_bilibili_creator_followings, + max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES, + ) + + except DataFetchError as ex: + utils.logger.error( + f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {ex}") + except Exception as e: + utils.logger.error( + f"[BilibiliCrawler.get_followings] may be been blocked, err:{e}") diff --git a/store/bilibili/__init__.py b/store/bilibili/__init__.py index dcffa88..c417dcc 100644 --- a/store/bilibili/__init__.py +++ b/store/bilibili/__init__.py @@ -71,25 +71,25 @@ async def update_bilibili_video(video_item: Dict): await BiliStoreFactory.create_store().store_content(content_item=save_content_item) -async def update_up_info(video_item: Dict): +async def update_up_info(video_item: Dict): video_item_card_list: Dict = video_item.get("Card") - video_item_card: Dict = video_item_card_list.get("card") + video_item_card: Dict = video_item_card_list.get("card") saver_up_info = { - "user_id": str(video_item_card.get("mid")), - "nickname": video_item_card.get("name"), + "user_id": str(video_item_card.get("mid")), + "nickname": video_item_card.get("name"), "sex": video_item_card.get("sex"), "sign": video_item_card.get("sign"), - "avatar": video_item_card.get("face"), - "last_modify_ts": utils.get_current_timestamp(), - "total_fans": video_item_card.get("fans"), - "total_liked": video_item_card_list.get("like_num"), - "user_rank": video_item_card.get("level_info").get("current_level"), - "is_official": video_item_card.get("official_verify").get("type"), + "avatar": video_item_card.get("face"), + "last_modify_ts": utils.get_current_timestamp(), + "total_fans": video_item_card.get("fans"), + "total_liked": video_item_card_list.get("like_num"), + "user_rank": video_item_card.get("level_info").get("current_level"), + "is_official": video_item_card.get("official_verify").get("type"), } utils.logger.info( f"[store.bilibili.update_up_info] bilibili user_id:{video_item_card.get('mid')}") await BiliStoreFactory.create_store().store_creator(creator=saver_up_info) - + async def batch_update_bilibili_video_comments(video_id: str, comments: List[Dict]): if not comments: @@ -132,3 +132,44 @@ async def store_video(aid, video_content, extension_file_name): """ await BilibiliVideo().store_video( {"aid": aid, "video_content": video_content, "extension_file_name": extension_file_name}) + + +async def batch_update_bilibili_creator_fans(creator_info: Dict, fans_list: List[Dict]): + if not fans_list: + return + for fan_item in fans_list: + fan_info: Dict = { + "id": fan_item.get("mid"), + "name": fan_item.get("uname"), + "sign": fan_item.get("sign"), + "avatar": fan_item.get("face"), + } + await update_bilibili_creator_fans(creator_info=creator_info, fan_info=fan_info) + + +async def batch_update_bilibili_creator_followings(creator_info: Dict, followings_list: List[Dict]): + if not followings_list: + return + for following_item in followings_list: + following_info: Dict = { + "id": following_item.get("mid"), + "name": following_item.get("uname"), + "sign": following_item.get("sign"), + "avatar": following_item.get("face"), + } + await update_bilibili_creator_fans(creator_info=following_info, fan_info=creator_info) + + +async def update_bilibili_creator_fans(creator_info: Dict, fan_info: Dict): + save_contact_item = { + "up_id": creator_info["id"], + "fan_id": fan_info["id"], + "up_name": creator_info["name"], + "fan_name": fan_info["name"], + "up_sign": creator_info["sign"], + "fan_sign": fan_info["sign"], + "up_avatar": creator_info["avatar"], + "fan_avatar": fan_info["avatar"] + } + + await BiliStoreFactory.create_store().store_creator_contact(contact_item=save_contact_item) diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py index 17c410a..20d09f1 100644 --- a/store/bilibili/bilibili_store_impl.py +++ b/store/bilibili/bilibili_store_impl.py @@ -107,6 +107,18 @@ class BiliCsvStoreImplement(AbstractStore): """ await self.save_data_to_csv(save_item=creator, store_type="creators") + async def store_creator_contact(self, contact_item: Dict): + """ + Bilibili comment CSV storage implementation + Args: + contact_item: creator's contact item dict + + Returns: + + """ + + await self.save_data_to_csv(save_item=contact_item, store_type="fans") + class BiliDbStoreImplement(AbstractStore): async def store_content(self, content_item: Dict): @@ -239,7 +251,7 @@ class BiliJsonStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - comment JSON storage implementatio + comment JSON storage implementation Args: comment_item: @@ -250,7 +262,7 @@ class BiliJsonStoreImplement(AbstractStore): async def store_creator(self, creator: Dict): """ - creator JSON storage implementatio + creator JSON storage implementation Args: creator: @@ -258,3 +270,15 @@ class BiliJsonStoreImplement(AbstractStore): """ await self.save_data_to_json(creator, "creators") + + async def store_creator_contact(self, contact_item: Dict): + """ + creator contact JSON storage implementation + Args: + contact_item: creator's contact item dict + + Returns: + + """ + + await self.save_data_to_json(save_item=contact_item, store_type="fans")