get_fans_and_get_followings

This commit is contained in:
Bowenwin
2025-05-19 19:57:36 +08:00
parent 654260cbce
commit a356358c21
5 changed files with 308 additions and 22 deletions

View File

@@ -21,6 +21,7 @@ from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext, Page
import config
from base.base_crawler import AbstractApiClient
from tools import utils
@@ -223,7 +224,7 @@ class BilibiliClient(AbstractApiClient):
async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
callback: Optional[Callable] = None,
max_count: int = 10,):
max_count: int = 10, ):
"""
get video all comments include sub comments
:param video_id:
@@ -250,7 +251,7 @@ class BilibiliClient(AbstractApiClient):
if (comment.get("rcount", 0) > 0):
{
await self.get_video_all_level_two_comments(
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
}
if len(result) + len(comment_list) > max_count:
comment_list = comment_list[:max_count - len(result)]
@@ -320,7 +321,8 @@ class BilibiliClient(AbstractApiClient):
result = await self.get(uri, post_data)
return result
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30,
order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
"""get all videos for a creator
:param creator_id: 创作者 ID
:param pn: 页数
@@ -337,3 +339,120 @@ class BilibiliClient(AbstractApiClient):
"order": order_mode,
}
return await self.get(uri, post_data)
async def get_creator_info(self, creator_id: int) -> Dict:
    """Fetch the profile record of a single creator.

    :param creator_id: creator (author) ID, sent as the ``mid`` query field
    :return: whatever ``self.get`` returns for the space-info endpoint
    """
    return await self.get("/x/space/wbi/acc/info", {"mid": creator_id})
async def get_creator_fans(self,
                           creator_id: int,
                           pn: int,
                           ps: int = 24,
                           ) -> Dict:
    """Fetch one page of a creator's fans list.

    :param creator_id: creator ID, sent as the ``vmid`` query field
    :param pn: page number to request
    :param ps: number of entries per page
    :return: whatever ``self.get`` returns for the fans endpoint
    """
    # Key order is kept as vmid, pn, ps, gaia_source.
    query = dict(vmid=creator_id, pn=pn, ps=ps, gaia_source="main_web")
    return await self.get("/x/relation/fans", query)
async def get_creator_followings(self,
                                 creator_id: int,
                                 pn: int,
                                 ps: int = 24,
                                 ) -> Dict:
    """Fetch one page of the accounts a creator follows.

    :param creator_id: creator ID, sent as the ``vmid`` query field
    :param pn: page number to request
    :param ps: number of entries per page
    :return: whatever ``self.get`` returns for the followings endpoint
    """
    # Key order is kept as vmid, pn, ps, gaia_source.
    query = dict(vmid=creator_id, pn=pn, ps=ps, gaia_source="main_web")
    return await self.get("/x/relation/followings", query)
async def get_creator_all_fans(self, creator_info: Dict, crawl_interval: float = 1.0,
                               callback: Optional[Callable] = None,
                               max_count: int = 100) -> List:
    """Collect a creator's fans page by page, up to ``max_count`` entries.

    Pages are requested through ``get_creator_fans`` starting at
    ``config.START_CONTACTS_PAGE``. Paging stops when ``max_count`` entries
    have been gathered or a page comes back empty.

    :param creator_info: creator record; only its ``"id"`` key is read here,
        the whole dict is forwarded to ``callback``
    :param crawl_interval: seconds to sleep after each non-empty page
    :param callback: optional awaitable ``callback(creator_info, fans_page)``
        invoked once per non-empty page (after trimming to ``max_count``)
    :param max_count: maximum number of fans to collect for one creator
    :return: list of the fan entries collected
    """
    creator_id = creator_info["id"]
    result: List[Dict] = []
    pn = config.START_CONTACTS_PAGE
    while len(result) < max_count:
        fans_res: Dict = await self.get_creator_fans(creator_id, pn=pn)
        # "list" may be missing or None; treat both as an empty page instead
        # of letting len(None) raise.
        fans_list: List[Dict] = fans_res.get("list") or []
        if not fans_list:
            # Nothing left to fetch: stop before calling the callback with an
            # empty batch or sleeping for no reason.
            break
        pn += 1
        # The loop condition guarantees at least one free slot, so this slice
        # is a no-op unless the page would overshoot max_count.
        fans_list = fans_list[:max_count - len(result)]
        if callback:  # hand the page to the caller-supplied handler
            await callback(creator_info, fans_list)
        result.extend(fans_list)
        await asyncio.sleep(crawl_interval)

    utils.logger.info(
        f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans successfully")
    return result
async def get_creator_all_followings(self, creator_info: Dict, crawl_interval: float = 1.0,
                                     callback: Optional[Callable] = None,
                                     max_count: int = 100) -> List:
    """Collect the accounts a creator follows, page by page, up to ``max_count``.

    Pages are requested through ``get_creator_followings`` starting at
    ``config.START_CONTACTS_PAGE``. Paging stops when ``max_count`` entries
    have been gathered or a page comes back empty.

    :param creator_info: creator record; only its ``"id"`` key is read here,
        the whole dict is forwarded to ``callback``
    :param crawl_interval: seconds to sleep after each non-empty page
    :param callback: optional awaitable ``callback(creator_info, followings_page)``
        invoked once per non-empty page (after trimming to ``max_count``)
    :param max_count: maximum number of followings to collect for one creator
    :return: list of the following entries collected
    """
    creator_id = creator_info["id"]
    result: List[Dict] = []
    pn = config.START_CONTACTS_PAGE
    while len(result) < max_count:
        followings_res: Dict = await self.get_creator_followings(creator_id, pn=pn)
        # "list" may be missing or None; treat both as an empty page instead
        # of letting len(None) raise.
        followings_list: List[Dict] = followings_res.get("list") or []
        if not followings_list:
            # Nothing left to fetch: stop before calling the callback with an
            # empty batch or sleeping for no reason.
            break
        pn += 1
        # The loop condition guarantees at least one free slot, so this slice
        # is a no-op unless the page would overshoot max_count.
        followings_list = followings_list[:max_count - len(result)]
        if callback:  # hand the page to the caller-supplied handler
            await callback(creator_info, followings_list)
        result.extend(followings_list)
        await asyncio.sleep(crawl_interval)

    utils.logger.info(
        f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings successfully")
    return result

View File

@@ -89,8 +89,9 @@ class BilibiliCrawler(AbstractCrawler):
# Get the information and comments of the specified post
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
elif config.CRAWLER_TYPE == "creator":
for creator_id in config.BILI_CREATOR_ID_LIST:
await self.get_creator_videos(int(creator_id))
# for creator_id in config.BILI_CREATOR_ID_LIST:
# await self.get_creator_videos(int(creator_id))
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
else:
pass
utils.logger.info(
@@ -466,3 +467,94 @@ class BilibiliCrawler(AbstractCrawler):
extension_file_name = f"video.mp4"
await bilibili_store.store_video(aid, content, extension_file_name)
async def get_all_creator_details(self, creator_id_list: List[int]):
    """Crawl details for every creator in ``creator_id_list`` concurrently.

    One task running ``get_creator_details`` is created per creator; all tasks
    share a semaphore sized by ``config.MAX_CONCURRENCY_NUM``. A creator whose
    task fails is logged and skipped, so one failure does not abort the
    others.

    :param creator_id_list: IDs of the creators to crawl
    """
    utils.logger.info(
        f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
    utils.logger.info(
        f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")

    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list = [
        asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id)
        for creator_id in creator_id_list
    ]

    # Errors are raised when the tasks run, not when they are created, so they
    # must be collected here. return_exceptions=True keeps one failing creator
    # from cancelling the wait on all the others.
    outcomes = await asyncio.gather(*task_list, return_exceptions=True)
    for creator_id, outcome in zip(creator_id_list, outcomes):
        if isinstance(outcome, Exception):
            utils.logger.warning(
                f"[BilibiliCrawler.get_all_creator_details] error in the task list. "
                f"The creator will not be included. creator_id: {creator_id}, err: {outcome}")
async def get_creator_details(self, creator_id: int, semaphore: asyncio.Semaphore):
    """Fetch a creator's profile, then crawl their fans and followings.

    :param creator_id: ID of the creator to crawl
    :param semaphore: shared concurrency limiter; held here only for the
        profile request
    :return: None
    """
    async with semaphore:
        creator_unhandled_info: Dict = await self.bili_client.get_creator_info(creator_id)

    # Keep only the fields used downstream; "face" is exposed as "avatar".
    creator_info: Dict = {
        "id": creator_id,
        "name": creator_unhandled_info.get("name"),
        "sign": creator_unhandled_info.get("sign"),
        "avatar": creator_unhandled_info.get("face"),
    }

    # get_fans / get_followings acquire the same semaphore themselves, so they
    # must run after the permit above is released. Calling them while still
    # holding it would deadlock once every permit is held by an outer caller.
    await self.get_fans(creator_info, semaphore)
    await self.get_followings(creator_info, semaphore)
async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """Crawl and store the fans of one creator, logging any failure.

    Exceptions raised during the crawl are logged and not re-raised.

    :param creator_info: creator record; must contain an ``"id"`` key
    :param semaphore: concurrency limiter held for the whole crawl
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
            crawl_all_fans = self.bili_client.get_creator_all_fans
            crawl_kwargs = dict(
                creator_info=creator_info,
                # random.random() yields a pause in [0, 1) seconds between pages
                crawl_interval=random.random(),
                callback=bilibili_store.batch_update_bilibili_creator_fans,
                max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
            )
            await crawl_all_fans(**crawl_kwargs)
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {fetch_err}")
        except Exception as unexpected_err:
            utils.logger.error(f"[BilibiliCrawler.get_fans] may be been blocked, err:{unexpected_err}")
async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """Crawl and store the followings of one creator, logging any failure.

    Exceptions raised during the crawl are logged and not re-raised.

    :param creator_info: creator record; must contain an ``"id"`` key
    :param semaphore: concurrency limiter held for the whole crawl
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
            crawl_all_followings = self.bili_client.get_creator_all_followings
            crawl_kwargs = dict(
                creator_info=creator_info,
                # random.random() yields a pause in [0, 1) seconds between pages
                crawl_interval=random.random(),
                callback=bilibili_store.batch_update_bilibili_creator_followings,
                # NOTE(review): this reuses the FANS limit for followings;
                # confirm whether a separate followings limit was intended.
                max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
            )
            await crawl_all_followings(**crawl_kwargs)
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {fetch_err}")
        except Exception as unexpected_err:
            utils.logger.error(f"[BilibiliCrawler.get_followings] may be been blocked, err:{unexpected_err}")