mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-03-04 21:20:47 +08:00
get_fans_and_get_followings
This commit is contained in:
@@ -10,16 +10,16 @@
|
||||
|
||||
|
||||
# 基础配置
|
||||
PLATFORM = "xhs"
|
||||
PLATFORM = "bili"
|
||||
KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔
|
||||
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
||||
LOGIN_TYPE = "phone" # qrcode or phone or cookie
|
||||
COOKIES = ""
|
||||
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
|
||||
SORT_TYPE = "popularity_descending"
|
||||
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持抖音
|
||||
PUBLISH_TIME_TYPE = 0
|
||||
CRAWLER_TYPE = (
|
||||
"search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
|
||||
"creator" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
|
||||
)
|
||||
# 自定义User Agent(暂时仅对XHS有效)
|
||||
UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
|
||||
@@ -54,6 +54,9 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||
# 爬取开始页数 默认从第一页开始
|
||||
START_PAGE = 1
|
||||
|
||||
# 爬取粉丝列表开始页数 默认从第一页开始
|
||||
START_CONTACTS_PAGE = 1
|
||||
|
||||
# 爬取视频/帖子的数量控制
|
||||
CRAWLER_MAX_NOTES_COUNT = 200
|
||||
|
||||
@@ -69,6 +72,9 @@ ENABLE_GET_COMMENTS = True
|
||||
# 爬取一级评论的数量控制(单视频/帖子)
|
||||
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10
|
||||
|
||||
# 爬取作者粉丝和关注列表数量控制(单作者)
|
||||
CRAWLER_MAX_FANS_COUNT_SINGLENOTES = 100
|
||||
|
||||
# 是否开启爬二级评论模式, 默认不开启爬二级评论
|
||||
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
|
||||
ENABLE_GET_SUB_COMMENTS = False
|
||||
@@ -144,7 +150,11 @@ DY_CREATOR_ID_LIST = [
|
||||
|
||||
# 指定bili创作者ID列表(sec_id)
|
||||
BILI_CREATOR_ID_LIST = [
|
||||
"20813884",
|
||||
# "20813884",
|
||||
"520819684",
|
||||
"472747194",
|
||||
"519872016",
|
||||
"372201438",
|
||||
# ........................
|
||||
]
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ from urllib.parse import urlencode
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
|
||||
@@ -223,7 +224,7 @@ class BilibiliClient(AbstractApiClient):
|
||||
|
||||
async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 10,):
|
||||
max_count: int = 10, ):
|
||||
"""
|
||||
get video all comments include sub comments
|
||||
:param video_id:
|
||||
@@ -250,7 +251,7 @@ class BilibiliClient(AbstractApiClient):
|
||||
if (comment.get("rcount", 0) > 0):
|
||||
{
|
||||
await self.get_video_all_level_two_comments(
|
||||
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
|
||||
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
|
||||
}
|
||||
if len(result) + len(comment_list) > max_count:
|
||||
comment_list = comment_list[:max_count - len(result)]
|
||||
@@ -320,7 +321,8 @@ class BilibiliClient(AbstractApiClient):
|
||||
result = await self.get(uri, post_data)
|
||||
return result
|
||||
|
||||
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
|
||||
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30,
|
||||
order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
|
||||
"""get all videos for a creator
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 页数
|
||||
@@ -337,3 +339,120 @@ class BilibiliClient(AbstractApiClient):
|
||||
"order": order_mode,
|
||||
}
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_creator_info(self, creator_id: int) -> Dict:
    """Look up the profile record of one creator.

    Calls ``self.get`` on ``/x/space/wbi/acc/info`` with the creator id
    passed as the ``mid`` parameter.

    :param creator_id: creator (uploader) ID
    :return: the value returned by ``self.get`` for this request
    """
    return await self.get("/x/space/wbi/acc/info", {"mid": creator_id})
|
||||
|
||||
async def get_creator_fans(self, creator_id: int, pn: int, ps: int = 24) -> Dict:
    """Request one page of a creator's fan list.

    Calls ``self.get`` on ``/x/relation/fans``.

    :param creator_id: creator ID, sent as ``vmid``
    :param pn: page number to request
    :param ps: number of entries per page
    :return: the value returned by ``self.get`` for this request
    """
    query = {
        "vmid": creator_id,
        "pn": pn,
        "ps": ps,
        "gaia_source": "main_web",
    }
    return await self.get("/x/relation/fans", query)
|
||||
|
||||
async def get_creator_followings(self, creator_id: int, pn: int, ps: int = 24) -> Dict:
    """Request one page of the accounts a creator follows.

    Calls ``self.get`` on ``/x/relation/followings``.

    :param creator_id: creator ID, sent as ``vmid``
    :param pn: page number to request
    :param ps: number of entries per page
    :return: the value returned by ``self.get`` for this request
    """
    query = {
        "vmid": creator_id,
        "pn": pn,
        "ps": ps,
        "gaia_source": "main_web",
    }
    return await self.get("/x/relation/followings", query)
|
||||
|
||||
async def get_creator_all_fans(self, creator_info: Dict, crawl_interval: float = 1.0,
                               callback: Optional[Callable] = None,
                               max_count: int = 100) -> List:
    """
    Page through a creator's fan list until ``max_count`` entries have been
    collected or a page comes back empty.

    :param creator_info: creator record; its ``"id"`` key selects the creator and
        the whole dict is forwarded to ``callback``
    :param crawl_interval: seconds to sleep between two page requests
    :param callback: optional coroutine function, awaited as
        ``callback(creator_info, fans_page)`` for every non-empty page
    :param max_count: maximum number of fans to collect for one creator
    :return: list of the collected fan entries
    """
    creator_id = creator_info["id"]
    result: List[Dict] = []
    pn = config.START_CONTACTS_PAGE
    while len(result) < max_count:
        fans_res: Dict = await self.get_creator_fans(creator_id, pn=pn)
        # ``.get("list", [])`` would hand back None for a present-but-null key;
        # normalise both "missing" and "null" to an empty page.
        fans_list: List[Dict] = fans_res.get("list") or []
        if not fans_list:
            # No more data: stop before invoking the callback or sleeping.
            break

        pn += 1
        # Never collect more than max_count entries in total.
        fans_list = fans_list[:max_count - len(result)]
        if callback:
            await callback(creator_info, fans_list)
        result.extend(fans_list)
        utils.logger.info(
            f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans successfully")
        if len(result) < max_count:
            # Throttle only when another page is going to be requested.
            await asyncio.sleep(crawl_interval)

    return result
|
||||
|
||||
async def get_creator_all_followings(self, creator_info: Dict, crawl_interval: float = 1.0,
                                     callback: Optional[Callable] = None,
                                     max_count: int = 100) -> List:
    """
    Page through the accounts a creator follows until ``max_count`` entries
    have been collected or a page comes back empty.

    :param creator_info: creator record; its ``"id"`` key selects the creator and
        the whole dict is forwarded to ``callback``
    :param crawl_interval: seconds to sleep between two page requests
    :param callback: optional coroutine function, awaited as
        ``callback(creator_info, followings_page)`` for every non-empty page
    :param max_count: maximum number of followings to collect for one creator
    :return: list of the collected following entries
    """
    creator_id = creator_info["id"]
    result: List[Dict] = []
    pn = config.START_CONTACTS_PAGE
    while len(result) < max_count:
        followings_res: Dict = await self.get_creator_followings(creator_id, pn=pn)
        # ``.get("list", [])`` would hand back None for a present-but-null key;
        # normalise both "missing" and "null" to an empty page.
        followings_list: List[Dict] = followings_res.get("list") or []
        if not followings_list:
            # No more data: stop before invoking the callback or sleeping.
            break

        pn += 1
        # Never collect more than max_count entries in total.
        followings_list = followings_list[:max_count - len(result)]
        if callback:
            await callback(creator_info, followings_list)
        result.extend(followings_list)
        utils.logger.info(
            f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings successfully")
        if len(result) < max_count:
            # Throttle only when another page is going to be requested.
            await asyncio.sleep(crawl_interval)

    return result
|
||||
|
||||
@@ -89,8 +89,9 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
for creator_id in config.BILI_CREATOR_ID_LIST:
|
||||
await self.get_creator_videos(int(creator_id))
|
||||
# for creator_id in config.BILI_CREATOR_ID_LIST:
|
||||
# await self.get_creator_videos(int(creator_id))
|
||||
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
|
||||
else:
|
||||
pass
|
||||
utils.logger.info(
|
||||
@@ -466,3 +467,94 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
extension_file_name = f"video.mp4"
|
||||
await bilibili_store.store_video(aid, content, extension_file_name)
|
||||
|
||||
|
||||
async def get_all_creator_details(self, creator_id_list: List[int]):
    """
    Start one ``get_creator_details`` task per creator and wait for all of them.

    All tasks share a single semaphore sized by ``config.MAX_CONCURRENCY_NUM``.

    :param creator_id_list: ids of the creators whose details should be crawled
    """
    utils.logger.info(
        f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
    utils.logger.info(
        f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")

    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    pending: List[Task] = []
    try:
        for one_creator_id in creator_id_list:
            # Each task is named after the creator id it handles.
            pending.append(
                asyncio.create_task(
                    self.get_creator_details(one_creator_id, limiter),
                    name=one_creator_id,
                )
            )
    except Exception as err:
        # Tasks scheduled before the failure stay in ``pending`` and are still awaited.
        utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {err}")

    await asyncio.gather(*pending)
|
||||
|
||||
async def get_creator_details(self, creator_id: int, semaphore: asyncio.Semaphore):
    """
    Fetch a creator's profile, then crawl that creator's fans and followings.

    :param creator_id: id of the creator to crawl
    :param semaphore: concurrency limiter shared with ``get_fans`` and
        ``get_followings``
    :return: None
    """
    # Hold the semaphore for the profile request only. get_fans and
    # get_followings acquire the same semaphore themselves, and
    # asyncio.Semaphore is not reentrant: calling them while a slot is still
    # held would hang forever when only one slot is available.
    async with semaphore:
        creator_unhandled_info: Dict = await self.bili_client.get_creator_info(creator_id)

    creator_info: Dict = {
        "id": creator_id,
        "name": creator_unhandled_info.get("name"),
        "sign": creator_unhandled_info.get("sign"),
        "avatar": creator_unhandled_info.get("face"),
    }
    await self.get_fans(creator_info, semaphore)
    await self.get_followings(creator_info, semaphore)
|
||||
|
||||
async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """
    Crawl the fans of one creator while holding ``semaphore``.

    Each page is passed to ``bilibili_store.batch_update_bilibili_creator_fans``
    as the callback. A ``DataFetchError`` or any other exception raised during
    the crawl is logged and not re-raised.

    :param creator_info: creator record; must contain ``"id"``
    :param semaphore: concurrency limiter acquired for the whole crawl
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(
                f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
            fetch_all_fans = self.bili_client.get_creator_all_fans
            fetch_options = dict(
                creator_info=creator_info,
                crawl_interval=random.random(),  # random pause in [0, 1) seconds
                callback=bilibili_store.batch_update_bilibili_creator_fans,
                max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
            )
            await fetch_all_fans(**fetch_options)
        except DataFetchError as fetch_err:
            utils.logger.error(
                f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {fetch_err}")
        except Exception as other_err:
            utils.logger.error(
                f"[BilibiliCrawler.get_fans] may be been blocked, err:{other_err}")
|
||||
|
||||
async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """
    Crawl the accounts one creator follows while holding ``semaphore``.

    Each page is passed to
    ``bilibili_store.batch_update_bilibili_creator_followings`` as the callback.
    A ``DataFetchError`` or any other exception raised during the crawl is
    logged and not re-raised.

    :param creator_info: creator record; must contain ``"id"``
    :param semaphore: concurrency limiter acquired for the whole crawl
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(
                f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
            fetch_all_followings = self.bili_client.get_creator_all_followings
            fetch_options = dict(
                creator_info=creator_info,
                crawl_interval=random.random(),  # random pause in [0, 1) seconds
                callback=bilibili_store.batch_update_bilibili_creator_followings,
                max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
            )
            await fetch_all_followings(**fetch_options)
        except DataFetchError as fetch_err:
            utils.logger.error(
                f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {fetch_err}")
        except Exception as other_err:
            utils.logger.error(
                f"[BilibiliCrawler.get_followings] may be been blocked, err:{other_err}")
|
||||
|
||||
@@ -71,25 +71,25 @@ async def update_bilibili_video(video_item: Dict):
|
||||
await BiliStoreFactory.create_store().store_content(content_item=save_content_item)
|
||||
|
||||
|
||||
async def update_up_info(video_item: Dict):
    """
    Build a creator record from a video item's ``Card`` section and store it.

    Items that carry no creator card are skipped with a warning instead of
    failing with ``AttributeError`` on a missing nested object.

    Args:
        video_item: video detail dict; the creator data is read from
            ``video_item["Card"]["card"]`` and the like count from
            ``video_item["Card"]["like_num"]``

    Returns:
        None
    """
    video_item_card_list: Dict = video_item.get("Card") or {}
    video_item_card: Dict = video_item_card_list.get("card") or {}
    if not video_item_card:
        utils.logger.warning(
            "[store.bilibili.update_up_info] video item has no creator card, skip")
        return

    # Nested objects may be absent; fall back to empty dicts so the lookups
    # below yield None rather than raising.
    level_info: Dict = video_item_card.get("level_info") or {}
    official_verify: Dict = video_item_card.get("official_verify") or {}

    saver_up_info = {
        "user_id": str(video_item_card.get("mid")),
        "nickname": video_item_card.get("name"),
        "sex": video_item_card.get("sex"),
        "sign": video_item_card.get("sign"),
        "avatar": video_item_card.get("face"),
        "last_modify_ts": utils.get_current_timestamp(),
        "total_fans": video_item_card.get("fans"),
        "total_liked": video_item_card_list.get("like_num"),
        "user_rank": level_info.get("current_level"),
        "is_official": official_verify.get("type"),
    }
    utils.logger.info(
        f"[store.bilibili.update_up_info] bilibili user_id:{video_item_card.get('mid')}")
    await BiliStoreFactory.create_store().store_creator(creator=saver_up_info)
|
||||
|
||||
|
||||
|
||||
async def batch_update_bilibili_video_comments(video_id: str, comments: List[Dict]):
|
||||
if not comments:
|
||||
@@ -132,3 +132,44 @@ async def store_video(aid, video_content, extension_file_name):
|
||||
"""
|
||||
await BilibiliVideo().store_video(
|
||||
{"aid": aid, "video_content": video_content, "extension_file_name": extension_file_name})
|
||||
|
||||
|
||||
async def batch_update_bilibili_creator_fans(creator_info: Dict, fans_list: List[Dict]):
    """
    Store one creator-to-fan relation per entry of ``fans_list``.

    Args:
        creator_info: record of the creator whose fans are being stored
        fans_list: raw fan entries; ``mid``, ``uname``, ``sign`` and ``face``
            are read from each one. A falsy value stores nothing.

    Returns:
        None
    """
    # Maps stored field name to the key read from the raw fan entry.
    field_sources = {"id": "mid", "name": "uname", "sign": "sign", "avatar": "face"}
    for fan_item in fans_list or ():
        fan_info: Dict = {
            field: fan_item.get(source) for field, source in field_sources.items()
        }
        await update_bilibili_creator_fans(creator_info=creator_info, fan_info=fan_info)
|
||||
|
||||
|
||||
async def batch_update_bilibili_creator_followings(creator_info: Dict, followings_list: List[Dict]):
    """
    Store one relation per entry of ``followings_list``.

    The roles are swapped when storing: each followed account is passed as the
    creator and ``creator_info`` is passed as its fan.

    Args:
        creator_info: record of the creator whose followings are being stored
        followings_list: raw following entries; ``mid``, ``uname``, ``sign``
            and ``face`` are read from each one. A falsy value stores nothing.

    Returns:
        None
    """
    # Maps stored field name to the key read from the raw following entry.
    field_sources = {"id": "mid", "name": "uname", "sign": "sign", "avatar": "face"}
    for following_item in followings_list or ():
        following_info: Dict = {
            field: following_item.get(source) for field, source in field_sources.items()
        }
        await update_bilibili_creator_fans(creator_info=following_info, fan_info=creator_info)
|
||||
|
||||
|
||||
async def update_bilibili_creator_fans(creator_info: Dict, fan_info: Dict):
    """
    Combine a creator record and a fan record into one contact row and store it.

    Args:
        creator_info: dict with ``id``, ``name``, ``sign`` and ``avatar``;
            written to the ``up_*`` columns
        fan_info: dict with the same four keys; written to the ``fan_*`` columns

    Returns:
        None
    """
    save_contact_item = {}
    # Insertion order is up_id, fan_id, up_name, fan_name, ... for each field.
    for field in ("id", "name", "sign", "avatar"):
        save_contact_item[f"up_{field}"] = creator_info[field]
        save_contact_item[f"fan_{field}"] = fan_info[field]

    store = BiliStoreFactory.create_store()
    await store.store_creator_contact(contact_item=save_contact_item)
|
||||
|
||||
@@ -107,6 +107,18 @@ class BiliCsvStoreImplement(AbstractStore):
|
||||
"""
|
||||
await self.save_data_to_csv(save_item=creator, store_type="creators")
|
||||
|
||||
async def store_creator_contact(self, contact_item: Dict):
    """
    Bilibili creator contact CSV storage implementation

    Args:
        contact_item: creator's contact item dict, saved with store type "fans"

    Returns:
        None
    """
    return await self.save_data_to_csv(save_item=contact_item, store_type="fans")
|
||||
|
||||
|
||||
class BiliDbStoreImplement(AbstractStore):
|
||||
async def store_content(self, content_item: Dict):
|
||||
@@ -239,7 +251,7 @@ class BiliJsonStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
comment JSON storage implementatio
|
||||
comment JSON storage implementation
|
||||
Args:
|
||||
comment_item:
|
||||
|
||||
@@ -250,7 +262,7 @@ class BiliJsonStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
"""
|
||||
creator JSON storage implementatio
|
||||
creator JSON storage implementation
|
||||
Args:
|
||||
creator:
|
||||
|
||||
@@ -258,3 +270,15 @@ class BiliJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
await self.save_data_to_json(creator, "creators")
|
||||
|
||||
async def store_creator_contact(self, contact_item: Dict):
    """
    Bilibili creator contact JSON storage implementation

    Args:
        contact_item: creator's contact item dict, saved with store type "fans"

    Returns:
        None
    """
    return await self.save_data_to_json(save_item=contact_item, store_type="fans")
|
||||
|
||||
Reference in New Issue
Block a user