finish_all_for_expand_bili

This commit is contained in:
Bowenwin
2025-05-22 22:26:30 +08:00
parent 59619fff0a
commit 66843f216a
12 changed files with 51 additions and 63 deletions

View File

@@ -224,7 +224,7 @@ class BilibiliClient(AbstractApiClient):
async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
callback: Optional[Callable] = None,
max_count: int = 10, ):
max_count: int = 10,):
"""
get video all comments include sub comments
:param video_id:
@@ -251,7 +251,7 @@ class BilibiliClient(AbstractApiClient):
if (comment.get("rcount", 0) > 0):
{
await self.get_video_all_level_two_comments(
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
}
if len(result) + len(comment_list) > max_count:
comment_list = comment_list[:max_count - len(result)]
@@ -321,8 +321,7 @@ class BilibiliClient(AbstractApiClient):
result = await self.get(uri, post_data)
return result
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30,
order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
"""get all videos for a creator
:param creator_id: 创作者 ID
:param pn: 页数

View File

@@ -89,9 +89,11 @@ class BilibiliCrawler(AbstractCrawler):
# Get the information and comments of the specified post
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
elif config.CRAWLER_TYPE == "creator":
# for creator_id in config.BILI_CREATOR_ID_LIST:
# await self.get_creator_videos(int(creator_id))
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
if config.CREATOR_MODE:
for creator_id in config.BILI_CREATOR_ID_LIST:
await self.get_creator_videos(int(creator_id))
else:
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
else:
pass
utils.logger.info(
@@ -119,11 +121,9 @@ class BilibiliCrawler(AbstractCrawler):
start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
if start_day > end_day:
raise ValueError(
'Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
elif start_day == end_day: # 搜索同一天的内容
end_day = start_day + timedelta(days=1) - timedelta(
seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
end_day = start_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
else: # 搜索 start 至 end
end_day = end_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 end_day + 1 day - 1 second
# 将其重新转换为时间戳
@@ -166,11 +166,9 @@ class BilibiliCrawler(AbstractCrawler):
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = []
try:
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
for video_item in video_list]
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
except Exception as e:
utils.logger.warning(
f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
@@ -184,23 +182,21 @@ class BilibiliCrawler(AbstractCrawler):
else:
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
# 按照每一天进行爬取的时间戳参数
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'),
end=day.strftime('%Y-%m-%d'))
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
page = 1
# !该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频
# !除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天
# !除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!!
#!该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频
#!除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天
#!除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!!
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
# ! Catch any error (typically the response returned nothing) and go to the next day
#! Catch any error (typically the response returned nothing) and go to the next day
try:
# ! Don't skip any page, to make sure all videos in one day are gathered
#! Don't skip any page, to make sure all videos in one day are gathered
# if page < start_page:
# utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
# page += 1
# continue
utils.logger.info(
f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
@@ -213,9 +209,7 @@ class BilibiliCrawler(AbstractCrawler):
video_list: List[Dict] = videos_res.get("result")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for
video_item in video_list]
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item: