fix_words

This commit is contained in:
Bowenwin
2025-05-22 20:31:48 +08:00
parent a356358c21
commit 44e3d370ff
8 changed files with 338 additions and 59 deletions

View File

@@ -341,7 +341,8 @@ class BilibiliClient(AbstractApiClient):
         return await self.get(uri, post_data)
 
     async def get_creator_info(self, creator_id: int) -> Dict:
-        """get creator info
+        """
+        get creator info
         :param creator_id: creator ID
         """
         uri = "/x/space/wbi/acc/info"
@@ -355,7 +356,8 @@ class BilibiliClient(AbstractApiClient):
         pn: int,
         ps: int = 24,
     ) -> Dict:
-        """get video comments
+        """
+        get creator fans
         :param creator_id: creator ID
         :param pn: starting page number
         :param ps: items per page
@@ -376,7 +378,8 @@ class BilibiliClient(AbstractApiClient):
         pn: int,
         ps: int = 24,
     ) -> Dict:
-        """get video comments
+        """
+        get creator followings
         :param creator_id: creator ID
         :param pn: starting page number
         :param ps: items per page
@@ -391,11 +394,27 @@ class BilibiliClient(AbstractApiClient):
         }
         return await self.get(uri, post_data)
 
+    async def get_creator_dynamics(self, creator_id: int, offset: str = ""):
+        """
+        get creator dynamics
+        :param creator_id: creator ID
+        :param offset: pagination offset for the request
+        :return:
+        """
+        uri = "/x/polymer/web-dynamic/v1/feed/space"
+        post_data = {
+            "offset": offset,
+            "host_mid": creator_id,
+            "platform": "web",
+        }
+        return await self.get(uri, post_data)
+
     async def get_creator_all_fans(self, creator_info: Dict, crawl_interval: float = 1.0,
                                    callback: Optional[Callable] = None,
                                    max_count: int = 100) -> List:
         """
-        get video all comments include sub comments
+        get creator all fans
         :param creator_info:
         :param crawl_interval:
         :param callback:
@@ -419,16 +438,13 @@ class BilibiliClient(AbstractApiClient):
             if not fans_list:
                 break
             result.extend(fans_list)
-            utils.logger.info(
-                f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans successfully")
         return result
     async def get_creator_all_followings(self, creator_info: Dict, crawl_interval: float = 1.0,
                                          callback: Optional[Callable] = None,
                                          max_count: int = 100) -> List:
         """
-        get video all comments include sub comments
+        get creator all followings
         :param creator_info:
         :param crawl_interval:
         :param callback:
@@ -452,7 +468,33 @@ class BilibiliClient(AbstractApiClient):
             if not followings_list:
                 break
             result.extend(followings_list)
-            utils.logger.info(
-                f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings successfully")
         return result
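Both get_creator_all_fans and get_creator_all_followings above follow the same page-number pagination pattern: request page pn with page size ps, stop on an empty page or once max_count items are collected. A minimal self-contained sketch of that loop, assuming a generic fetch_page coroutine (the name and signature are illustrative stand-ins, not part of the commit):

import asyncio
from typing import Awaitable, Callable, Dict, List

async def paginate_by_page(fetch_page: Callable[..., Awaitable[List[Dict]]],
                           max_count: int = 100, ps: int = 24,
                           crawl_interval: float = 1.0) -> List[Dict]:
    # Illustrative sketch of the pn/ps loop used by get_creator_all_fans
    # and get_creator_all_followings.
    result: List[Dict] = []
    pn = 1
    while len(result) < max_count:
        page_items = await fetch_page(pn=pn, ps=ps)  # one page of results
        if not page_items:                           # empty page: no more data
            break
        result.extend(page_items[:max_count - len(result)])  # never exceed max_count
        pn += 1
        await asyncio.sleep(crawl_interval)          # pause between requests
    return result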
+    async def get_creator_all_dynamics(self, creator_info: Dict, crawl_interval: float = 1.0,
+                                       callback: Optional[Callable] = None,
+                                       max_count: int = 20) -> List:
+        """
+        get creator all dynamics
+        :param creator_info:
+        :param crawl_interval:
+        :param callback:
+        :param max_count: maximum number of dynamics to crawl for a single creator
+        :return: list of the creator's dynamics
+        """
+        creator_id = creator_info["id"]
+        result = []
+        offset = ""
+        has_more = True
+        while has_more and len(result) < max_count:
+            dynamics_res = await self.get_creator_dynamics(creator_id, offset)
+            dynamics_list: List[Dict] = dynamics_res["items"]
+            has_more = dynamics_res["has_more"]
+            offset = dynamics_res["offset"]
+            if len(result) + len(dynamics_list) > max_count:
+                dynamics_list = dynamics_list[:max_count - len(result)]
+            if callback:
+                await callback(creator_info, dynamics_list)
+            await asyncio.sleep(crawl_interval)
+            result.extend(dynamics_list)
+        return result
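The dynamics feed, by contrast, is cursor-based: each response carries items, has_more, and the offset to send with the next request, and each batch is clipped so the total never exceeds max_count before being handed to the callback. A hedged usage sketch for the new method; the client instance, creator ID, and callback below are illustrative stand-ins, not code from the commit:

import asyncio

async def demo_dynamics(bili_client) -> None:
    # bili_client is assumed to be an initialized BilibiliClient.
    creator_info = {"id": 123456}  # hypothetical creator ID

    async def on_batch(info, dynamics):
        # Called once per fetched page with the (possibly clipped) batch.
        print(f"creator {info['id']}: got {len(dynamics)} dynamics")

    dynamics = await bili_client.get_creator_all_dynamics(
        creator_info=creator_info,
        crawl_interval=0.5,
        callback=on_batch,
        max_count=40,
    )
    print(f"total dynamics collected: {len(dynamics)}")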

View File

@@ -119,14 +119,16 @@ class BilibiliCrawler(AbstractCrawler):
         start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
         end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
         if start_day > end_day:
-            raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
+            raise ValueError(
+                'Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
         elif start_day == end_day:  # searching within a single day
-            end_day = start_day + timedelta(days=1) - timedelta(seconds=1)  # set end_day to start_day + 1 day - 1 second
+            end_day = start_day + timedelta(days=1) - timedelta(
+                seconds=1)  # set end_day to start_day + 1 day - 1 second
         else:  # searching from start through end
             end_day = end_day + timedelta(days=1) - timedelta(seconds=1)  # set end_day to end_day + 1 day - 1 second
         # convert both back to timestamps
         return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
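The arithmetic above widens a date to cover the whole day: adding 1 day and subtracting 1 second lands on 23:59:59 of the same date. A standalone check of the same-day branch (plain datetime, no crawler state; note that timestamp() uses the local timezone):

from datetime import datetime, timedelta

start_day = datetime.strptime('2024-01-01', '%Y-%m-%d')          # 2024-01-01 00:00:00
end_day = start_day + timedelta(days=1) - timedelta(seconds=1)   # 2024-01-01 23:59:59
print(int(end_day.timestamp()) - int(start_day.timestamp()))     # 86399 == 24*3600 - 1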
async def search(self):
"""
search bilibili video with keywords
@@ -164,9 +166,11 @@ class BilibiliCrawler(AbstractCrawler):
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = []
try:
-            task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+            task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
+                         for video_item in video_list]
         except Exception as e:
-            utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
+            utils.logger.warning(
+                f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
@@ -180,21 +184,23 @@ class BilibiliCrawler(AbstractCrawler):
else:
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
                 # per-day timestamp parameters for the crawl
-                pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
+                pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'),
+                                                                                 end=day.strftime('%Y-%m-%d'))
                 page = 1
-                #! When an exception occurs (usually because the day's data is empty), this while block automatically jumps to the next day, so that as many of the keyword's videos for that day are crawled as possible
-                #! Keep only the existing try / except Exception; do NOT add any other exception handling!!! Otherwise this code breaks: it will only crawl a single day and never move on to the next
-                #! Do not modify this block unless you refactor its logic to achieve the same behavior!!!
+                # ! When an exception occurs (usually because the day's data is empty), this while block automatically jumps to the next day, so that as many of the keyword's videos for that day are crawled as possible
+                # ! Keep only the existing try / except Exception; do NOT add any other exception handling!!! Otherwise this code breaks: it will only crawl a single day and never move on to the next
+                # ! Do not modify this block unless you refactor its logic to achieve the same behavior!!!
                 while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                    #! Catch any error if response return nothing, go to next day
+                    # ! Catch any error if response return nothing, go to next day
                     try:
-                        #! Don't skip any page, to make sure gather all video in one day
+                        # ! Don't skip any page, to make sure gather all video in one day
                         # if page < start_page:
                         #     utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
                         #     page += 1
                         #     continue
-                        utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
+                        utils.logger.info(
+                            f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
@@ -207,7 +213,9 @@ class BilibiliCrawler(AbstractCrawler):
video_list: List[Dict] = videos_res.get("result")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                        task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+                        task_list = [
+                            self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for
+                            video_item in video_list]
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
@@ -467,7 +475,6 @@ class BilibiliCrawler(AbstractCrawler):
extension_file_name = f"video.mp4"
await bilibili_store.store_video(aid, content, extension_file_name)
async def get_all_creator_details(self, creator_id_list: List[int]):
"""
creator_id_list: get details for creator from creator_id_list
@@ -485,7 +492,8 @@ class BilibiliCrawler(AbstractCrawler):
creator_id, semaphore), name=creator_id)
task_list.append(task)
except Exception as e:
-            utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
+            utils.logger.warning(
+                f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
await asyncio.gather(*task_list)
@@ -504,8 +512,9 @@ class BilibiliCrawler(AbstractCrawler):
"sign": creator_unhandled_info.get("sign"),
"avatar": creator_unhandled_info.get("face"),
}
-            await self.get_fans(creator_info, semaphore)
-            await self.get_followings(creator_info, semaphore)
+            # await self.get_fans(creator_info, semaphore)
+            # await self.get_followings(creator_info, semaphore)
+            await self.get_dynamics(creator_info, semaphore)
async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
"""
@@ -523,7 +532,7 @@ class BilibiliCrawler(AbstractCrawler):
creator_info=creator_info,
crawl_interval=random.random(),
callback=bilibili_store.batch_update_bilibili_creator_fans,
-                    max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
+                    max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)
except DataFetchError as ex:
@@ -549,7 +558,7 @@ class BilibiliCrawler(AbstractCrawler):
creator_info=creator_info,
crawl_interval=random.random(),
callback=bilibili_store.batch_update_bilibili_creator_followings,
-                    max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
+                    max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)
except DataFetchError as ex:
@@ -558,3 +567,29 @@ class BilibiliCrawler(AbstractCrawler):
except Exception as e:
utils.logger.error(
f"[BilibiliCrawler.get_followings] may be been blocked, err:{e}")
+    async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
+        """
+        get dynamics for creator id
+        :param creator_info:
+        :param semaphore:
+        :return:
+        """
+        creator_id = creator_info["id"]
+        async with semaphore:
+            try:
+                utils.logger.info(
+                    f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
+                await self.bili_client.get_creator_all_dynamics(
+                    creator_info=creator_info,
+                    crawl_interval=random.random(),
+                    callback=bilibili_store.batch_update_bilibili_creator_dynamics,
+                    max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
+                )
+            except DataFetchError as ex:
+                utils.logger.error(
+                    f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {ex}")
+            except Exception as e:
+                utils.logger.error(
+                    f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{e}")
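The limits referenced above, config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES and config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES, come from the project config module, one of the 8 changed files not shown in this view. A sketch of what those entries plausibly look like; the constant names are taken from the diff, but the file location, values, and comments are assumptions:

# config module (location and values assumed; not shown in this diff)

# max fans/followings (contacts) fetched per creator
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100

# max dynamics fetched per creator
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 20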