mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-04-21 19:27:40 +08:00
fix_words
This commit is contained in:
@@ -341,7 +341,8 @@ class BilibiliClient(AbstractApiClient):
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_creator_info(self, creator_id: int) -> Dict:
|
||||
"""get creator info
|
||||
"""
|
||||
get creator info
|
||||
:param creator_id: 作者 ID
|
||||
"""
|
||||
uri = "/x/space/wbi/acc/info"
|
||||
@@ -355,7 +356,8 @@ class BilibiliClient(AbstractApiClient):
|
||||
pn: int,
|
||||
ps: int = 24,
|
||||
) -> Dict:
|
||||
"""get video comments
|
||||
"""
|
||||
get creator fans
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 开始页数
|
||||
:param ps: 每页数量
|
||||
@@ -376,7 +378,8 @@ class BilibiliClient(AbstractApiClient):
|
||||
pn: int,
|
||||
ps: int = 24,
|
||||
) -> Dict:
|
||||
"""get video comments
|
||||
"""
|
||||
get creator followings
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 开始页数
|
||||
:param ps: 每页数量
|
||||
@@ -391,11 +394,27 @@ class BilibiliClient(AbstractApiClient):
|
||||
}
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_creator_dynamics(self, creator_id: int, offset: str = "") -> Dict:
    """
    Request one page of a creator's dynamics feed.

    :param creator_id: creator ID, sent as ``host_mid``
    :param offset: paging cursor sent with the request; an empty string
        is the default. Callers in this file pass back the ``offset``
        value of the previous response to get the following page.
    :return: whatever ``self.get`` returns for the feed endpoint; callers
        in this file read its ``items``, ``has_more`` and ``offset`` keys
    """
    uri = "/x/polymer/web-dynamic/v1/feed/space"
    # Query parameters for the request issued through ``self.get``.
    params = {
        "offset": offset,
        "host_mid": creator_id,
        "platform": "web",
    }
    return await self.get(uri, params)
|
||||
|
||||
async def get_creator_all_fans(self, creator_info: Dict, crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 100) -> List:
|
||||
"""
|
||||
get video all comments include sub comments
|
||||
get creator all fans
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
@@ -419,16 +438,13 @@ class BilibiliClient(AbstractApiClient):
|
||||
if not fans_list:
|
||||
break
|
||||
result.extend(fans_list)
|
||||
utils.logger.info(
|
||||
f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans successfully")
|
||||
|
||||
return result
|
||||
|
||||
async def get_creator_all_followings(self, creator_info: Dict, crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 100) -> List:
|
||||
"""
|
||||
get video all comments include sub comments
|
||||
get creator all followings
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
@@ -452,7 +468,33 @@ class BilibiliClient(AbstractApiClient):
|
||||
if not followings_list:
|
||||
break
|
||||
result.extend(followings_list)
|
||||
utils.logger.info(
|
||||
f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings successfully")
|
||||
|
||||
return result
|
||||
|
||||
async def get_creator_all_dynamics(self, creator_info: Dict, crawl_interval: float = 1.0,
                                   callback: Optional[Callable] = None,
                                   max_count: int = 20) -> List:
    """
    Collect a creator's dynamics page by page, up to ``max_count`` entries.

    :param creator_info: creator record; its ``"id"`` value identifies the creator
    :param crawl_interval: seconds to sleep after each page is fetched
    :param callback: optional coroutine function, awaited as
        ``callback(creator_info, page_items)`` for every non-empty page
    :param max_count: maximum number of dynamics to collect for one creator
    :return: list of the collected dynamics (at most ``max_count`` items)
    """
    creator_id = creator_info["id"]
    result: List[Dict] = []
    offset = ""
    has_more = True
    while has_more and len(result) < max_count:
        dynamics_res = await self.get_creator_dynamics(creator_id, offset)
        # Guard against a null response or a missing/null ``items`` field
        # instead of failing on ``len(None)``.
        dynamics_list: List[Dict] = (dynamics_res or {}).get("items") or []
        if not dynamics_list:
            # An empty page yields nothing to store; stop here (as the
            # fans/followings pagers do) so the loop cannot spin forever
            # re-requesting a page that never fills up.
            break
        has_more = dynamics_res.get("has_more", False)
        offset = dynamics_res.get("offset", "")
        # Trim the final page so the total never exceeds max_count.
        # ``remaining`` is always positive inside the loop.
        remaining = max_count - len(result)
        dynamics_list = dynamics_list[:remaining]
        if callback:
            await callback(creator_info, dynamics_list)
        await asyncio.sleep(crawl_interval)
        result.extend(dynamics_list)
    return result
|
||||
|
||||
@@ -119,14 +119,16 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
|
||||
end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
|
||||
if start_day > end_day:
|
||||
raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
|
||||
raise ValueError(
|
||||
'Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
|
||||
elif start_day == end_day: # 搜索同一天的内容
|
||||
end_day = start_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
|
||||
end_day = start_day + timedelta(days=1) - timedelta(
|
||||
seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
|
||||
else: # 搜索 start 至 end
|
||||
end_day = end_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 end_day + 1 day - 1 second
|
||||
# 将其重新转换为时间戳
|
||||
return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
|
||||
|
||||
|
||||
async def search(self):
|
||||
"""
|
||||
search bilibili video with keywords
|
||||
@@ -164,9 +166,11 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = []
|
||||
try:
|
||||
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
|
||||
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
|
||||
for video_item in video_list]
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
|
||||
utils.logger.warning(
|
||||
f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
|
||||
video_items = await asyncio.gather(*task_list)
|
||||
for video_item in video_items:
|
||||
if video_item:
|
||||
@@ -180,21 +184,23 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
else:
|
||||
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
|
||||
# 按照每一天进行爬取的时间戳参数
|
||||
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
|
||||
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'),
|
||||
end=day.strftime('%Y-%m-%d'))
|
||||
page = 1
|
||||
#!该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频
|
||||
#!除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天
|
||||
#!除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!!
|
||||
# !该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频
|
||||
# !除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天
|
||||
# !除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!!
|
||||
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
#! Catch any error if response return nothing, go to next day
|
||||
# ! Catch any error if response return nothing, go to next day
|
||||
try:
|
||||
#! Don't skip any page, to make sure gather all video in one day
|
||||
# ! Don't skip any page, to make sure gather all video in one day
|
||||
# if page < start_page:
|
||||
# utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
|
||||
# page += 1
|
||||
# continue
|
||||
|
||||
utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
|
||||
utils.logger.info(
|
||||
f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
|
||||
video_id_list: List[str] = []
|
||||
videos_res = await self.bili_client.search_video_by_keyword(
|
||||
keyword=keyword,
|
||||
@@ -207,7 +213,9 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
video_list: List[Dict] = videos_res.get("result")
|
||||
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
|
||||
task_list = [
|
||||
self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for
|
||||
video_item in video_list]
|
||||
video_items = await asyncio.gather(*task_list)
|
||||
for video_item in video_items:
|
||||
if video_item:
|
||||
@@ -467,7 +475,6 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
extension_file_name = f"video.mp4"
|
||||
await bilibili_store.store_video(aid, content, extension_file_name)
|
||||
|
||||
|
||||
async def get_all_creator_details(self, creator_id_list: List[int]):
|
||||
"""
|
||||
creator_id_list: get details for creator from creator_id_list
|
||||
@@ -485,7 +492,8 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
creator_id, semaphore), name=creator_id)
|
||||
task_list.append(task)
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
|
||||
utils.logger.warning(
|
||||
f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
|
||||
|
||||
await asyncio.gather(*task_list)
|
||||
|
||||
@@ -504,8 +512,9 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
"sign": creator_unhandled_info.get("sign"),
|
||||
"avatar": creator_unhandled_info.get("face"),
|
||||
}
|
||||
await self.get_fans(creator_info, semaphore)
|
||||
await self.get_followings(creator_info, semaphore)
|
||||
# await self.get_fans(creator_info, semaphore)
|
||||
# await self.get_followings(creator_info, semaphore)
|
||||
await self.get_dynamics(creator_info, semaphore)
|
||||
|
||||
async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
|
||||
"""
|
||||
@@ -523,7 +532,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
creator_info=creator_info,
|
||||
crawl_interval=random.random(),
|
||||
callback=bilibili_store.batch_update_bilibili_creator_fans,
|
||||
max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
|
||||
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
except DataFetchError as ex:
|
||||
@@ -549,7 +558,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
creator_info=creator_info,
|
||||
crawl_interval=random.random(),
|
||||
callback=bilibili_store.batch_update_bilibili_creator_followings,
|
||||
max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
|
||||
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
except DataFetchError as ex:
|
||||
@@ -558,3 +567,29 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
except Exception as e:
|
||||
utils.logger.error(
|
||||
f"[BilibiliCrawler.get_followings] may be been blocked, err:{e}")
|
||||
|
||||
async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """
    Crawl the dynamics of a single creator while holding ``semaphore``.

    :param creator_info: creator record; its ``"id"`` value is used in log messages
        and the whole record is forwarded to the client and the store callback
    :param semaphore: semaphore held for the duration of the crawl
    :return: None; errors are logged rather than raised
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(
                f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
            # Arguments for the paging helper: a random sub-second delay
            # between pages, and the store function as per-page callback.
            fetch_kwargs = dict(
                creator_info=creator_info,
                crawl_interval=random.random(),
                callback=bilibili_store.batch_update_bilibili_creator_dynamics,
                max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
            )
            await self.bili_client.get_creator_all_dynamics(**fetch_kwargs)
        except DataFetchError as fetch_err:
            utils.logger.error(
                f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {fetch_err}")
        except Exception as err:
            # Any other failure is only logged, so one creator cannot abort the run.
            utils.logger.error(
                f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{err}")
|
||||
|
||||
Reference in New Issue
Block a user