快手指定创作者主页和二级评论

This commit is contained in:
HIRO
2024-06-13 14:49:07 +08:00
parent 645ec729f6
commit a001556ba7
12 changed files with 401 additions and 27 deletions

View File

@@ -65,11 +65,14 @@ class KuaishouCrawler(AbstractCrawler):
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
# Search for videos and retrieve their comment information.
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_videos()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their videos and comments
await self.get_creators_and_videos()
else:
pass
@@ -135,7 +138,7 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}")
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}")
return None
async def batch_get_video_comments(self, video_id_list: List[str]):
@@ -145,7 +148,7 @@ class KuaishouCrawler(AbstractCrawler):
:return:
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled")
return
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
@@ -200,10 +203,10 @@ class KuaishouCrawler(AbstractCrawler):
return playwright_proxy, httpx_proxy
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
"""Create xhs client"""
"""Create ks client"""
utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
xhs_client_obj = KuaiShouClient(
ks_client_obj = KuaiShouClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
@@ -215,7 +218,7 @@ class KuaishouCrawler(AbstractCrawler):
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
return xhs_client_obj
return ks_client_obj
async def launch_browser(
self,
@@ -246,6 +249,39 @@ class KuaishouCrawler(AbstractCrawler):
)
return browser_context
async def get_creators_and_videos(self) -> None:
    """Crawl every configured creator: profile, videos, then video comments.

    For each user id in ``config.KS_CREATOR_ID_LIST``:

    1. Fetch the creator info via ``self.ks_client.get_creator_info`` and,
       when the result is truthy, persist it with
       ``kuaishou_store.save_creator``.
    2. Collect the creator's videos via
       ``self.ks_client.get_all_videos_by_creator``, handing it
       ``self.fetch_creator_video_detail`` as ``callback``.
    3. Pass the ids of the collected videos to
       ``self.batch_get_video_comments``.

    Entries whose ``photo`` mapping or ``id`` is missing are left out of the
    id list, so the comment crawler never receives ``None`` as a video id.
    """
    utils.logger.info("[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators")
    for user_id in config.KS_CREATOR_ID_LIST:
        # Get creator detail info from web html content.
        creator_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
        if creator_info:
            await kuaishou_store.save_creator(user_id, creator=creator_info)

        # Get all video information of the creator.
        # NOTE(review): assumed to run for every creator, even when no
        # creator info was returned above -- confirm against the caller's intent.
        # crawl_interval is a float in [0, 1); presumably a delay in seconds
        # between page requests -- TODO confirm in the client.
        all_video_list = await self.ks_client.get_all_videos_by_creator(
            user_id=user_id,
            crawl_interval=random.random(),
            callback=self.fetch_creator_video_detail,
        )

        # Keep only entries that actually carry a video id. ``or {}`` also
        # covers a "photo" key that is present but set to None.
        video_ids = []
        for video_item in all_video_list or []:
            video_id = (video_item.get("photo") or {}).get("id")
            if video_id:
                video_ids.append(video_id)
        await self.batch_get_video_comments(video_ids)
async def fetch_creator_video_detail(self, video_list: List[Dict]):
    """
    Concurrently fetch the detail of each video in ``video_list`` and save it.

    :param video_list: video entries; the video id is read from
        ``entry["photo"]["id"]``. Entries without a usable id are skipped
        instead of triggering a detail request for ``None``.
    :return: None. Every detail that is not ``None`` is stored through
        ``kuaishou_store.update_kuaishou_video``.
    """
    # One semaphore is shared by all tasks of this call; its size comes from
    # config.MAX_CONCURRENCY_NUM.
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)

    # ``or {}`` also covers a "photo" key that is present but set to None.
    video_ids = []
    for post_item in video_list or []:
        video_id = (post_item.get("photo") or {}).get("id")
        if video_id:
            video_ids.append(video_id)

    task_list = [
        self.get_video_info_task(video_id, semaphore) for video_id in video_ids
    ]
    video_details = await asyncio.gather(*task_list)
    for video_detail in video_details:
        # A None detail is not persisted.
        if video_detail is not None:
            await kuaishou_store.update_kuaishou_video(video_detail)
async def close(self):
"""Close browser context"""
await self.browser_context.close()