mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-08 19:07:33 +08:00
feat: kuaishou support url link
This commit is contained in:
@@ -26,6 +26,7 @@ from playwright.async_api import (
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import kuaishou as kuaishou_store
|
||||
from tools import utils
|
||||
@@ -34,6 +35,7 @@ from var import comment_tasks_var, crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import KuaiShouClient
|
||||
from .exception import DataFetchError
|
||||
from .help import parse_video_info_from_url, parse_creator_info_from_url
|
||||
from .login import KuaishouLogin
|
||||
|
||||
|
||||
@@ -168,16 +170,27 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
|
||||
async def get_specified_videos(self):
|
||||
"""Get the information and comments of the specified post"""
|
||||
utils.logger.info("[KuaishouCrawler.get_specified_videos] Parsing video URLs...")
|
||||
video_ids = []
|
||||
for video_url in config.KS_SPECIFIED_ID_LIST:
|
||||
try:
|
||||
video_info = parse_video_info_from_url(video_url)
|
||||
video_ids.append(video_info.video_id)
|
||||
utils.logger.info(f"Parsed video ID: {video_info.video_id} from {video_url}")
|
||||
except ValueError as e:
|
||||
utils.logger.error(f"Failed to parse video URL: {e}")
|
||||
continue
|
||||
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_video_info_task(video_id=video_id, semaphore=semaphore)
|
||||
for video_id in config.KS_SPECIFIED_ID_LIST
|
||||
for video_id in video_ids
|
||||
]
|
||||
video_details = await asyncio.gather(*task_list)
|
||||
for video_detail in video_details:
|
||||
if video_detail is not None:
|
||||
await kuaishou_store.update_kuaishou_video(video_detail)
|
||||
await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)
|
||||
await self.batch_get_video_comments(video_ids)
|
||||
|
||||
async def get_video_info_task(
|
||||
self, video_id: str, semaphore: asyncio.Semaphore
|
||||
@@ -367,11 +380,20 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
utils.logger.info(
|
||||
"[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
|
||||
)
|
||||
for user_id in config.KS_CREATOR_ID_LIST:
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
|
||||
if createor_info:
|
||||
await kuaishou_store.save_creator(user_id, creator=createor_info)
|
||||
for creator_url in config.KS_CREATOR_ID_LIST:
|
||||
try:
|
||||
# Parse creator URL to get user_id
|
||||
creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
|
||||
utils.logger.info(f"[KuaiShouCrawler.get_creators_and_videos] Parse creator URL info: {creator_info}")
|
||||
user_id = creator_info.user_id
|
||||
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
|
||||
if createor_info:
|
||||
await kuaishou_store.save_creator(user_id, creator=createor_info)
|
||||
except ValueError as e:
|
||||
utils.logger.error(f"[KuaiShouCrawler.get_creators_and_videos] Failed to parse creator URL: {e}")
|
||||
continue
|
||||
|
||||
# Get all video information of the creator
|
||||
all_video_list = await self.ks_client.get_all_videos_by_creator(
|
||||
|
||||
Reference in New Issue
Block a user