mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-02-26 18:20:47 +08:00
feat: kuaishou support url link
This commit is contained in:
@@ -38,7 +38,7 @@ SAVE_LOGIN_STATE = True
|
||||
# Whether to enable CDP mode - crawl through the user's existing Chrome/Edge browser for better anti-detection.
# When enabled, the user's Chrome/Edge browser is detected and launched automatically and driven over the CDP protocol.
# This uses a real browser environment (the user's extensions, cookies and settings), which greatly lowers the risk of detection.
ENABLE_CDP_MODE = True
|
||||
|
||||
# CDP调试端口,用于与浏览器通信
|
||||
# 如果端口被占用,系统会自动尝试下一个可用端口
|
||||
|
||||
@@ -10,11 +10,22 @@
|
||||
|
||||
# 快手平台配置
|
||||
|
||||
# Kuaishou videos to crawl (each entry may be a full URL or a bare video ID).
# Supported formats:
# 1. Full video URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
# 2. Bare video ID: "3xf8enb8dbj6uig"
KS_SPECIFIED_ID_LIST = [
    "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
    "3xf8enb8dbj6uig",
    # ........................
]

# Kuaishou creators to crawl (each entry may be a full URL or a bare user ID).
# Supported formats:
# 1. Creator profile URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
# 2. Bare user_id: "3x4sm73aye7jq7i"
KS_CREATOR_ID_LIST = [
    "https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
    "3x4sm73aye7jq7i",
    # ........................
]
|
||||
|
||||
@@ -26,6 +26,7 @@ from playwright.async_api import (
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import kuaishou as kuaishou_store
|
||||
from tools import utils
|
||||
@@ -34,6 +35,7 @@ from var import comment_tasks_var, crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import KuaiShouClient
|
||||
from .exception import DataFetchError
|
||||
from .help import parse_video_info_from_url, parse_creator_info_from_url
|
||||
from .login import KuaishouLogin
|
||||
|
||||
|
||||
@@ -168,16 +170,27 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
|
||||
async def get_specified_videos(self):
    """Fetch details and comments for the videos listed in config.KS_SPECIFIED_ID_LIST.

    Each configured entry is passed through parse_video_info_from_url so that
    both full video URLs and bare IDs are accepted. Entries that fail to parse
    are logged and skipped. Only the successfully parsed IDs are used for the
    detail requests and for the comment crawl.
    """
    utils.logger.info("[KuaishouCrawler.get_specified_videos] Parsing video URLs...")
    video_ids = []
    for video_url in config.KS_SPECIFIED_ID_LIST:
        try:
            video_info = parse_video_info_from_url(video_url)
        except ValueError as e:
            # A bad entry must not abort the whole run; report it and move on.
            utils.logger.error(f"Failed to parse video URL: {e}")
            continue
        video_ids.append(video_info.video_id)
        utils.logger.info(f"Parsed video ID: {video_info.video_id} from {video_url}")

    # The semaphore is handed to every task to bound concurrent detail requests.
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list = [
        self.get_video_info_task(video_id=video_id, semaphore=semaphore)
        for video_id in video_ids
    ]
    video_details = await asyncio.gather(*task_list)
    for video_detail in video_details:
        # Tasks may yield None; only store real results.
        if video_detail is not None:
            await kuaishou_store.update_kuaishou_video(video_detail)
    # Comments are fetched with the parsed IDs, never with the raw config entries.
    await self.batch_get_video_comments(video_ids)
|
||||
|
||||
async def get_video_info_task(
|
||||
self, video_id: str, semaphore: asyncio.Semaphore
|
||||
@@ -367,11 +380,20 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
utils.logger.info(
|
||||
"[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
|
||||
)
|
||||
for user_id in config.KS_CREATOR_ID_LIST:
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
|
||||
if createor_info:
|
||||
await kuaishou_store.save_creator(user_id, creator=createor_info)
|
||||
for creator_url in config.KS_CREATOR_ID_LIST:
|
||||
try:
|
||||
# Parse creator URL to get user_id
|
||||
creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
|
||||
utils.logger.info(f"[KuaiShouCrawler.get_creators_and_videos] Parse creator URL info: {creator_info}")
|
||||
user_id = creator_info.user_id
|
||||
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
|
||||
if createor_info:
|
||||
await kuaishou_store.save_creator(user_id, creator=createor_info)
|
||||
except ValueError as e:
|
||||
utils.logger.error(f"[KuaiShouCrawler.get_creators_and_videos] Failed to parse creator URL: {e}")
|
||||
continue
|
||||
|
||||
# Get all video information of the creator
|
||||
all_video_list = await self.ks_client.get_all_videos_by_creator(
|
||||
|
||||
99
media_platform/kuaishou/help.py
Normal file
99
media_platform/kuaishou/help.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
|
||||
|
||||
|
||||
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
    """
    Parse a Kuaishou video ID out of a video URL or a bare video ID.

    Supported inputs:
        1. Full video URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
        2. Bare video ID: "3x3zxz4mjrsc8ke"

    Args:
        url: Kuaishou video link or video ID. Surrounding whitespace is ignored.

    Returns:
        VideoUrlInfo: object carrying the extracted video ID.

    Raises:
        ValueError: if the input is empty or blank, or if it looks like a URL
            but contains no "/short-video/<id>" segment.
    """
    candidate = url.strip()
    # A blank entry can never be a valid ID; reject it instead of passing "" downstream.
    if not candidate:
        raise ValueError(f"无法从URL中解析出视频ID: {url}")

    # No "http" prefix and no kuaishou.com host: treat the value as a bare ID.
    if not candidate.startswith("http") and "kuaishou.com" not in candidate:
        return VideoUrlInfo(video_id=candidate, url_type="normal")

    # Standard video URL: the ID is the path segment right after /short-video/.
    match = re.search(r'/short-video/([a-zA-Z0-9_-]+)', candidate)
    if match:
        return VideoUrlInfo(video_id=match.group(1), url_type="normal")

    raise ValueError(f"无法从URL中解析出视频ID: {url}")
|
||||
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Parse a Kuaishou creator ID out of a profile URL or a bare user ID.

    Supported inputs:
        1. Creator profile URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
        2. Bare ID: "3x4sm73aye7jq7i"

    Args:
        url: Kuaishou creator profile link or user_id. Surrounding whitespace is ignored.

    Returns:
        CreatorUrlInfo: object carrying the extracted creator ID.

    Raises:
        ValueError: if the input is empty or blank, or if it looks like a URL
            but contains no "/profile/<id>" segment.
    """
    candidate = url.strip()
    # A blank entry can never be a valid ID; reject it instead of passing "" downstream.
    if not candidate:
        raise ValueError(f"无法从URL中解析出创作者ID: {url}")

    # No "http" prefix and no kuaishou.com host: treat the value as a bare ID.
    if not candidate.startswith("http") and "kuaishou.com" not in candidate:
        return CreatorUrlInfo(user_id=candidate)

    # Profile URL: the user_id is the path segment right after /profile/.
    match = re.search(r'/profile/([a-zA-Z0-9_-]+)', candidate)
    if match:
        return CreatorUrlInfo(user_id=match.group(1))

    raise ValueError(f"无法从URL中解析出创作者ID: {url}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: run each parser over a few sample inputs and print
    # either the parsed result or the error it raised.
    _demo_cases = (
        (
            "=== 视频URL解析测试 ===",
            parse_video_info_from_url,
            [
                "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
                "3xf8enb8dbj6uig",
            ],
        ),
        (
            "=== 创作者URL解析测试 ===",
            parse_creator_info_from_url,
            [
                "https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
                "3x4sm73aye7jq7i",
            ],
        ),
    )

    for banner, parser, samples in _demo_cases:
        print(banner)
        for sample in samples:
            try:
                parsed = parser(sample)
                # Long URLs are truncated to 80 characters for display only.
                print(f"✓ URL: {sample[:80]}...")
                print(f" 结果: {parsed}\n")
            except Exception as exc:
                print(f"✗ URL: {sample}")
                print(f" 错误: {exc}\n")
|
||||
@@ -1,12 +1,25 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class VideoUrlInfo(BaseModel):
    """Kuaishou video URL information."""
    # ID of the video; the field title also labels it as a "photo id".
    video_id: str = Field(title="video id (photo id)")
    # Kind of URL the ID came from; defaults to "normal".
    url_type: str = Field(default="normal", title="url type: normal")
|
||||
|
||||
|
||||
class CreatorUrlInfo(BaseModel):
    """Kuaishou creator URL information."""
    # ID of the creator (user); required, no default.
    user_id: str = Field(title="user id (creator id)")
|
||||
|
||||
Reference in New Issue
Block a user