From ae7955787c8e3be7e5b32f62db14a1fed37d50aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?=
 =?UTF-8?q?=29?=
Date: Sat, 18 Oct 2025 07:40:10 +0800
Subject: [PATCH] feat: kuaishou support url link

---
 config/base_config.py           |   2 +-
 config/ks_config.py             |  17 ++++-
 media_platform/kuaishou/core.py |  37 +++++++++--
 media_platform/kuaishou/help.py | 111 ++++++++++++++++++++++++++++++++
 model/m_kuaishou.py             |  29 ++++++---
 5 files changed, 177 insertions(+), 19 deletions(-)
 create mode 100644 media_platform/kuaishou/help.py

diff --git a/config/base_config.py b/config/base_config.py
index af0592b..1722ae1 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -38,7 +38,7 @@ SAVE_LOGIN_STATE = True
 # 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
 # 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
 # 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
-ENABLE_CDP_MODE = False
+ENABLE_CDP_MODE = True
 
 # CDP调试端口,用于与浏览器通信
 # 如果端口被占用,系统会自动尝试下一个可用端口
diff --git a/config/ks_config.py b/config/ks_config.py
index 962b457..d84d4a7 100644
--- a/config/ks_config.py
+++ b/config/ks_config.py
@@ -10,11 +10,22 @@
 
 # 快手平台配置
 
-# 指定快手视频ID列表
-KS_SPECIFIED_ID_LIST = ["3xf8enb8dbj6uig", "3x6zz972bchmvqe"]
+# Specified Kuaishou video URL list (full URLs or bare IDs are supported)
+# Supported formats:
+# 1. Full video URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
+# 2. Bare video ID: "3xf8enb8dbj6uig"
+KS_SPECIFIED_ID_LIST = [
+    "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
+    "3xf8enb8dbj6uig",
+    # ........................
+]
 
-# 指定快手用户ID列表
+# Specified Kuaishou creator URL list (full URLs or bare IDs are supported)
+# Supported formats:
+# 1. Creator profile URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
+# 2. Bare user_id: "3x4sm73aye7jq7i"
 KS_CREATOR_ID_LIST = [
+    "https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
     "3x4sm73aye7jq7i",
     # ........................
 ]
diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py
index cdbe373..9e11a7f 100644
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@@ -26,6 +26,7 @@ from playwright.async_api import (
 
 import config
 from base.base_crawler import AbstractCrawler
+from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import kuaishou as kuaishou_store
 from tools import utils
@@ -34,6 +35,7 @@ from var import comment_tasks_var, crawler_type_var, source_keyword_var
 
 from .client import KuaiShouClient
 from .exception import DataFetchError
+from .help import parse_video_info_from_url, parse_creator_info_from_url
 from .login import KuaishouLogin
 
 
@@ -168,16 +170,27 @@ class KuaishouCrawler(AbstractCrawler):
 
     async def get_specified_videos(self):
         """Get the information and comments of the specified post"""
+        utils.logger.info("[KuaishouCrawler.get_specified_videos] Parsing video URLs...")
+        video_ids = []
+        for video_url in config.KS_SPECIFIED_ID_LIST:
+            try:
+                video_info = parse_video_info_from_url(video_url)
+                video_ids.append(video_info.video_id)
+                utils.logger.info(f"Parsed video ID: {video_info.video_id} from {video_url}")
+            except ValueError as e:
+                utils.logger.error(f"Failed to parse video URL: {e}")
+                continue
+
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list = [
             self.get_video_info_task(video_id=video_id, semaphore=semaphore)
-            for video_id in config.KS_SPECIFIED_ID_LIST
+            for video_id in video_ids
         ]
         video_details = await asyncio.gather(*task_list)
         for video_detail in video_details:
             if video_detail is not None:
                 await kuaishou_store.update_kuaishou_video(video_detail)
-        await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)
+        await self.batch_get_video_comments(video_ids)
 
     async def get_video_info_task(
         self, video_id: str, semaphore: asyncio.Semaphore
@@ -367,11 +380,21 @@ class KuaishouCrawler(AbstractCrawler):
         utils.logger.info(
             "[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
         )
-        for user_id in config.KS_CREATOR_ID_LIST:
-            # get creator detail info from web html content
-            createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
-            if createor_info:
-                await kuaishou_store.save_creator(user_id, creator=createor_info)
+        for creator_url in config.KS_CREATOR_ID_LIST:
+            # Guard only the URL parsing: a ValueError raised by the client or
+            # store calls below must not be mis-reported as a URL parse failure.
+            try:
+                creator_url_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
+            except ValueError as e:
+                utils.logger.error(f"[KuaiShouCrawler.get_creators_and_videos] Failed to parse creator URL: {e}")
+                continue
+            utils.logger.info(f"[KuaiShouCrawler.get_creators_and_videos] Parse creator URL info: {creator_url_info}")
+            user_id = creator_url_info.user_id
+
+            # get creator detail info from web html content
+            createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
+            if createor_info:
+                await kuaishou_store.save_creator(user_id, creator=createor_info)
 
             # Get all video information of the creator
             all_video_list = await self.ks_client.get_all_videos_by_creator(
diff --git a/media_platform/kuaishou/help.py b/media_platform/kuaishou/help.py
new file mode 100644
index 0000000..5015f2d
--- /dev/null
+++ b/media_platform/kuaishou/help.py
@@ -0,0 +1,111 @@
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+
+# -*- coding: utf-8 -*-
+
+import re
+from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
+
+
+def parse_video_info_from_url(url: str) -> VideoUrlInfo:
+    """
+    Parse the video ID from a Kuaishou video URL or a bare video ID.
+
+    Supported formats:
+    1. Full video URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
+    2. Bare video ID: "3x3zxz4mjrsc8ke"
+
+    Args:
+        url: Kuaishou video link or bare video ID; surrounding whitespace is ignored
+    Returns:
+        VideoUrlInfo: object carrying the parsed video ID
+    Raises:
+        ValueError: if the input is empty or no video ID can be extracted from it
+    """
+    # Callers pass raw config entries, so tolerate stray surrounding whitespace
+    url = url.strip()
+
+    # A non-empty value that is neither an http(s) link nor mentions kuaishou.com is a bare ID
+    if url and not url.startswith("http") and "kuaishou.com" not in url:
+        return VideoUrlInfo(video_id=url, url_type="normal")
+
+    # Extract the ID from a standard video URL: /short-video/<video id>
+    video_pattern = r'/short-video/([a-zA-Z0-9_-]+)'
+    match = re.search(video_pattern, url)
+    if match:
+        video_id = match.group(1)
+        return VideoUrlInfo(video_id=video_id, url_type="normal")
+
+    raise ValueError(f"无法从URL中解析出视频ID: {url}")
+
+
+def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
+    """
+    Parse the creator's user ID from a Kuaishou profile URL or a bare user ID.
+
+    Supported formats:
+    1. Creator profile URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
+    2. Bare user ID: "3x4sm73aye7jq7i"
+
+    Args:
+        url: Kuaishou creator profile link or bare user_id; surrounding whitespace is ignored
+    Returns:
+        CreatorUrlInfo: object carrying the parsed user ID
+    Raises:
+        ValueError: if the input is empty or no user ID can be extracted from it
+    """
+    # Callers pass raw config entries, so tolerate stray surrounding whitespace
+    url = url.strip()
+
+    # A non-empty value that is neither an http(s) link nor mentions kuaishou.com is a bare ID
+    if url and not url.startswith("http") and "kuaishou.com" not in url:
+        return CreatorUrlInfo(user_id=url)
+
+    # Extract the user_id from a creator profile URL: /profile/<user id>
+    user_pattern = r'/profile/([a-zA-Z0-9_-]+)'
+    match = re.search(user_pattern, url)
+    if match:
+        user_id = match.group(1)
+        return CreatorUrlInfo(user_id=user_id)
+
+    raise ValueError(f"无法从URL中解析出创作者ID: {url}")
+
+
+if __name__ == '__main__':
+    # Manual smoke test: video URL parsing
+    print("=== 视频URL解析测试 ===")
+    test_video_urls = [
+        "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
+        "3xf8enb8dbj6uig",
+    ]
+    for url in test_video_urls:
+        try:
+            result = parse_video_info_from_url(url)
+            print(f"✓ URL: {url[:80]}...")
+            print(f" 结果: {result}\n")
+        except Exception as e:
+            print(f"✗ URL: {url}")
+            print(f" 错误: {e}\n")
+
+    # Manual smoke test: creator URL parsing
+    print("=== 创作者URL解析测试 ===")
+    test_creator_urls = [
+        "https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
+        "3x4sm73aye7jq7i",
+    ]
+    for url in test_creator_urls:
+        try:
+            result = parse_creator_info_from_url(url)
+            print(f"✓ URL: {url[:80]}...")
+            print(f" 结果: {result}\n")
+        except Exception as e:
+            print(f"✗ URL: {url}")
+            print(f" 错误: {e}\n")
diff --git a/model/m_kuaishou.py b/model/m_kuaishou.py
index e907b1d..b7c2080 100644
--- a/model/m_kuaishou.py
+++ b/model/m_kuaishou.py
@@ -1,12 +1,25 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
 # 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 
 
 # -*- coding: utf-8 -*-
+
+from pydantic import BaseModel, Field
+
+
+class VideoUrlInfo(BaseModel):
+    """Kuaishou video URL info"""
+    video_id: str = Field(title="video id (photo id)")
+    url_type: str = Field(default="normal", title="url type: normal")
+
+
+class CreatorUrlInfo(BaseModel):
+    """Kuaishou creator URL info"""
+    user_id: str = Field(title="user id (creator id)")