# -*- coding: utf-8 -*- # Copyright (c) 2025 relakkes@gmail.com # # This file is part of MediaCrawler project. # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/douyin/help.py # GitHub: https://github.com/NanmiCoder # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1 # # 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: # 1. 不得用于任何商业用途。 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 # 3. 不得进行大规模爬取或对平台造成运营干扰。 # 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 # 5. 不得用于任何非法或不当的用途。 # # 详细许可条款请参阅项目根目录下的LICENSE文件。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com # @Name : 程序员阿江-Relakkes # @Time : 2024/6/10 02:24 # @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除 import random import re from typing import Optional import execjs from playwright.async_api import Page from model.m_douyin import VideoUrlInfo, CreatorUrlInfo from tools.crawler_util import extract_url_params_to_dict douyin_sign_obj = execjs.compile(open('libs/douyin.js', encoding='utf-8-sig').read()) def get_web_id(): """ 生成随机的webid Returns: """ def e(t): if t is not None: return str(t ^ (int(16 * random.random()) >> (t // 4))) else: return ''.join( [str(int(1e7)), '-', str(int(1e3)), '-', str(int(4e3)), '-', str(int(8e3)), '-', str(int(1e11))] ) web_id = ''.join( e(int(x)) if x in '018' else x for x in e(None) ) return web_id.replace('-', '')[:19] async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Page = None): """ 获取 a_bogus 参数, 目前不支持post请求类型的签名 """ return get_a_bogus_from_js(url, params, user_agent) def get_a_bogus_from_js(url: str, params: str, user_agent: str): """ 通过js获取 a_bogus 参数 Args: url: params: user_agent: Returns: """ sign_js_name = "sign_datail" if "/reply" in url: sign_js_name = "sign_reply" return douyin_sign_obj.call(sign_js_name, params, user_agent) async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page): """ 通过playright获取 a_bogus 参数 playwright版本已失效 Returns: """ if not post_data: post_data = "" a_bogus = await page.evaluate( "([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])", [params, post_data, user_agent]) return a_bogus def parse_video_info_from_url(url: str) -> VideoUrlInfo: """ 从抖音视频URL中解析出视频ID 支持以下格式: 1. 普通视频链接: https://www.douyin.com/video/7525082444551310602 2. 带modal_id参数的链接: - https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?modal_id=7525082444551310602 - https://www.douyin.com/root/search/python?modal_id=7471165520058862848 3. 短链接: https://v.douyin.com/iF12345ABC/ (需要client解析) 4. 纯ID: 7525082444551310602 Args: url: 抖音视频链接或ID Returns: VideoUrlInfo: 包含视频ID的对象 """ # 如果是纯数字ID,直接返回 if url.isdigit(): return VideoUrlInfo(aweme_id=url, url_type="normal") # 检查是否是短链接 (v.douyin.com) if "v.douyin.com" in url or url.startswith("http") and len(url) < 50 and "video" not in url: return VideoUrlInfo(aweme_id="", url_type="short") # 需要通过client解析 # 尝试从URL参数中提取modal_id params = extract_url_params_to_dict(url) modal_id = params.get("modal_id") if modal_id: return VideoUrlInfo(aweme_id=modal_id, url_type="modal") # 从标准视频URL中提取ID: /video/数字 video_pattern = r'/video/(\d+)' match = re.search(video_pattern, url) if match: aweme_id = match.group(1) return VideoUrlInfo(aweme_id=aweme_id, url_type="normal") raise ValueError(f"无法从URL中解析出视频ID: {url}") def parse_creator_info_from_url(url: str) -> CreatorUrlInfo: """ 从抖音创作者主页URL中解析出创作者ID (sec_user_id) 支持以下格式: 1. 创作者主页: https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main 2. 纯ID: MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE Args: url: 抖音创作者主页链接或sec_user_id Returns: CreatorUrlInfo: 包含创作者ID的对象 """ # 如果是纯ID格式(通常以MS4wLjABAAAA开头),直接返回 if url.startswith("MS4wLjABAAAA") or (not url.startswith("http") and "douyin.com" not in url): return CreatorUrlInfo(sec_user_id=url) # 从创作者主页URL中提取sec_user_id: /user/xxx user_pattern = r'/user/([^/?]+)' match = re.search(user_pattern, url) if match: sec_user_id = match.group(1) return CreatorUrlInfo(sec_user_id=sec_user_id) raise ValueError(f"无法从URL中解析出创作者ID: {url}") if __name__ == '__main__': # 测试视频URL解析 print("=== 视频URL解析测试 ===") test_urls = [ "https://www.douyin.com/video/7525082444551310602", "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525082444551310602", "https://www.douyin.com/root/search/python?aid=b733a3b0-4662-4639-9a72-c2318fba9f3f&modal_id=7471165520058862848&type=general", "7525082444551310602", ] for url in test_urls: try: result = parse_video_info_from_url(url) print(f"✓ URL: {url[:80]}...") print(f" 结果: {result}\n") except Exception as e: print(f"✗ URL: {url}") print(f" 错误: {e}\n") # 测试创作者URL解析 print("=== 创作者URL解析测试 ===") test_creator_urls = [ "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main", "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE", ] for url in test_creator_urls: try: result = parse_creator_info_from_url(url) print(f"✓ URL: {url[:80]}...") print(f" 结果: {result}\n") except Exception as e: print(f"✗ URL: {url}") print(f" 错误: {e}\n")