# File: MediaCrawler/media_platform/douyin/help.py
# Snapshot: 2026-01-18 22:22:31 +08:00
# Size: 199 lines, 6.8 KiB (Python)

# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/douyin/help.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Name : 程序员阿江-Relakkes
# @Time : 2024/6/10 02:24
# @Desc : Obtain the a_bogus parameter. For learning and communication only; do not use for commercial purposes. If this infringes on your rights, contact the author to have it removed.
import random
import re
from typing import Optional
import execjs
from playwright.async_api import Page
from model.m_douyin import VideoUrlInfo, CreatorUrlInfo
from tools.crawler_util import extract_url_params_to_dict
# Compile the bundled Douyin signing script once, at import time.
# The file is read inside a `with` block so the handle is closed right away
# instead of being left for the garbage collector. The "utf-8-sig" codec drops
# a leading BOM if the file has one. The path is relative, so it is resolved
# against the current working directory of the process.
with open('libs/douyin.js', encoding='utf-8-sig') as _douyin_js_file:
    douyin_sign_obj = execjs.compile(_douyin_js_file.read())
def get_web_id():
    """
    Generate a random webid.

    The fixed template '10000000-1000-4000-8000-100000000000' is walked
    character by character: every '0', '1' and '8' is replaced by a randomised
    number written in decimal, every other character is kept. The dashes are
    then removed and the result is cut to its first 19 characters.

    Returns:
        A 19-character string of decimal digits.
    """
    template = '-'.join(str(int(part)) for part in (1e7, 1e3, 4e3, 8e3, 1e11))

    def scramble(digit):
        # XOR the digit with a random value in 0..15 that has been shifted
        # right by digit // 4 bits.
        noise = int(16 * random.random())
        return str(digit ^ (noise >> (digit // 4)))

    pieces = []
    for ch in template:
        if ch in '018':
            pieces.append(scramble(int(ch)))
        else:
            pieces.append(ch)
    return ''.join(pieces).replace('-', '')[:19]
async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Optional[Page] = None):
    """
    Get the a_bogus parameter. Signing of POST request bodies is currently not supported.

    Args:
        url: request URL, forwarded to get_a_bogus_from_js.
        params: request parameters as a string, forwarded to get_a_bogus_from_js.
        post_data: accepted for interface compatibility; not used.
        user_agent: User-Agent string, forwarded to get_a_bogus_from_js.
        page: accepted for interface compatibility; not used. May be None.

    Returns:
        The value returned by get_a_bogus_from_js.
    """
    return get_a_bogus_from_js(url, params, user_agent)
def get_a_bogus_from_js(url: str, params: str, user_agent: str):
    """
    Compute the a_bogus parameter by calling into the compiled JS object.

    Args:
        url: request URL; a URL containing "/reply" selects the "sign_reply"
            JS function, any other URL selects "sign_datail".
        params: passed to the JS function as its first argument.
        user_agent: passed to the JS function as its second argument.

    Returns:
        Whatever the selected JS function returns.
    """
    # NOTE(review): "sign_datail" presumably matches the function name defined
    # in libs/douyin.js -- confirm there before "correcting" the spelling.
    js_function = "sign_reply" if "/reply" in url else "sign_datail"
    return douyin_sign_obj.call(js_function, params, user_agent)
async def get_a_bogus_from_playwright(params: str, post_data: dict, user_agent: str, page: Page):
    """
    Get the a_bogus parameter by evaluating a script inside a playwright page.
    The playwright version is deprecated.

    Args:
        params: passed to the in-page signing call.
        post_data: passed to the in-page signing call; a falsy value is sent as "".
        user_agent: passed to the in-page signing call.
        page: playwright page in which the script is evaluated.

    Returns:
        The result of the page evaluation.
    """
    script = "([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])"
    # A falsy post_data (None, empty dict, ...) is replaced by an empty string.
    script_args = [params, post_data or "", user_agent]
    return await page.evaluate(script, script_args)
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
    """
    Parse the video ID out of a Douyin video URL (or accept a bare ID).

    Supported inputs:
        1. Normal video link: https://www.douyin.com/video/7525082444551310602
        2. Link with a modal_id parameter:
            - https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?modal_id=7525082444551310602
            - https://www.douyin.com/root/search/python?modal_id=7471165520058862848
        3. Short link: https://v.douyin.com/iF12345ABC/ (requires client parsing)
        4. Pure ID: 7525082444551310602

    Args:
        url: Douyin video link or ID

    Returns:
        VideoUrlInfo: url_type is "normal", "short" or "modal"; aweme_id is
        empty for short links.

    Raises:
        ValueError: when none of the supported formats matches.
    """
    # A string made only of digits is taken to be the ID itself.
    if url.isdigit():
        return VideoUrlInfo(aweme_id=url, url_type="normal")

    # Short links: either the v.douyin.com host appears in the string, or it is
    # an http(s) URL under 50 characters that does not contain "video".
    on_short_domain = "v.douyin.com" in url
    short_http_link = url.startswith("http") and len(url) < 50 and "video" not in url
    if on_short_domain or short_http_link:
        # No ID available here; requires client parsing.
        return VideoUrlInfo(aweme_id="", url_type="short")

    # A non-empty modal_id query parameter takes priority over the path.
    modal_id = extract_url_params_to_dict(url).get("modal_id")
    if modal_id:
        return VideoUrlInfo(aweme_id=modal_id, url_type="modal")

    # Fall back to the standard path form: /video/<digits>
    match = re.search(r'/video/(\d+)', url)
    if match is None:
        raise ValueError(f"Unable to parse video ID from URL: {url}")
    return VideoUrlInfo(aweme_id=match.group(1), url_type="normal")
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Parse the creator ID (sec_user_id) out of a Douyin creator homepage URL.

    Supported inputs:
        1. Creator homepage: https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main
        2. Pure ID: MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE

    Args:
        url: Douyin creator homepage link or sec_user_id

    Returns:
        CreatorUrlInfo: Object containing the creator ID

    Raises:
        ValueError: when the input looks like a URL but has no /user/<id> segment.
    """
    # The input is treated as a bare ID when it starts with the usual
    # "MS4wLjABAAAA" prefix, or when it neither starts with "http" nor
    # mentions "douyin.com".
    looks_like_url = url.startswith("http") or "douyin.com" in url
    if url.startswith("MS4wLjABAAAA") or not looks_like_url:
        return CreatorUrlInfo(sec_user_id=url)

    # Creator homepage form: /user/<id>, where the ID ends at the next "/" or "?".
    match = re.search(r'/user/([^/?]+)', url)
    if match is None:
        raise ValueError(f"Unable to parse creator ID from URL: {url}")
    return CreatorUrlInfo(sec_user_id=match.group(1))
if __name__ == '__main__':
    # Manual smoke test. Each suite is (banner, parser under test, sample inputs);
    # suites run in order and print one success or failure entry per input.
    suites = [
        (
            "=== Video URL Parsing Test ===",
            parse_video_info_from_url,
            [
                "https://www.douyin.com/video/7525082444551310602",
                "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525082444551310602",
                "https://www.douyin.com/root/search/python?aid=b733a3b0-4662-4639-9a72-c2318fba9f3f&modal_id=7471165520058862848&type=general",
                "7525082444551310602",
            ],
        ),
        (
            "=== Creator URL Parsing Test ===",
            parse_creator_info_from_url,
            [
                "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",
                "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
            ],
        ),
    ]
    for banner, parser, samples in suites:
        print(banner)
        for url in samples:
            try:
                result = parser(url)
                # Only the first 80 characters of the input are echoed on success.
                print(f"✓ URL: {url[:80]}...")
                print(f" Result: {result}\n")
            except Exception as e:
                print(f"✗ URL: {url}")
                print(f" Error: {e}\n")