mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-03-03 20:50:47 +08:00
Update Chinese comments, variable descriptions, and metadata across multiple configuration and core files to English. This improves codebase accessibility for international developers. Additionally, removed the sponsorship section from README files.
458 lines
21 KiB
Python
458 lines
21 KiB
Python
# -*- coding: utf-8 -*-
|
||
# Copyright (c) 2025 relakkes@gmail.com
|
||
#
|
||
# This file is part of MediaCrawler project.
|
||
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/douyin/core.py
|
||
# GitHub: https://github.com/NanmiCoder
|
||
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||
#
|
||
|
||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||
# 1. 不得用于任何商业用途。
|
||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||
# 5. 不得用于任何非法或不当的用途。
|
||
#
|
||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||
|
||
import asyncio
|
||
import os
|
||
import random
|
||
from asyncio import Task
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
|
||
from playwright.async_api import (
|
||
BrowserContext,
|
||
BrowserType,
|
||
Page,
|
||
Playwright,
|
||
async_playwright,
|
||
)
|
||
|
||
import config
|
||
from base.base_crawler import AbstractCrawler
|
||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||
from store import douyin as douyin_store
|
||
from tools import utils
|
||
from tools.cdp_browser import CDPBrowserManager
|
||
from var import crawler_type_var, source_keyword_var
|
||
|
||
from .client import DouYinClient
|
||
from .exception import DataFetchError
|
||
from .field import PublishTimeType
|
||
from .help import parse_video_info_from_url, parse_creator_info_from_url
|
||
from .login import DouYinLogin
|
||
|
||
|
||
class DouYinCrawler(AbstractCrawler):
|
||
context_page: Page
|
||
dy_client: DouYinClient
|
||
browser_context: BrowserContext
|
||
cdp_manager: Optional[CDPBrowserManager]
|
||
|
||
def __init__(self) -> None:
    """Set the crawler's static defaults; no browser or network work happens here."""
    # Proxy IP pool used for automatic proxy refresh; start() fills it in
    # only when config.ENABLE_IP_PROXY is set.
    self.ip_proxy_pool = None
    # CDP browser manager; assigned by launch_browser_with_cdp().
    self.cdp_manager = None
    # Page that start() opens right after the browser context is created.
    self.index_url = "https://www.douyin.com"
|
||
|
||
async def start(self) -> None:
    """Run one complete crawl session.

    Steps, in order:
      1. Optionally create a proxy IP pool and take one proxy from it.
      2. Launch a browser context (CDP mode or standard Playwright mode).
      3. Open ``self.index_url`` and build the Douyin API client.
      4. Log in when ``dy_client.pong`` reports the session is not valid.
      5. Dispatch to the crawl routine selected by ``config.CRAWLER_TYPE``
         ("search", "detail" or "creator"); any other value runs nothing.
    """
    browser_proxy = None
    http_proxy = None
    if config.ENABLE_IP_PROXY:
        self.ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
        proxy_info: IpInfoModel = await self.ip_proxy_pool.get_proxy()
        browser_proxy, http_proxy = utils.format_proxy_info(proxy_info)

    async with async_playwright() as playwright:
        # Pick the startup mode from configuration.
        if not config.ENABLE_CDP_MODE:
            utils.logger.info("[DouYinCrawler] 使用标准模式启动浏览器")
            self.browser_context = await self.launch_browser(
                playwright.chromium,
                browser_proxy,
                user_agent=None,
                headless=config.HEADLESS,
            )
            # stealth.min.js is injected so the site is less likely to detect the crawler.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
        else:
            utils.logger.info("[DouYinCrawler] 使用CDP模式启动浏览器")
            self.browser_context = await self.launch_browser_with_cdp(
                playwright,
                browser_proxy,
                None,
                headless=config.CDP_HEADLESS,
            )

        self.context_page = await self.browser_context.new_page()
        await self.context_page.goto(self.index_url)

        self.dy_client = await self.create_douyin_client(http_proxy)
        session_ok = await self.dy_client.pong(browser_context=self.browser_context)
        if not session_ok:
            login_flow = DouYinLogin(
                login_type=config.LOGIN_TYPE,
                login_phone="",  # your phone number
                browser_context=self.browser_context,
                context_page=self.context_page,
                cookie_str=config.COOKIES,
            )
            await login_flow.begin()
            # Refresh the client's cookies from the browser after logging in.
            await self.dy_client.update_cookies(browser_context=self.browser_context)

        crawler_type_var.set(config.CRAWLER_TYPE)
        crawl_routines = {
            # Search for posts and retrieve their comments.
            "search": self.search,
            # Information and comments of the specified posts.
            "detail": self.get_specified_awemes,
            # Information and posts of the specified creators.
            "creator": self.get_creators_and_videos,
        }
        routine = crawl_routines.get(config.CRAWLER_TYPE)
        if routine is not None:
            await routine()

        utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")
|
||
|
||
async def search(self) -> None:
    """Search every keyword in ``config.KEYWORDS`` and store the posts found.

    For each comma-separated keyword the method pages through
    ``dy_client.search_info_by_keyword`` in steps of 10 results, starting at
    ``config.START_PAGE``, until ``config.CRAWLER_MAX_NOTES_COUNT`` is
    covered, the response is empty, or a request fails. Each post is stored,
    its media is downloaded via ``get_aweme_media`` and, per page, its
    comments are fetched via ``batch_get_note_comments``.

    Side effect: ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to 10 when it
    is configured lower than one page.
    """
    utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
    dy_limit_count = 10  # douyin limit page fixed value
    if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
    start_page = config.START_PAGE  # start page number
    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
        aweme_list: List[str] = []
        page = 0
        dy_search_id = ""
        while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            if page < start_page:
                utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
                page += 1
                continue
            try:
                utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
                posts_res = await self.dy_client.search_info_by_keyword(
                    keyword=keyword,
                    offset=page * dy_limit_count - dy_limit_count,
                    publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
                    search_id=dy_search_id,
                )
            except DataFetchError:
                utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
                break

            # A response without a "data" key is checked first: previously this
            # branch sat behind the empty-data check and could never be reached.
            if "data" not in posts_res:
                utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。")
                break
            page_posts = posts_res.get("data")
            if not page_posts:
                utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
                break

            page += 1
            # The log id of this response is passed as search_id on the next page request.
            dy_search_id = posts_res.get("extra", {}).get("logid", "")
            page_aweme_list = []
            for post_item in page_posts:
                # A result carries either a single post ("aweme_info") or a
                # collection ("aweme_mix_info"), whose first item is used.
                aweme_info: Optional[Dict] = post_item.get("aweme_info")
                if not aweme_info:
                    mix_items = (post_item.get("aweme_mix_info") or {}).get("mix_items")
                    # Guard against a missing, None or empty mix_items list
                    # instead of letting IndexError/AttributeError abort the crawl.
                    aweme_info = mix_items[0] if isinstance(mix_items, list) and mix_items else None
                if not aweme_info:
                    continue
                aweme_id = aweme_info.get("aweme_id", "")
                aweme_list.append(aweme_id)
                page_aweme_list.append(aweme_id)
                await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
                await self.get_aweme_media(aweme_item=aweme_info)

            # Batch get comments for the posts of the current page.
            await self.batch_get_note_comments(page_aweme_list)

            # Pause after each page; the message is logged before the pause
            # so the log reflects what the crawler is doing at that moment.
            utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
        utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
|
||
|
||
async def get_specified_awemes(self):
    """Crawl the posts listed in ``config.DY_SPECIFIED_ID_LIST``.

    Every entry is parsed with ``parse_video_info_from_url``; entries whose
    ``url_type`` is "short" are first resolved through
    ``dy_client.resolve_short_url``. Entries that cannot be parsed or
    resolved are logged and skipped. The details of the remaining posts are
    fetched concurrently, stored, their media downloaded, and finally their
    comments fetched.
    """
    utils.logger.info("[DouYinCrawler.get_specified_awemes] Parsing video URLs...")
    resolved_ids = []
    for video_url in config.DY_SPECIFIED_ID_LIST:
        try:
            parsed = parse_video_info_from_url(video_url)

            if parsed.url_type == "short":
                # Short links must be expanded before an aweme ID can be extracted.
                utils.logger.info(f"[DouYinCrawler.get_specified_awemes] Resolving short link: {video_url}")
                full_url = await self.dy_client.resolve_short_url(video_url)
                if not full_url:
                    utils.logger.error(f"[DouYinCrawler.get_specified_awemes] Failed to resolve short link: {video_url}")
                    continue
                parsed = parse_video_info_from_url(full_url)
                utils.logger.info(f"[DouYinCrawler.get_specified_awemes] Short link resolved to aweme ID: {parsed.aweme_id}")

            resolved_ids.append(parsed.aweme_id)
            utils.logger.info(f"[DouYinCrawler.get_specified_awemes] Parsed aweme ID: {parsed.aweme_id} from {video_url}")
        except ValueError as e:
            utils.logger.error(f"[DouYinCrawler.get_specified_awemes] Failed to parse video URL: {e}")

    # One shared semaphore caps how many detail requests run at once.
    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    detail_jobs = [self.get_aweme_detail(aweme_id=one_id, semaphore=limiter) for one_id in resolved_ids]
    for detail in await asyncio.gather(*detail_jobs):
        if detail is None:
            # get_aweme_detail returns None for posts it failed to fetch.
            continue
        await douyin_store.update_douyin_aweme(aweme_item=detail)
        await self.get_aweme_media(aweme_item=detail)
    await self.batch_get_note_comments(resolved_ids)
|
||
|
||
async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
    """Fetch the detail of one post while holding ``semaphore``.

    Args:
        aweme_id: ID passed to ``dy_client.get_video_by_id``.
        semaphore: Concurrency limiter shared by the caller's tasks.

    Returns:
        Whatever ``dy_client.get_video_by_id`` returns, or ``None`` when the
        call raises ``DataFetchError`` or ``KeyError`` (both are logged).
    """
    async with semaphore:
        try:
            detail = await self.dy_client.get_video_by_id(aweme_id)
            # Pause after each fetch; the semaphore stays held during the pause.
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
            utils.logger.info(f"[DouYinCrawler.get_aweme_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching aweme {aweme_id}")
        except DataFetchError as ex:
            utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
            return None
        except KeyError as ex:
            utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
            return None
        return detail
|
||
|
||
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
    """Fetch the comments of several posts concurrently.

    Does nothing (apart from a log line) when ``config.ENABLE_GET_COMMENTS``
    is off. Otherwise one task per ID runs ``get_comments``, limited by a
    semaphore of ``config.MAX_CONCURRENCY_NUM``. The method waits for all
    tasks and never raises because of a failing task; failures are logged.

    Args:
        aweme_list: IDs of the posts whose comments should be fetched.
    """
    if not config.ENABLE_GET_COMMENTS:
        utils.logger.info("[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
        return

    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list: List[Task] = [
        asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
        for aweme_id in aweme_list
    ]
    if not task_list:
        return

    await asyncio.wait(task_list)

    # asyncio.wait() does not re-raise task exceptions. get_comments only
    # handles DataFetchError itself, so anything else would vanish without a
    # trace; retrieve and log it, keeping the batch best-effort.
    for task in task_list:
        if task.cancelled():
            continue
        error = task.exception()
        if error is not None:
            utils.logger.error(f"[DouYinCrawler.batch_get_note_comments] aweme_id: {task.get_name()} get comments failed, error: {error!r}")
|
||
|
||
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
    """Fetch all comments of one post while holding ``semaphore``.

    The comments are handed to ``douyin_store.batch_update_dy_aweme_comments``
    through the client's ``callback`` argument. A ``DataFetchError`` is
    logged and swallowed; other exceptions propagate.

    Args:
        aweme_id: ID of the post whose comments are fetched.
        semaphore: Concurrency limiter shared by the caller's tasks.
    """
    async with semaphore:
        # A fixed interval from configuration is used both between the
        # client's requests and as the pause after the whole fetch.
        pause = config.CRAWLER_MAX_SLEEP_SEC
        fetch_options = {
            "aweme_id": aweme_id,
            "crawl_interval": pause,
            "is_fetch_sub_comments": config.ENABLE_GET_SUB_COMMENTS,
            "callback": douyin_store.batch_update_dy_aweme_comments,
            "max_count": config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
        }
        try:
            await self.dy_client.get_aweme_all_comments(**fetch_options)
            await asyncio.sleep(pause)
            utils.logger.info(f"[DouYinCrawler.get_comments] Sleeping for {pause} seconds after fetching comments for aweme {aweme_id}")
            utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
        except DataFetchError as e:
            utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
|
||
|
||
async def get_creators_and_videos(self) -> None:
    """Crawl every creator listed in ``config.DY_CREATOR_ID_LIST``.

    For each entry: parse the ``sec_user_id``, store the creator profile
    when one is returned, fetch all of the creator's posts (each batch is
    handed to ``fetch_creator_video_detail`` as a callback), then fetch the
    comments of those posts. Entries that fail to parse are logged and
    skipped.
    """
    utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
    utils.logger.info("[DouYinCrawler.get_creators_and_videos] Parsing creator URLs...")

    for creator_url in config.DY_CREATOR_ID_LIST:
        try:
            sec_user_id = parse_creator_info_from_url(creator_url).sec_user_id
            utils.logger.info(f"[DouYinCrawler.get_creators_and_videos] Parsed sec_user_id: {sec_user_id} from {creator_url}")
        except ValueError as e:
            utils.logger.error(f"[DouYinCrawler.get_creators_and_videos] Failed to parse creator URL: {e}")
            continue

        profile: Dict = await self.dy_client.get_user_info(sec_user_id)
        if profile:
            await douyin_store.save_creator(sec_user_id, creator=profile)

        # Fetch every post of the creator; details are stored by the callback.
        creator_posts = await self.dy_client.get_all_user_aweme_posts(
            sec_user_id=sec_user_id,
            callback=self.fetch_creator_video_detail,
        )

        post_ids = []
        for post in creator_posts:
            post_ids.append(post.get("aweme_id"))
        await self.batch_get_note_comments(post_ids)
|
||
|
||
async def fetch_creator_video_detail(self, video_list: List[Dict]):
    """Fetch, store and download media for a batch of posts concurrently.

    Passed as the ``callback`` of ``dy_client.get_all_user_aweme_posts`` by
    ``get_creators_and_videos``.

    Args:
        video_list: Post dicts; only their ``"aweme_id"`` entry is read.
    """
    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    detail_jobs = []
    for post in video_list:
        detail_jobs.append(self.get_aweme_detail(post.get("aweme_id"), limiter))

    for detail in await asyncio.gather(*detail_jobs):
        if detail is None:
            # get_aweme_detail returns None for posts it failed to fetch.
            continue
        await douyin_store.update_douyin_aweme(aweme_item=detail)
        await self.get_aweme_media(aweme_item=detail)
|
||
|
||
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient:
    """Build a ``DouYinClient`` that reuses the browser's cookies and user agent.

    Args:
        httpx_proxy: Proxy passed to the client as ``proxy``; may be ``None``.

    Returns:
        A client bound to ``self.context_page`` and ``self.ip_proxy_pool``.
    """
    browser_cookies = await self.browser_context.cookies()
    cookie_str, cookie_dict = utils.convert_cookies(browser_cookies)  # type: ignore
    # Ask the live page for its user agent so the client's requests match the browser.
    browser_user_agent = await self.context_page.evaluate("() => navigator.userAgent")
    request_headers = {
        "User-Agent": browser_user_agent,
        "Cookie": cookie_str,
        "Host": "www.douyin.com",
        "Origin": "https://www.douyin.com/",
        "Referer": "https://www.douyin.com/",
        "Content-Type": "application/json;charset=UTF-8",
    }
    return DouYinClient(
        proxy=httpx_proxy,
        headers=request_headers,
        playwright_page=self.context_page,
        cookie_dict=cookie_dict,
        proxy_ip_pool=self.ip_proxy_pool,  # Pass proxy pool for automatic refresh
    )
|
||
|
||
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """Launch Chromium in standard mode and return a browser context.

    When ``config.SAVE_LOGIN_STATE`` is set, a persistent context is created
    under ``<cwd>/browser_data/<config.USER_DATA_DIR % config.PLATFORM>``;
    otherwise a fresh browser and context are created.

    Args:
        chromium: Playwright browser type used for launching.
        playwright_proxy: Proxy settings forwarded to Playwright, or ``None``.
        user_agent: User agent for the context, or ``None``.
        headless: Whether to launch without a visible window.

    Returns:
        The created browser context with a 1920x1080 viewport.
    """
    window_size = {"width": 1920, "height": 1080}

    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(viewport=window_size, user_agent=user_agent)

    profile_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=profile_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport=window_size,
        user_agent=user_agent,
    )  # type: ignore
|
||
|
||
async def launch_browser_with_cdp(
    self,
    playwright: Playwright,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """Launch the browser in CDP mode, falling back to standard mode on failure.

    Args:
        playwright: Active Playwright instance.
        playwright_proxy: Proxy settings forwarded to the launcher, or ``None``.
        user_agent: User agent for the context, or ``None``.
        headless: Whether to launch without a visible window.

    Returns:
        The context from ``CDPBrowserManager.launch_and_connect``, or, if any
        step of the CDP startup raises, a context from ``launch_browser``.
    """
    try:
        self.cdp_manager = CDPBrowserManager()
        browser_context = await self.cdp_manager.launch_and_connect(
            playwright=playwright,
            playwright_proxy=playwright_proxy,
            user_agent=user_agent,
            headless=headless,
        )

        # Add anti-detection script
        await self.cdp_manager.add_stealth_script()

        # Show browser information
        browser_info = await self.cdp_manager.get_browser_info()
        utils.logger.info(f"[DouYinCrawler] CDP浏览器信息: {browser_info}")

        return browser_context
    except Exception as e:
        utils.logger.error(f"[DouYinCrawler] CDP模式启动失败,回退到标准模式: {e}")

    # Detach the failed manager. Leaving it on self would make close() take
    # the CDP branch and skip closing the fallback context created below.
    failed_manager, self.cdp_manager = self.cdp_manager, None
    if failed_manager is not None:
        try:
            # Best effort: release whatever the partial CDP startup left behind.
            await failed_manager.cleanup()
        except Exception as cleanup_error:
            utils.logger.error(f"[DouYinCrawler] CDP cleanup after failed launch raised: {cleanup_error}")

    # Fall back to standard mode, including the stealth init script that
    # start() installs for standard-mode contexts.
    fallback_context = await self.launch_browser(playwright.chromium, playwright_proxy, user_agent, headless)
    await fallback_context.add_init_script(path="libs/stealth.min.js")
    return fallback_context
|
||
|
||
async def close(self) -> None:
    """Release the browser resources held by this crawler.

    In CDP mode the manager's ``cleanup`` is awaited and the manager is
    dropped; otherwise the browser context is closed. Safe to call when
    ``start()`` failed before any browser context was created.
    """
    # CDP mode needs its own teardown through the manager.
    if self.cdp_manager:
        await self.cdp_manager.cleanup()
        self.cdp_manager = None
    else:
        # browser_context is only annotated on the class, so it does not
        # exist as an attribute until start() has launched a browser.
        browser_context = getattr(self, "browser_context", None)
        if browser_context is None:
            utils.logger.info("[DouYinCrawler.close] No browser context to close")
            return
        await browser_context.close()
    utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
|
||
|
||
async def get_aweme_media(self, aweme_item: Dict):
    """Download the media of one post, choosing images or video automatically.

    Does nothing (apart from a log line) when ``config.ENABLE_GET_MEIDAS``
    is off. If ``douyin_store._extract_note_image_list`` yields any image
    URLs the post is handled by ``get_aweme_images``; otherwise by
    ``get_aweme_video``.

    Args:
        aweme_item (Dict): Detail dict of the Douyin post.
    """
    if not config.ENABLE_GET_MEIDAS:
        utils.logger.info(f"[DouYinCrawler.get_aweme_media] Crawling image mode is not enabled")
        return
    # Image URLs of the post; an empty result routes the post to the video path.
    # The video URL is not extracted here: it was never used in this method
    # and get_aweme_video() extracts it itself.
    note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
    # TODO: audio is not extracted separately for the time being.
    if note_download_url:
        await self.get_aweme_images(aweme_item)
    else:
        await self.get_aweme_video(aweme_item)
|
||
|
||
async def get_aweme_images(self, aweme_item: Dict):
    """Download and store the images of one post. Prefer ``get_aweme_media``.

    Images are saved as ``000.jpeg``, ``001.jpeg``, ...; the counter only
    advances for images that were actually downloaded.

    Args:
        aweme_item (Dict): Detail dict of the Douyin post.
    """
    if not config.ENABLE_GET_MEIDAS:
        return
    aweme_id = aweme_item.get("aweme_id")
    image_urls: List[str] = douyin_store._extract_note_image_list(aweme_item)
    if not image_urls:
        return

    saved_count = 0
    for image_url in image_urls:
        if not image_url:
            continue
        image_bytes = await self.dy_client.get_aweme_media(image_url)
        # Random sub-second pause after every download attempt.
        await asyncio.sleep(random.random())
        if image_bytes is not None:
            file_name = f"{saved_count:>03d}.jpeg"
            saved_count += 1
            await douyin_store.update_dy_aweme_image(aweme_id, image_bytes, file_name)
|
||
|
||
async def get_aweme_video(self, aweme_item: Dict):
    """Download and store the video of one post. Prefer ``get_aweme_media``.

    The file is always stored under the name ``video.mp4``. Nothing is
    stored when media download is disabled, no URL can be extracted, or the
    download returns ``None``.

    Args:
        aweme_item (Dict): Detail dict of the Douyin post.
    """
    if not config.ENABLE_GET_MEIDAS:
        return
    aweme_id = aweme_item.get("aweme_id")

    media_url: str = douyin_store._extract_video_download_url(aweme_item)
    if media_url:
        media_bytes = await self.dy_client.get_aweme_media(media_url)
        # Random sub-second pause after the download attempt.
        await asyncio.sleep(random.random())
        if media_bytes is not None:
            await douyin_store.update_dy_aweme_video(aweme_id, media_bytes, "video.mp4")
|