mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-04-20 02:37:38 +08:00
feat: bilibili support url link
This commit is contained in:
@@ -13,16 +13,23 @@
|
|||||||
# 每天爬取视频/帖子的数量控制
|
# 每天爬取视频/帖子的数量控制
|
||||||
MAX_NOTES_PER_DAY = 1
|
MAX_NOTES_PER_DAY = 1
|
||||||
|
|
||||||
# 指定B站视频ID列表
|
# 指定B站视频URL列表 (支持完整URL或BV号)
|
||||||
|
# 示例:
|
||||||
|
# - 完整URL: "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
|
||||||
|
# - BV号: "BV1d54y1g7db"
|
||||||
BILI_SPECIFIED_ID_LIST = [
|
BILI_SPECIFIED_ID_LIST = [
|
||||||
"BV1d54y1g7db",
|
"https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click",
|
||||||
"BV1Sz4y1U77N",
|
"BV1Sz4y1U77N",
|
||||||
"BV14Q4y1n7jz",
|
"BV14Q4y1n7jz",
|
||||||
# ........................
|
# ........................
|
||||||
]
|
]
|
||||||
|
|
||||||
# 指定B站用户ID列表
|
# 指定B站创作者URL列表 (支持完整URL或UID)
|
||||||
|
# 示例:
|
||||||
|
# - 完整URL: "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
|
||||||
|
# - UID: "20813884"
|
||||||
BILI_CREATOR_ID_LIST = [
|
BILI_CREATOR_ID_LIST = [
|
||||||
|
"https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0",
|
||||||
"20813884",
|
"20813884",
|
||||||
# ........................
|
# ........................
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ from var import crawler_type_var, source_keyword_var
|
|||||||
from .client import BilibiliClient
|
from .client import BilibiliClient
|
||||||
from .exception import DataFetchError
|
from .exception import DataFetchError
|
||||||
from .field import SearchOrderType
|
from .field import SearchOrderType
|
||||||
|
from .help import parse_video_info_from_url, parse_creator_info_from_url
|
||||||
from .login import BilibiliLogin
|
from .login import BilibiliLogin
|
||||||
|
|
||||||
|
|
||||||
@@ -103,8 +104,14 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
|
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
|
||||||
elif config.CRAWLER_TYPE == "creator":
|
elif config.CRAWLER_TYPE == "creator":
|
||||||
if config.CREATOR_MODE:
|
if config.CREATOR_MODE:
|
||||||
for creator_id in config.BILI_CREATOR_ID_LIST:
|
for creator_url in config.BILI_CREATOR_ID_LIST:
|
||||||
await self.get_creator_videos(int(creator_id))
|
try:
|
||||||
|
creator_info = parse_creator_info_from_url(creator_url)
|
||||||
|
utils.logger.info(f"[BilibiliCrawler.start] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
|
||||||
|
await self.get_creator_videos(int(creator_info.creator_id))
|
||||||
|
except ValueError as e:
|
||||||
|
utils.logger.error(f"[BilibiliCrawler.start] Failed to parse creator URL: {e}")
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
|
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
|
||||||
else:
|
else:
|
||||||
@@ -362,11 +369,23 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
|
utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
|
||||||
pn += 1
|
pn += 1
|
||||||
|
|
||||||
async def get_specified_videos(self, bvids_list: List[str]):
|
async def get_specified_videos(self, video_url_list: List[str]):
|
||||||
"""
|
"""
|
||||||
get specified videos info
|
get specified videos info from URLs or BV IDs
|
||||||
|
:param video_url_list: List of video URLs or BV IDs
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
utils.logger.info("[BilibiliCrawler.get_specified_videos] Parsing video URLs...")
|
||||||
|
bvids_list = []
|
||||||
|
for video_url in video_url_list:
|
||||||
|
try:
|
||||||
|
video_info = parse_video_info_from_url(video_url)
|
||||||
|
bvids_list.append(video_info.video_id)
|
||||||
|
utils.logger.info(f"[BilibiliCrawler.get_specified_videos] Parsed video ID: {video_info.video_id} from {video_url}")
|
||||||
|
except ValueError as e:
|
||||||
|
utils.logger.error(f"[BilibiliCrawler.get_specified_videos] Failed to parse video URL: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list = [self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in bvids_list]
|
task_list = [self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in bvids_list]
|
||||||
video_details = await asyncio.gather(*task_list)
|
video_details = await asyncio.gather(*task_list)
|
||||||
@@ -568,18 +587,30 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
extension_file_name = f"video.mp4"
|
extension_file_name = f"video.mp4"
|
||||||
await bilibili_store.store_video(aid, content, extension_file_name)
|
await bilibili_store.store_video(aid, content, extension_file_name)
|
||||||
|
|
||||||
async def get_all_creator_details(self, creator_id_list: List[int]):
|
async def get_all_creator_details(self, creator_url_list: List[str]):
|
||||||
"""
|
"""
|
||||||
creator_id_list: get details for creator from creator_id_list
|
creator_url_list: get details for creator from creator URL list
|
||||||
"""
|
"""
|
||||||
utils.logger.info(f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
|
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Crawling the details of creators")
|
||||||
utils.logger.info(f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")
|
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsing creator URLs...")
|
||||||
|
|
||||||
|
creator_id_list = []
|
||||||
|
for creator_url in creator_url_list:
|
||||||
|
try:
|
||||||
|
creator_info = parse_creator_info_from_url(creator_url)
|
||||||
|
creator_id_list.append(int(creator_info.creator_id))
|
||||||
|
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
|
||||||
|
except ValueError as e:
|
||||||
|
utils.logger.error(f"[BilibiliCrawler.get_all_creator_details] Failed to parse creator URL: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] creator ids:{creator_id_list}")
|
||||||
|
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list: List[Task] = []
|
task_list: List[Task] = []
|
||||||
try:
|
try:
|
||||||
for creator_id in creator_id_list:
|
for creator_id in creator_id_list:
|
||||||
task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id)
|
task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=str(creator_id))
|
||||||
task_list.append(task)
|
task_list.append(task)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
|
utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
|
||||||
|
|||||||
@@ -9,15 +9,17 @@
|
|||||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# @Author : relakkes@gmail.com
|
# @Author : relakkes@gmail.com
|
||||||
# @Time : 2023/12/2 23:26
|
# @Time : 2023/12/2 23:26
|
||||||
# @Desc : bilibili 请求参数签名
|
# @Desc : bilibili 请求参数签名
|
||||||
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
|
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
|
||||||
|
import re
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from hashlib import md5
|
from hashlib import md5
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
|
from model.m_bilibili import VideoUrlInfo, CreatorUrlInfo
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
|
||||||
|
|
||||||
@@ -66,16 +68,71 @@ class BilibiliSign:
|
|||||||
return req_data
|
return req_data
|
||||||
|
|
||||||
|
|
||||||
|
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
|
||||||
|
"""
|
||||||
|
从B站视频URL中解析出视频ID
|
||||||
|
Args:
|
||||||
|
url: B站视频链接
|
||||||
|
- https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click
|
||||||
|
- https://www.bilibili.com/video/BV1d54y1g7db
|
||||||
|
- BV1d54y1g7db (直接传入BV号)
|
||||||
|
Returns:
|
||||||
|
VideoUrlInfo: 包含视频ID的对象
|
||||||
|
"""
|
||||||
|
# 如果传入的已经是BV号,直接返回
|
||||||
|
if url.startswith("BV"):
|
||||||
|
return VideoUrlInfo(video_id=url)
|
||||||
|
|
||||||
|
# 使用正则表达式提取BV号
|
||||||
|
# 匹配 /video/BV... 或 /video/av... 格式
|
||||||
|
bv_pattern = r'/video/(BV[a-zA-Z0-9]+)'
|
||||||
|
match = re.search(bv_pattern, url)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
video_id = match.group(1)
|
||||||
|
return VideoUrlInfo(video_id=video_id)
|
||||||
|
|
||||||
|
raise ValueError(f"无法从URL中解析出视频ID: {url}")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
|
||||||
|
"""
|
||||||
|
从B站创作者空间URL中解析出创作者ID
|
||||||
|
Args:
|
||||||
|
url: B站创作者空间链接
|
||||||
|
- https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0
|
||||||
|
- https://space.bilibili.com/20813884
|
||||||
|
- 434377496 (直接传入UID)
|
||||||
|
Returns:
|
||||||
|
CreatorUrlInfo: 包含创作者ID的对象
|
||||||
|
"""
|
||||||
|
# 如果传入的已经是纯数字ID,直接返回
|
||||||
|
if url.isdigit():
|
||||||
|
return CreatorUrlInfo(creator_id=url)
|
||||||
|
|
||||||
|
# 使用正则表达式提取UID
|
||||||
|
# 匹配 /space.bilibili.com/数字 格式
|
||||||
|
uid_pattern = r'space\.bilibili\.com/(\d+)'
|
||||||
|
match = re.search(uid_pattern, url)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
creator_id = match.group(1)
|
||||||
|
return CreatorUrlInfo(creator_id=creator_id)
|
||||||
|
|
||||||
|
raise ValueError(f"无法从URL中解析出创作者ID: {url}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
_img_key = "7cd084941338484aae1ad9425b84077c"
|
# 测试视频URL解析
|
||||||
_sub_key = "4932caff0ff746eab6f01bf08b70ac45"
|
video_url1 = "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
|
||||||
_search_url = "__refresh__=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset=0&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword=python&order=click&page=1&page_size=20&platform=pc&qv_id=OQ8f2qtgYdBV1UoEnqXUNUl8LEDAdzsD&search_type=video&single_column=0&source_tag=3&web_location=1430654"
|
video_url2 = "BV1d54y1g7db"
|
||||||
_req_data = dict()
|
print("视频URL解析测试:")
|
||||||
for params in _search_url.split("&"):
|
print(f"URL1: {video_url1} -> {parse_video_info_from_url(video_url1)}")
|
||||||
kvalues = params.split("=")
|
print(f"URL2: {video_url2} -> {parse_video_info_from_url(video_url2)}")
|
||||||
key = kvalues[0]
|
|
||||||
value = kvalues[1]
|
# 测试创作者URL解析
|
||||||
_req_data[key] = value
|
creator_url1 = "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
|
||||||
print("pre req_data", _req_data)
|
creator_url2 = "20813884"
|
||||||
_req_data = BilibiliSign(img_key=_img_key, sub_key=_sub_key).sign(req_data={"aid":170001})
|
print("\n创作者URL解析测试:")
|
||||||
print(_req_data)
|
print(f"URL1: {creator_url1} -> {parse_creator_info_from_url(creator_url1)}")
|
||||||
|
print(f"URL2: {creator_url2} -> {parse_creator_info_from_url(creator_url2)}")
|
||||||
|
|||||||
25
model/m_bilibili.py
Normal file
25
model/m_bilibili.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||||
|
# 1. 不得用于任何商业用途。
|
||||||
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||||
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||||
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||||
|
# 5. 不得用于任何非法或不当的用途。
|
||||||
|
#
|
||||||
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||||
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||||
|
|
||||||
|
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class VideoUrlInfo(BaseModel):
|
||||||
|
"""Bilibili video URL information."""
|
||||||
|
video_id: str = Field(title="video id (BV id)")
|
||||||
|
video_type: str = Field(default="video", title="video type")
|
||||||
|
|
||||||
|
|
||||||
|
class CreatorUrlInfo(BaseModel):
|
||||||
|
"""Bilibili creator URL information."""
|
||||||
|
creator_id: str = Field(title="creator id (UID)")
|
||||||
Reference in New Issue
Block a user