diff --git a/config/bilibili_config.py b/config/bilibili_config.py index 7882824..779ab75 100644 --- a/config/bilibili_config.py +++ b/config/bilibili_config.py @@ -13,16 +13,23 @@ # 每天爬取视频/帖子的数量控制 MAX_NOTES_PER_DAY = 1 -# 指定B站视频ID列表 +# 指定B站视频URL列表 (支持完整URL或BV号) +# 示例: +# - 完整URL: "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click" +# - BV号: "BV1d54y1g7db" BILI_SPECIFIED_ID_LIST = [ - "BV1d54y1g7db", + "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click", "BV1Sz4y1U77N", "BV14Q4y1n7jz", # ........................ ] -# 指定B站用户ID列表 +# 指定B站创作者URL列表 (支持完整URL或UID) +# 示例: +# - 完整URL: "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0" +# - UID: "20813884" BILI_CREATOR_ID_LIST = [ + "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0", "20813884", # ........................ ] diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 1c9c175..39af14a 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -41,6 +41,7 @@ from var import crawler_type_var, source_keyword_var from .client import BilibiliClient from .exception import DataFetchError from .field import SearchOrderType +from .help import parse_video_info_from_url, parse_creator_info_from_url from .login import BilibiliLogin @@ -103,8 +104,14 @@ class BilibiliCrawler(AbstractCrawler): await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST) elif config.CRAWLER_TYPE == "creator": if config.CREATOR_MODE: - for creator_id in config.BILI_CREATOR_ID_LIST: - await self.get_creator_videos(int(creator_id)) + for creator_url in config.BILI_CREATOR_ID_LIST: + try: + creator_info = parse_creator_info_from_url(creator_url) + utils.logger.info(f"[BilibiliCrawler.start] Parsed creator ID: {creator_info.creator_id} from {creator_url}") + await self.get_creator_videos(int(creator_info.creator_id)) + except ValueError as e: + 
utils.logger.error(f"[BilibiliCrawler.start] Failed to parse creator URL: {e}") + continue else: await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST) else: @@ -362,11 +369,23 @@ class BilibiliCrawler(AbstractCrawler): utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}") pn += 1 - async def get_specified_videos(self, bvids_list: List[str]): + async def get_specified_videos(self, video_url_list: List[str]): """ - get specified videos info + get specified videos info from URLs or BV IDs + :param video_url_list: List of video URLs or BV IDs :return: """ + utils.logger.info("[BilibiliCrawler.get_specified_videos] Parsing video URLs...") + bvids_list = [] + for video_url in video_url_list: + try: + video_info = parse_video_info_from_url(video_url) + bvids_list.append(video_info.video_id) + utils.logger.info(f"[BilibiliCrawler.get_specified_videos] Parsed video ID: {video_info.video_id} from {video_url}") + except ValueError as e: + utils.logger.error(f"[BilibiliCrawler.get_specified_videos] Failed to parse video URL: {e}") + continue + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in bvids_list] video_details = await asyncio.gather(*task_list) @@ -568,18 +587,30 @@ class BilibiliCrawler(AbstractCrawler): extension_file_name = f"video.mp4" await bilibili_store.store_video(aid, content, extension_file_name) - async def get_all_creator_details(self, creator_id_list: List[int]): + async def get_all_creator_details(self, creator_url_list: List[str]): """ - creator_id_list: get details for creator from creator_id_list + creator_url_list: get details for creator from creator URL list """ - utils.logger.info(f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator") - utils.logger.info(f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}") + 
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Crawling the details of creators") + utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsing creator URLs...") + + creator_id_list = [] + for creator_url in creator_url_list: + try: + creator_info = parse_creator_info_from_url(creator_url) + creator_id_list.append(int(creator_info.creator_id)) + utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsed creator ID: {creator_info.creator_id} from {creator_url}") + except ValueError as e: + utils.logger.error(f"[BilibiliCrawler.get_all_creator_details] Failed to parse creator URL: {e}") + continue + + utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] creator ids:{creator_id_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] try: for creator_id in creator_id_list: - task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id) + task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=str(creator_id)) task_list.append(task) except Exception as e: utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. 
# Bare BV id, optionally followed by a path/query/fragment tail that is ignored,
# e.g. "BV1d54y1g7db" or "BV1d54y1g7db/?p=2".
_BARE_BVID_RE = re.compile(r"(BV[a-zA-Z0-9]+)(?:[/?#].*)?")
# BV id inside a video page path, e.g. ".../video/BV1d54y1g7db/?spm_id_from=...".
_URL_BVID_RE = re.compile(r"/video/(BV[a-zA-Z0-9]+)")
# Bare UID made of ASCII digits only (str.isdigit() would also accept "²" etc.).
_BARE_UID_RE = re.compile(r"[0-9]+")
# UID inside a creator space URL, e.g. "https://space.bilibili.com/434377496?...".
_URL_UID_RE = re.compile(r"space\.bilibili\.com/([0-9]+)")


def _normalize_target(raw) -> str:
    """Return *raw* as a whitespace-stripped string ("" when *raw* is None).

    Config entries may be plain ints (the creator list used to be passed
    straight to ``int()``), so anything that is not None is coerced with
    ``str()`` instead of being rejected with an AttributeError.
    """
    if raw is None:
        return ""
    return str(raw).strip()


def parse_video_info_from_url(url: str) -> "VideoUrlInfo":
    """Extract the BV id from a Bilibili video URL or a bare BV id.

    Accepted inputs (surrounding whitespace is ignored):
        - https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click
        - https://www.bilibili.com/video/BV1d54y1g7db
        - BV1d54y1g7db (bare BV id; a trailing "/", "?..." or "#..." is dropped)

    Only BV ids are recognised; ``/video/av...`` links are not parsed.

    Args:
        url: Video URL or BV id.

    Returns:
        VideoUrlInfo whose ``video_id`` is the extracted BV id.

    Raises:
        ValueError: If no BV id can be found in *url* (including None/empty).
    """
    target = _normalize_target(url)

    # Bare BV id: keep only the id itself so a pasted query string or
    # trailing slash never leaks into the video id.
    bare = _BARE_BVID_RE.fullmatch(target)
    if bare:
        return VideoUrlInfo(video_id=bare.group(1))

    # Full URL: look for the id in the /video/<BV id> path segment.
    embedded = _URL_BVID_RE.search(target)
    if embedded:
        return VideoUrlInfo(video_id=embedded.group(1))

    raise ValueError(f"无法从URL中解析出视频ID: {url}")


def parse_creator_info_from_url(url: str) -> "CreatorUrlInfo":
    """Extract the creator UID from a Bilibili space URL or a bare UID.

    Accepted inputs (surrounding whitespace is ignored):
        - https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0
        - https://space.bilibili.com/20813884
        - 434377496 (bare UID, given as a string or an int)

    Args:
        url: Creator space URL or UID.

    Returns:
        CreatorUrlInfo whose ``creator_id`` is the UID as a string of
        ASCII digits.

    Raises:
        ValueError: If no UID can be found in *url* (including None/empty).
    """
    target = _normalize_target(url)

    # Bare UID: ASCII digits only, so the result is always safe for int().
    if _BARE_UID_RE.fullmatch(target):
        return CreatorUrlInfo(creator_id=target)

    # Full URL: the UID is the first path segment after the space host.
    embedded = _URL_UID_RE.search(target)
    if embedded:
        return CreatorUrlInfo(creator_id=embedded.group(1))

    raise ValueError(f"无法从URL中解析出创作者ID: {url}")


if __name__ == '__main__':
    # Manual smoke test: video URL parsing.
    video_url1 = "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
    video_url2 = "BV1d54y1g7db"
    print("视频URL解析测试:")
    print(f"URL1: {video_url1} -> {parse_video_info_from_url(video_url1)}")
    print(f"URL2: {video_url2} -> {parse_video_info_from_url(video_url2)}")

    # Manual smoke test: creator URL parsing.
    creator_url1 = "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
    creator_url2 = "20813884"
    print("\n创作者URL解析测试:")
    print(f"URL1: {creator_url1} -> {parse_creator_info_from_url(creator_url1)}")
    print(f"URL2: {creator_url2} -> {parse_creator_info_from_url(creator_url2)}")
class VideoUrlInfo(BaseModel):
    """Result of parsing a Bilibili video URL."""

    # Identifier of the video (a BV id, per the field title).
    video_id: str = Field(
        title="video id (BV id)",
    )
    # Kind of resource the URL points at; "video" unless set otherwise.
    video_type: str = Field(
        title="video type",
        default="video",
    )


class CreatorUrlInfo(BaseModel):
    """Result of parsing a Bilibili creator space URL."""

    # Identifier of the creator (UID), stored as a string.
    creator_id: str = Field(
        title="creator id (UID)",
    )