refactor: config update

This commit is contained in:
程序员阿江(Relakkes)
2025-07-18 23:26:52 +08:00
parent 122978b35c
commit 13b00f7a36
17 changed files with 964 additions and 485 deletions

View File

@@ -1,34 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from config import *
# Cap on the number of videos/posts crawled per day.
# The crawler's time-range search stops a day's paging once this many notes
# have been counted for that day.
MAX_NOTES_PER_DAY = 1
# Bilibili platform configuration
# Video ids in BV form.
# NOTE(review): presumably the videos fetched when crawling specific videos - confirm against the crawler.
BILI_SPECIFIED_ID_LIST = [
"BV1d54y1g7db",
"BV1Sz4y1U77N",
"BV14Q4y1n7jz",
# ........................
]
# First and last day of the search window, written as YYYY-MM-DD.
# The crawler parses both with "%Y-%m-%d" and walks the window one day at a
# time; both end days are included.
START_DAY = "2024-01-01"
END_DAY = "2024-01-01"
# Search mode selector; the crawler logs a warning for values it does not know.
# NOTE(review): the accepted values other than "normal" are not visible here - confirm.
BILI_SEARCH_MODE = "normal"
# NOTE(review): the consumers of the next four settings are not visible here;
# names suggest creator-crawl switches and per-creator caps - confirm.
CREATOR_MODE = True
START_CONTACTS_PAGE = 1
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50
# Creator ids (as strings) handed to the crawler's get_all_creator_details.
BILI_CREATOR_ID_LIST = [
"20813884",
# ........................
]

View File

@@ -1,12 +1,12 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
@@ -22,10 +22,16 @@ from typing import Dict, List, Optional, Tuple, Union
from datetime import datetime, timedelta
import pandas as pd
from playwright.async_api import (BrowserContext, BrowserType, Page, Playwright, async_playwright)
from playwright.async_api import (
BrowserContext,
BrowserType,
Page,
Playwright,
async_playwright,
)
from playwright._impl._errors import TargetClosedError
from . import config
import config
from base.base_crawler import AbstractCrawler
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import bilibili as bilibili_store
@@ -53,28 +59,30 @@ class BilibiliCrawler(AbstractCrawler):
async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
ip_proxy_pool = await create_ip_pool(
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
ip_proxy_info)
ip_proxy_info
)
async with async_playwright() as playwright:
# 根据配置选择启动模式
if config.ENABLE_CDP_MODE:
utils.logger.info("[BilibiliCrawler] 使用CDP模式启动浏览器")
self.browser_context = await self.launch_browser_with_cdp(
playwright, playwright_proxy_format, self.user_agent,
headless=config.CDP_HEADLESS
playwright,
playwright_proxy_format,
self.user_agent,
headless=config.CDP_HEADLESS,
)
else:
utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器")
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(
chromium,
None,
self.user_agent,
headless=config.HEADLESS
chromium, None, self.user_agent, headless=config.HEADLESS
)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
@@ -89,10 +97,12 @@ class BilibiliCrawler(AbstractCrawler):
login_phone="", # your phone number
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES
cookie_str=config.COOKIES,
)
await login_obj.begin()
await self.bili_client.update_cookies(browser_context=self.browser_context)
await self.bili_client.update_cookies(
browser_context=self.browser_context
)
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
@@ -108,8 +118,7 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
else:
pass
utils.logger.info(
"[BilibiliCrawler.start] Bilibili Crawler finished ...")
utils.logger.info("[BilibiliCrawler.start] Bilibili Crawler finished ...")
async def search(self):
"""
@@ -126,13 +135,15 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.warning(f"Unknown BILI_SEARCH_MODE: {config.BILI_SEARCH_MODE}")
@staticmethod
async def get_pubtime_datetime(start: str = config.START_DAY, end: str = config.END_DAY) -> Tuple[str, str]:
async def get_pubtime_datetime(
start: str = config.START_DAY, end: str = config.END_DAY
) -> Tuple[str, str]:
"""
获取 bilibili 作品发布日期起始时间戳 pubtime_begin_s 与发布日期结束时间戳 pubtime_end_s
---
:param start: 发布日期起始时间YYYY-MM-DD
:param end: 发布日期结束时间YYYY-MM-DD
Note
---
- 搜索的时间范围为 start 至 end包含 start 和 end
@@ -144,14 +155,20 @@ class BilibiliCrawler(AbstractCrawler):
转换为可读的 datetime 对象pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0)pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
"""
# 转换 start 与 end 为 datetime 对象
start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
start_day: datetime = datetime.strptime(start, "%Y-%m-%d")
end_day: datetime = datetime.strptime(end, "%Y-%m-%d")
if start_day > end_day:
raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
raise ValueError(
"Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end"
)
elif start_day == end_day: # 搜索同一天的内容
end_day = start_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
end_day = (
start_day + timedelta(days=1) - timedelta(seconds=1)
) # 则将 end_day 设置为 start_day + 1 day - 1 second
else: # 搜索 start 至 end
end_day = end_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 end_day + 1 day - 1 second
end_day = (
end_day + timedelta(days=1) - timedelta(seconds=1)
) # 则将 end_day 设置为 end_day + 1 day - 1 second
# 将其重新转换为时间戳
return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
@@ -160,22 +177,32 @@ class BilibiliCrawler(AbstractCrawler):
search bilibili video with keywords in normal mode
:return:
"""
utils.logger.info("[BilibiliCrawler.search_by_keywords] Begin search bilibli keywords")
utils.logger.info(
"[BilibiliCrawler.search_by_keywords] Begin search bilibli keywords"
)
bili_limit_count = 20 # bilibili limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
start_page = config.START_PAGE # start page number
for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword)
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Current search keyword: {keyword}")
utils.logger.info(
f"[BilibiliCrawler.search_by_keywords] Current search keyword: {keyword}"
)
page = 1
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
while (
page - start_page + 1
) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Skip page: {page}")
utils.logger.info(
f"[BilibiliCrawler.search_by_keywords] Skip page: {page}"
)
page += 1
continue
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] search bilibili keyword: {keyword}, page: {page}")
utils.logger.info(
f"[BilibiliCrawler.search_by_keywords] search bilibili keyword: {keyword}, page: {page}"
)
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
@@ -183,20 +210,29 @@ class BilibiliCrawler(AbstractCrawler):
page_size=bili_limit_count,
order=SearchOrderType.DEFAULT,
pubtime_begin_s=0, # 作品发布日期起始时间戳
pubtime_end_s=0 # 作品发布日期结束日期时间戳
pubtime_end_s=0, # 作品发布日期结束日期时间戳
)
video_list: List[Dict] = videos_res.get("result")
if not video_list:
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] No more videos for '{keyword}', moving to next keyword.")
utils.logger.info(
f"[BilibiliCrawler.search_by_keywords] No more videos for '{keyword}', moving to next keyword."
)
break
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = []
try:
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
task_list = [
self.get_video_info_task(
aid=video_item.get("aid"), bvid="", semaphore=semaphore
)
for video_item in video_list
]
except Exception as e:
utils.logger.warning(f"[BilibiliCrawler.search_by_keywords] error in the task list. The video for this page will not be included. {e}")
utils.logger.warning(
f"[BilibiliCrawler.search_by_keywords] error in the task list. The video for this page will not be included. {e}"
)
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
@@ -212,40 +248,74 @@ class BilibiliCrawler(AbstractCrawler):
Search bilibili video with keywords in a given time range.
:param daily_limit: if True, strictly limit the number of notes per day and total.
"""
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Begin search with daily_limit={daily_limit}")
utils.logger.info(
f"[BilibiliCrawler.search_by_keywords_in_time_range] Begin search with daily_limit={daily_limit}"
)
bili_limit_count = 20
start_page = config.START_PAGE
for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword)
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
utils.logger.info(
f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}"
)
total_notes_crawled_for_keyword = 0
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
break
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
for day in pd.date_range(
start=config.START_DAY, end=config.END_DAY, freq="D"
):
if (
daily_limit
and total_notes_crawled_for_keyword
>= config.CRAWLER_MAX_NOTES_COUNT
):
utils.logger.info(
f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days."
)
break
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
if (
not daily_limit
and total_notes_crawled_for_keyword
>= config.CRAWLER_MAX_NOTES_COUNT
):
utils.logger.info(
f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days."
)
break
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(
start=day.strftime("%Y-%m-%d"), end=day.strftime("%Y-%m-%d")
)
page = 1
notes_count_this_day = 0
while True:
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
utils.logger.info(
f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}."
)
break
if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
if (
daily_limit
and total_notes_crawled_for_keyword
>= config.CRAWLER_MAX_NOTES_COUNT
):
utils.logger.info(
f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'."
)
break
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
if (
not daily_limit
and total_notes_crawled_for_keyword
>= config.CRAWLER_MAX_NOTES_COUNT
):
break
try:
utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
utils.logger.info(
f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}"
)
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
@@ -253,23 +323,38 @@ class BilibiliCrawler(AbstractCrawler):
page_size=bili_limit_count,
order=SearchOrderType.DEFAULT,
pubtime_begin_s=pubtime_begin_s,
pubtime_end_s=pubtime_end_s
pubtime_end_s=pubtime_end_s,
)
video_list: List[Dict] = videos_res.get("result")
if not video_list:
utils.logger.info(f"[BilibiliCrawler.search] No more videos for '{keyword}' on {day.ctime()}, moving to next day.")
utils.logger.info(
f"[BilibiliCrawler.search] No more videos for '{keyword}' on {day.ctime()}, moving to next day."
)
break
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
task_list = [
self.get_video_info_task(
aid=video_item.get("aid"), bvid="", semaphore=semaphore
)
for video_item in video_list
]
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
if daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
if (
daily_limit
and total_notes_crawled_for_keyword
>= config.CRAWLER_MAX_NOTES_COUNT
):
break
if not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
if (
not daily_limit
and total_notes_crawled_for_keyword
>= config.CRAWLER_MAX_NOTES_COUNT
):
break
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
break
@@ -284,7 +369,9 @@ class BilibiliCrawler(AbstractCrawler):
await self.batch_get_video_comments(video_id_list)
except Exception as e:
utils.logger.error(f"[BilibiliCrawler.search] Error searching on {day.ctime()}: {e}")
utils.logger.error(
f"[BilibiliCrawler.search] Error searching on {day.ctime()}: {e}"
)
break
async def batch_get_video_comments(self, video_id_list: List[str]):
@@ -295,16 +382,19 @@ class BilibiliCrawler(AbstractCrawler):
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(
f"[BilibiliCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
f"[BilibiliCrawler.batch_get_note_comments] Crawling comment mode is not enabled"
)
return
utils.logger.info(
f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}"
)
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
for video_id in video_id_list:
task = asyncio.create_task(self.get_comments(
video_id, semaphore), name=video_id)
task = asyncio.create_task(
self.get_comments(video_id, semaphore), name=video_id
)
task_list.append(task)
await asyncio.gather(*task_list)
@@ -318,7 +408,8 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(
f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ..."
)
await asyncio.sleep(random.uniform(0.5, 1.5))
await self.bili_client.get_video_all_comments(
video_id=video_id,
@@ -330,10 +421,12 @@ class BilibiliCrawler(AbstractCrawler):
except DataFetchError as ex:
utils.logger.error(
f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}"
)
except Exception as e:
utils.logger.error(
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}"
)
# Propagate the exception to be caught by the main loop
raise
@@ -360,8 +453,8 @@ class BilibiliCrawler(AbstractCrawler):
"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
bvids_list
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore)
for video_id in bvids_list
]
video_details = await asyncio.gather(*task_list)
video_aids_list = []
@@ -376,7 +469,9 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_bilibili_video(video_detail, semaphore)
await self.batch_get_video_comments(video_aids_list)
async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
async def get_video_info_task(
self, aid: int, bvid: str, semaphore: asyncio.Semaphore
) -> Optional[Dict]:
"""
Get video detail task
:param aid:
@@ -390,43 +485,54 @@ class BilibiliCrawler(AbstractCrawler):
return result
except DataFetchError as ex:
utils.logger.error(
f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}"
)
return None
except KeyError as ex:
utils.logger.error(
f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}"
)
return None
async def get_video_play_url_task(
    self, aid: int, cid: int, semaphore: asyncio.Semaphore
) -> Union[Dict, None]:
    """
    Request the play-url payload of one video while holding ``semaphore``.

    :param aid: aid of the video, forwarded to the client as ``aid``
    :param cid: cid of the video, forwarded to the client as ``cid``
    :param semaphore: semaphore held for the duration of the client call
    :return: the value returned by ``bili_client.get_video_play_url``, or
        None when that call raises DataFetchError or KeyError (the error is
        logged); any other exception propagates to the caller
    """
    async with semaphore:
        try:
            return await self.bili_client.get_video_play_url(aid=aid, cid=cid)
        except DataFetchError as fetch_err:
            utils.logger.error(
                f"[BilibiliCrawler.get_video_play_url_task] Get video play url error: {fetch_err}"
            )
        except KeyError as key_err:
            utils.logger.error(
                f"[BilibiliCrawler.get_video_play_url_task] have not fund play url from :{aid}|{cid}, err: {key_err}"
            )
    # Only reached after one of the handled errors above was logged.
    return None
async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
async def create_bilibili_client(
self, httpx_proxy: Optional[str]
) -> BilibiliClient:
"""
create bilibili client
:param httpx_proxy: httpx proxy
:return: bilibili client
"""
utils.logger.info(
"[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
"[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ..."
)
cookie_str, cookie_dict = utils.convert_cookies(
await self.browser_context.cookies()
)
bilibili_client_obj = BilibiliClient(
proxies=httpx_proxy,
headers={
@@ -434,7 +540,7 @@ class BilibiliCrawler(AbstractCrawler):
"Cookie": cookie_str,
"Origin": "https://www.bilibili.com",
"Referer": "https://www.bilibili.com",
"Content-Type": "application/json;charset=UTF-8"
"Content-Type": "application/json;charset=UTF-8",
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
@@ -442,7 +548,9 @@ class BilibiliCrawler(AbstractCrawler):
return bilibili_client_obj
@staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
def format_proxy_info(
ip_proxy_info: IpInfoModel,
) -> Tuple[Optional[Dict], Optional[Dict]]:
"""
format proxy info for playwright and httpx
:param ip_proxy_info: ip proxy info
@@ -459,13 +567,13 @@ class BilibiliCrawler(AbstractCrawler):
return playwright_proxy, httpx_proxy
async def launch_browser(
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""
"""
launch browser and create browser context
:param chromium: chromium browser
:param playwright_proxy: playwright proxy
@@ -474,32 +582,38 @@ class BilibiliCrawler(AbstractCrawler):
:return: browser context
"""
utils.logger.info(
"[BilibiliCrawler.launch_browser] Begin create browser context ...")
"[BilibiliCrawler.launch_browser] Begin create browser context ..."
)
if config.SAVE_LOGIN_STATE:
# feat issue #14
# we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
user_data_dir = os.path.join(
os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,
headless=headless,
proxy=playwright_proxy, # type: ignore
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
user_agent=user_agent,
)
return browser_context
else:
# type: ignore
browser = await chromium.launch(headless=headless, proxy=playwright_proxy)
browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
)
return browser_context
async def launch_browser_with_cdp(self, playwright: Playwright, playwright_proxy: Optional[Dict],
user_agent: Optional[str], headless: bool = True) -> BrowserContext:
async def launch_browser_with_cdp(
self,
playwright: Playwright,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""
使用CDP模式启动浏览器
"""
@@ -509,7 +623,7 @@ class BilibiliCrawler(AbstractCrawler):
playwright=playwright,
playwright_proxy=playwright_proxy,
user_agent=user_agent,
headless=headless
headless=headless,
)
# 显示浏览器信息
@@ -519,10 +633,14 @@ class BilibiliCrawler(AbstractCrawler):
return browser_context
except Exception as e:
utils.logger.error(f"[BilibiliCrawler] CDP模式启动失败回退到标准模式: {e}")
utils.logger.error(
f"[BilibiliCrawler] CDP模式启动失败回退到标准模式: {e}"
)
# 回退到标准模式
chromium = playwright.chromium
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
return await self.launch_browser(
chromium, playwright_proxy, user_agent, headless
)
async def close(self):
"""Close browser context"""
@@ -535,9 +653,13 @@ class BilibiliCrawler(AbstractCrawler):
await self.browser_context.close()
utils.logger.info("[BilibiliCrawler.close] Browser context closed ...")
except TargetClosedError:
utils.logger.warning("[BilibiliCrawler.close] Browser context was already closed.")
utils.logger.warning(
"[BilibiliCrawler.close] Browser context was already closed."
)
except Exception as e:
utils.logger.error(f"[BilibiliCrawler.close] An error occurred during close: {e}")
utils.logger.error(
f"[BilibiliCrawler.close] An error occurred during close: {e}"
)
async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
"""
@@ -547,14 +669,18 @@ class BilibiliCrawler(AbstractCrawler):
:return:
"""
if not config.ENABLE_GET_IMAGES:
utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Crawling image mode is not enabled")
utils.logger.info(
f"[BilibiliCrawler.get_bilibili_video] Crawling image mode is not enabled"
)
return
video_item_view: Dict = video_item.get("View")
aid = video_item_view.get("aid")
cid = video_item_view.get("cid")
result = await self.get_video_play_url_task(aid, cid, semaphore)
if result is None:
utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video play url failed")
utils.logger.info(
"[BilibiliCrawler.get_bilibili_video] get video play url failed"
)
return
durl_list = result.get("durl")
max_size = -1
@@ -565,7 +691,9 @@ class BilibiliCrawler(AbstractCrawler):
max_size = size
video_url = durl.get("url")
if video_url == "":
utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video url failed")
utils.logger.info(
"[BilibiliCrawler.get_bilibili_video] get video url failed"
)
return
content = await self.bili_client.get_video_media(video_url)
@@ -579,20 +707,24 @@ class BilibiliCrawler(AbstractCrawler):
creator_id_list: get details for creator from creator_id_list
"""
utils.logger.info(
f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator"
)
utils.logger.info(
f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")
f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}"
)
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
try:
for creator_id in creator_id_list:
task = asyncio.create_task(self.get_creator_details(
creator_id, semaphore), name=creator_id)
task = asyncio.create_task(
self.get_creator_details(creator_id, semaphore), name=creator_id
)
task_list.append(task)
except Exception as e:
utils.logger.warning(
f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}"
)
await asyncio.gather(*task_list)
@@ -604,7 +736,9 @@ class BilibiliCrawler(AbstractCrawler):
:return:
"""
async with semaphore:
creator_unhandled_info: Dict = await self.bili_client.get_creator_info(creator_id)
creator_unhandled_info: Dict = await self.bili_client.get_creator_info(
creator_id
)
creator_info: Dict = {
"id": creator_id,
"name": creator_unhandled_info.get("name"),
@@ -626,7 +760,8 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(
f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ..."
)
await self.bili_client.get_creator_all_fans(
creator_info=creator_info,
crawl_interval=random.random(),
@@ -636,10 +771,12 @@ class BilibiliCrawler(AbstractCrawler):
except DataFetchError as ex:
utils.logger.error(
f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {ex}")
f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {ex}"
)
except Exception as e:
utils.logger.error(
f"[BilibiliCrawler.get_fans] may be been blocked, err:{e}")
f"[BilibiliCrawler.get_fans] may be been blocked, err:{e}"
)
async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore):
"""
@@ -652,7 +789,8 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(
f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ..."
)
await self.bili_client.get_creator_all_followings(
creator_info=creator_info,
crawl_interval=random.random(),
@@ -662,10 +800,12 @@ class BilibiliCrawler(AbstractCrawler):
except DataFetchError as ex:
utils.logger.error(
f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {ex}")
f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {ex}"
)
except Exception as e:
utils.logger.error(
f"[BilibiliCrawler.get_followings] may be been blocked, err:{e}")
f"[BilibiliCrawler.get_followings] may be been blocked, err:{e}"
)
async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
"""
@@ -678,7 +818,8 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(
f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ..."
)
await self.bili_client.get_creator_all_dynamics(
creator_info=creator_info,
crawl_interval=random.random(),
@@ -688,7 +829,9 @@ class BilibiliCrawler(AbstractCrawler):
except DataFetchError as ex:
utils.logger.error(
f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {ex}")
f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {ex}"
)
except Exception as e:
utils.logger.error(
f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{e}")
f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{e}"
)