finish_all_for_expand_bili

This commit is contained in:
Bowenwin
2025-05-22 22:26:30 +08:00
parent 59619fff0a
commit 66843f216a
12 changed files with 51 additions and 63 deletions

View File

@@ -10,16 +10,16 @@
# 基础配置
PLATFORM = "bili"
PLATFORM = "xhs"
KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔
LOGIN_TYPE = "phone" # qrcode or phone or cookie
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
SORT_TYPE = "popularity_descending"
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持抖音
PUBLISH_TIME_TYPE = 0
CRAWLER_TYPE = (
"creator" # 爬取类型search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
"search" # 爬取类型search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
)
# 自定义User Agent(暂时仅对XHS有效)
UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
@@ -54,9 +54,6 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
# 爬取开始页数 默认从第一页开始
START_PAGE = 1
# 爬取粉丝列表开始页数 默认从第一页开始
START_CONTACTS_PAGE = 1
# 爬取视频/帖子的数量控制
CRAWLER_MAX_NOTES_COUNT = 200
@@ -147,11 +144,7 @@ DY_CREATOR_ID_LIST = [
# 指定bili创作者ID列表(sec_id)
BILI_CREATOR_ID_LIST = [
# "20813884",
"520819684",
# "472747194",
# "519872016",
# "372201438",
"20813884",
# ........................
]
@@ -202,8 +195,15 @@ END_DAY = '2024-01-01'
# 若为 True,则按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下的所有视频
ALL_DAY = False
#!!! 下面仅支持 bilibili creator搜索
# 爬取creator主页及评论,还是爬取creator动态和关系列表(True为前者,False为后者)
CREATOR_MODE = True
# 爬取creator粉丝列表时起始爬取页数
START_CONTACTS_PAGE = 1
# 爬取作者粉丝和关注列表数量控制(单作者)
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
# 爬取作者动态粉丝和关注列表数量控制(单作者)
# 爬取作者动态数量控制(单作者)
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50

View File

@@ -12,16 +12,11 @@
import os
# mysql config
# RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")
# RELATION_DB_USER = os.getenv("RELATION_DB_USER", "root")
# RELATION_DB_HOST = os.getenv("RELATION_DB_HOST", "localhost")
# RELATION_DB_PORT = os.getenv("RELATION_DB_PORT", 3306)
# RELATION_DB_NAME = os.getenv("RELATION_DB_NAME", "media_crawler")
RELATION_DB_HOST = "47.94.233.47" # 替换为你的数据库域名/公网IP
RELATION_DB_PORT = 3306 # 替换为你的数据库端口通常3306
RELATION_DB_USER = "remote_user" # 替换为你的数据库用户名
RELATION_DB_PWD = "314159" # 替换为你的数据库密码
RELATION_DB_NAME = "Test" # 替换为你的数据库名称
# MySQL connection settings. Each value can be overridden by the environment
# variable of the same name; the literals are fallbacks for local development.
RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")
RELATION_DB_USER = os.getenv("RELATION_DB_USER", "root")
RELATION_DB_HOST = os.getenv("RELATION_DB_HOST", "localhost")
# os.getenv returns a str whenever the variable is set, so convert explicitly:
# the port is an int both for the default and for an overridden value.
# A non-numeric RELATION_DB_PORT raises ValueError at import time.
RELATION_DB_PORT = int(os.getenv("RELATION_DB_PORT", "3306"))
RELATION_DB_NAME = os.getenv("RELATION_DB_NAME", "media_crawler")
# redis config

View File

@@ -224,7 +224,7 @@ class BilibiliClient(AbstractApiClient):
async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
callback: Optional[Callable] = None,
max_count: int = 10, ):
max_count: int = 10,):
"""
get video all comments include sub comments
:param video_id:
@@ -251,7 +251,7 @@ class BilibiliClient(AbstractApiClient):
if (comment.get("rcount", 0) > 0):
{
await self.get_video_all_level_two_comments(
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
}
if len(result) + len(comment_list) > max_count:
comment_list = comment_list[:max_count - len(result)]
@@ -321,8 +321,7 @@ class BilibiliClient(AbstractApiClient):
result = await self.get(uri, post_data)
return result
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30,
order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
"""get all videos for a creator
:param creator_id: 创作者 ID
:param pn: 页数

View File

@@ -89,9 +89,11 @@ class BilibiliCrawler(AbstractCrawler):
# Get the information and comments of the specified post
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
elif config.CRAWLER_TYPE == "creator":
# for creator_id in config.BILI_CREATOR_ID_LIST:
# await self.get_creator_videos(int(creator_id))
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
if config.CREATOR_MODE:
for creator_id in config.BILI_CREATOR_ID_LIST:
await self.get_creator_videos(int(creator_id))
else:
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
else:
pass
utils.logger.info(
@@ -119,11 +121,9 @@ class BilibiliCrawler(AbstractCrawler):
start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
if start_day > end_day:
raise ValueError(
'Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
elif start_day == end_day: # 搜索同一天的内容
end_day = start_day + timedelta(days=1) - timedelta(
seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
end_day = start_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
else: # 搜索 start 至 end
end_day = end_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 end_day + 1 day - 1 second
# 将其重新转换为时间戳
@@ -166,11 +166,9 @@ class BilibiliCrawler(AbstractCrawler):
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = []
try:
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
for video_item in video_list]
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
except Exception as e:
utils.logger.warning(
f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
@@ -184,23 +182,21 @@ class BilibiliCrawler(AbstractCrawler):
else:
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
# 按照每一天进行爬取的时间戳参数
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'),
end=day.strftime('%Y-%m-%d'))
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
page = 1
# !该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频
# !除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天
# !除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!!
#!该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频
#!除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天
#!除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!!
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
# ! Catch any error if response return nothing, go to next day
#! Catch any error if response return nothing, go to next day
try:
# ! Don't skip any page, to make sure gather all video in one day
#! Don't skip any page, to make sure gather all video in one day
# if page < start_page:
# utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
# page += 1
# continue
utils.logger.info(
f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
@@ -213,9 +209,7 @@ class BilibiliCrawler(AbstractCrawler):
video_list: List[Dict] = videos_res.get("result")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for
video_item in video_list]
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:

View File

@@ -38,15 +38,13 @@ def calculate_number_of_files(file_store_path: str) -> int:
if not os.path.exists(file_store_path):
return 1
try:
return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
except ValueError:
return 1
class BiliCsvStoreImplement(AbstractStore):
csv_store_path: str = "data/bilibili"
file_count: int = calculate_number_of_files(csv_store_path)
file_count:int=calculate_number_of_files(csv_store_path)
def make_save_file_name(self, store_type: str) -> str:
"""
make save file name by store type
@@ -196,7 +194,7 @@ class BiliDbStoreImplement(AbstractStore):
creator["add_ts"] = utils.get_current_timestamp()
await add_new_creator(creator)
else:
await update_creator_by_creator_id(creator_id, creator_item=creator)
await update_creator_by_creator_id(creator_id,creator_item=creator)
async def store_contact(self, contact_item: Dict):
"""
@@ -249,10 +247,11 @@ class BiliJsonStoreImplement(AbstractStore):
json_store_path: str = "data/bilibili/json"
words_store_path: str = "data/bilibili/words"
lock = asyncio.Lock()
file_count: int = calculate_number_of_files(json_store_path)
file_count:int=calculate_number_of_files(json_store_path)
WordCloud = words.AsyncWordCloudGenerator()
def make_save_file_name(self, store_type: str) -> (str, str):
def make_save_file_name(self, store_type: str) -> (str,str):
"""
make save file name by store type
Args:
@@ -279,7 +278,7 @@ class BiliJsonStoreImplement(AbstractStore):
"""
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_data = []
async with self.lock:

View File

@@ -66,6 +66,7 @@ async def update_content_by_content_id(content_id: str, content_item: Dict) -> i
return effect_row
async def query_comment_by_comment_id(comment_id: str) -> Dict:
"""
查询一条评论内容

View File

@@ -238,7 +238,7 @@ class DouyinJsonStoreImplement(AbstractStore):
async def store_comment(self, comment_item: Dict):
"""
comment JSON storage implementatio
comment JSON storage implementation
Args:
comment_item:

View File

@@ -215,7 +215,7 @@ class KuaishouJsonStoreImplement(AbstractStore):
async def store_comment(self, comment_item: Dict):
"""
comment JSON storage implementatio
comment JSON storage implementation
Args:
comment_item:

View File

@@ -235,7 +235,7 @@ class TieBaJsonStoreImplement(AbstractStore):
async def store_comment(self, comment_item: Dict):
"""
comment JSON storage implementatio
comment JSON storage implementation
Args:
comment_item:

View File

@@ -241,7 +241,7 @@ class WeiboJsonStoreImplement(AbstractStore):
async def store_comment(self, comment_item: Dict):
"""
comment JSON storage implementatio
comment JSON storage implementation
Args:
comment_item:

View File

@@ -236,7 +236,7 @@ class XhsJsonStoreImplement(AbstractStore):
async def store_comment(self, comment_item: Dict):
"""
comment JSON storage implementatio
comment JSON storage implementation
Args:
comment_item:

View File

@@ -235,7 +235,7 @@ class ZhihuJsonStoreImplement(AbstractStore):
async def store_comment(self, comment_item: Dict):
"""
comment JSON storage implementatio
comment JSON storage implementation
Args:
comment_item: