feat(core): 新增控制爬虫起始页码的参数 start_page;perf(argparse): 向命令行解析器添加起始页码和关键字两个程序参数

This commit is contained in:
Tianci-King
2024-04-12 00:52:47 +08:00
parent bba9841c26
commit 1115b0d90c
9 changed files with 94 additions and 40 deletions

View File

@@ -37,10 +37,12 @@ class BilibiliCrawler(AbstractCrawler):
self.index_url = "https://www.bilibili.com"
self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str):
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None
@@ -96,10 +98,16 @@ class BilibiliCrawler(AbstractCrawler):
bili_limit_count =20 # bilibili limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
for keyword in config.KEYWORDS.split(","):
start_page = self.start_page # start page number
for keyword in self.keyword.split(","):
utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
page = 1
while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
page += 1
continue
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,