mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-09 03:17:25 +08:00
refactor: 规范日志打印
feat: B站指定视频ID爬取(bvid)
This commit is contained in:
@@ -83,14 +83,14 @@ class XHSClient:
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""get a note to check if login state is ok"""
|
||||
utils.logger.info("Begin to pong xhs...")
|
||||
utils.logger.info("[XHSClient.pong] Begin to pong xhs...")
|
||||
ping_flag = False
|
||||
try:
|
||||
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
|
||||
if note_card.get("items"):
|
||||
ping_flag = True
|
||||
except Exception as e:
|
||||
utils.logger.error(f"Ping xhs failed: {e}, and try to login again...")
|
||||
utils.logger.error(f"[XHSClient.pong] Ping xhs failed: {e}, and try to login again...")
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
@@ -136,7 +136,7 @@ class XHSClient:
|
||||
if res and res.get("items"):
|
||||
res_dict: Dict = res["items"][0]["note_card"]
|
||||
return res_dict
|
||||
utils.logger.error(f"[xhs.client.get_note_by_id] get note empty and res:{res}")
|
||||
utils.logger.error(f"[XHSClient.get_note_by_id] get note empty and res:{res}")
|
||||
return dict()
|
||||
|
||||
async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
|
||||
@@ -195,7 +195,7 @@ class XHSClient:
|
||||
# Handle the absence of 'comments' key appropriately
|
||||
# For example, log an error message, break from the loop, etc.
|
||||
# This is just an example:
|
||||
print(f"No 'comments' key found in response: {comments_res}")
|
||||
utils.logger.info(f"[XHSClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
|
||||
break
|
||||
comments = comments_res["comments"]
|
||||
if not is_fetch_sub_comments:
|
||||
|
||||
@@ -87,14 +87,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
else:
|
||||
pass
|
||||
|
||||
utils.logger.info("Xhs Crawler finished ...")
|
||||
utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
|
||||
"""Search for notes and retrieve their comment information."""
|
||||
utils.logger.info("Begin search xiaohongshu keywords")
|
||||
utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
|
||||
xhs_limit_count = 20 # xhs limit page fixed value
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
utils.logger.info(f"Current search keyword: {keyword}")
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
|
||||
page = 1
|
||||
while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
note_id_list: List[str] = []
|
||||
@@ -102,7 +102,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
keyword=keyword,
|
||||
page=page,
|
||||
)
|
||||
utils.logger.info(f"Search notes res:{notes_res}")
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_note_detail(post_item.get("id"), semaphore)
|
||||
@@ -115,7 +115,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
await xhs_model.update_xhs_note(note_detail)
|
||||
note_id_list.append(note_detail.get("note_id"))
|
||||
page += 1
|
||||
utils.logger.info(f"Note details: {note_details}")
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
|
||||
await self.batch_get_note_comments(note_id_list)
|
||||
|
||||
async def get_specified_notes(self):
|
||||
@@ -136,15 +136,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
try:
|
||||
return await self.xhs_client.get_note_by_id(note_id)
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"Get note detail error: {ex}")
|
||||
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"have not fund note detail note_id:{note_id}, err: {ex}")
|
||||
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] have not found note detail note_id:{note_id}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def batch_get_note_comments(self, note_list: List[str]):
|
||||
"""Batch get note comments"""
|
||||
utils.logger.info(f"Begin batch get note comments, note list: {note_list}")
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list: List[Task] = []
|
||||
for note_id in note_list:
|
||||
@@ -155,7 +155,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
|
||||
"""Get note comments with keyword filtering and quantity limitation"""
|
||||
async with semaphore:
|
||||
utils.logger.info(f"Begin get note id comments {note_id}")
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
|
||||
all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
|
||||
|
||||
# 从配置文件中读取关键词和数量限制
|
||||
@@ -191,7 +191,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
|
||||
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient:
|
||||
"""Create xhs client"""
|
||||
utils.logger.info("Begin create xiaohongshu API client ...")
|
||||
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
xhs_client_obj = XHSClient(
|
||||
proxies=httpx_proxy,
|
||||
@@ -215,7 +215,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
headless: bool = True
|
||||
) -> BrowserContext:
|
||||
"""Launch browser and create browser context"""
|
||||
utils.logger.info("Begin create browser context ...")
|
||||
utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...")
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
# feat issue #14
|
||||
# we will save login state to avoid login every time
|
||||
@@ -241,4 +241,4 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("Browser context closed ...")
|
||||
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
|
||||
|
||||
@@ -37,7 +37,7 @@ class XHSLogin(AbstractLogin):
|
||||
"""
|
||||
|
||||
if "请通过验证" in await self.context_page.content():
|
||||
utils.logger.info("登录过程中出现验证码,请手动验证")
|
||||
utils.logger.info("[XHSLogin.check_login_state] 登录过程中出现验证码,请手动验证")
|
||||
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
@@ -48,7 +48,7 @@ class XHSLogin(AbstractLogin):
|
||||
|
||||
async def begin(self):
|
||||
"""Start login xiaohongshu"""
|
||||
utils.logger.info("Begin login xiaohongshu ...")
|
||||
utils.logger.info("[XHSLogin.begin] Begin login xiaohongshu ...")
|
||||
if self.login_type == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif self.login_type == "phone":
|
||||
@@ -56,11 +56,11 @@ class XHSLogin(AbstractLogin):
|
||||
elif self.login_type == "cookie":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")
|
||||
raise ValueError("[XHSLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookies ...")
|
||||
|
||||
async def login_by_mobile(self):
|
||||
"""Login xiaohongshu by mobile"""
|
||||
utils.logger.info("Begin login xiaohongshu by mobile ...")
|
||||
utils.logger.info("[XHSLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
|
||||
await asyncio.sleep(1)
|
||||
try:
|
||||
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
|
||||
@@ -77,7 +77,7 @@ class XHSLogin(AbstractLogin):
|
||||
)
|
||||
await element.click()
|
||||
except Exception as e:
|
||||
utils.logger.info("have not found mobile button icon and keep going ...")
|
||||
utils.logger.info("[XHSLogin.login_by_mobile] have not found mobile button icon and keep going ...")
|
||||
|
||||
await asyncio.sleep(1)
|
||||
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
||||
@@ -93,7 +93,7 @@ class XHSLogin(AbstractLogin):
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
no_logged_in_session = ""
|
||||
while max_get_sms_code_time > 0:
|
||||
utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
utils.logger.info(f"[XHSLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
sms_code_key = f"xhs_{self.login_phone}"
|
||||
sms_code_value = redis_obj.get(sms_code_key)
|
||||
@@ -119,16 +119,16 @@ class XHSLogin(AbstractLogin):
|
||||
try:
|
||||
await self.check_login_state(no_logged_in_session)
|
||||
except RetryError:
|
||||
utils.logger.info("Login xiaohongshu failed by mobile login method ...")
|
||||
utils.logger.info("[XHSLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
utils.logger.info(f"[XHSLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login xiaohongshu website and keep webdriver login state"""
|
||||
utils.logger.info("Begin login xiaohongshu by qrcode ...")
|
||||
utils.logger.info("[XHSLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
|
||||
# login_selector = "div.login-container > div.left > div.qrcode > img"
|
||||
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
|
||||
# find login qrcode
|
||||
@@ -137,7 +137,7 @@ class XHSLogin(AbstractLogin):
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("login failed , have not found qrcode please check ....")
|
||||
utils.logger.info("[XHSLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||||
# if this website does not automatically popup login dialog box, we will manual click login button
|
||||
await asyncio.sleep(0.5)
|
||||
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
|
||||
@@ -161,20 +161,20 @@ class XHSLogin(AbstractLogin):
|
||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||
|
||||
utils.logger.info(f"waiting for scan code login, remaining time is 120s")
|
||||
utils.logger.info(f"[XHSLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
|
||||
try:
|
||||
await self.check_login_state(no_logged_in_session)
|
||||
except RetryError:
|
||||
utils.logger.info("Login xiaohongshu failed by qrcode login method ...")
|
||||
utils.logger.info("[XHSLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
utils.logger.info(f"[XHSLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_cookies(self):
|
||||
"""login xiaohongshu website by cookies"""
|
||||
utils.logger.info("Begin login xiaohongshu by cookie ...")
|
||||
utils.logger.info("[XHSLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
if key != "web_session": # only set web_session cookie attr
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user