feature: 支持小红书图片、视频下载

This commit is contained in:
helloteemo
2024-07-11 22:49:05 +08:00
parent 1a776ee968
commit 6545a15ff3
5 changed files with 153 additions and 6 deletions

View File

@@ -129,6 +129,15 @@ class XiaoHongShuClient(AbstractApiClient):
return await self.request(method="POST", url=f"{self._host}{uri}",
data=json_str, headers=headers)
async def get_note_media(self, url: str) -> bytes | None:
async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request("GET", url, timeout=self.timeout)
if not response.reason_phrase == "OK":
utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
return None
else:
return response.content
async def pong(self) -> bool:
"""
用于检查登录态是否失效了

View File

@@ -120,6 +120,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
for note_detail in note_details:
if note_detail is not None:
await xhs_store.update_xhs_note(note_detail)
await self.get_notice_media(note_detail)
note_id_list.append(note_detail.get("note_id"))
page += 1
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
@@ -171,6 +172,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
for note_detail in note_details:
if note_detail is not None:
await xhs_store.update_xhs_note(note_detail)
await self.get_notice_media(note_detail)
await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -276,4 +278,63 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def close(self):
"""Close browser context"""
await self.browser_context.close()
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
async def get_notice_media(self, note_detail: Dict):
if not config.ENABLE_GET_IMAGES:
utils.logger.info(f"[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled")
return
await self.get_note_images(note_detail)
await self.get_notice_video(note_detail)
async def get_note_images(self, note_item: Dict):
"""
get note images. please use get_notice_media
:param note_item:
:return:
"""
if not config.ENABLE_GET_IMAGES:
return
note_id = note_item.get("note_id")
image_list: List[Dict] = note_item.get("image_list", [])
for img in image_list:
if img.get('url_default') != '':
img.update({'url': img.get('url_default')})
if not image_list:
return
picNum = 0
for pic in image_list:
url = pic.get("url")
if not url:
continue
content = await self.xhs_client.get_note_media(url)
if content is None:
continue
extension_file_name = f"{picNum}.jpg"
picNum += 1
await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
async def get_notice_video(self, note_item: Dict):
"""
get note images. please use get_notice_media
:param note_item:
:return:
"""
if not config.ENABLE_GET_IMAGES:
return
note_id = note_item.get("note_id")
videos = xhs_store.get_video_url_arr(note_item)
if not videos:
return
videoNum = 0
for url in videos:
content = await self.xhs_client.get_note_media(url)
if content is None:
continue
extension_file_name = f"{videoNum}.mp4"
videoNum += 1
await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)