feature: 支持小红书图片、视频下载

This commit is contained in:
helloteemo
2024-07-11 22:49:05 +08:00
parent 1a776ee968
commit 6545a15ff3
5 changed files with 153 additions and 6 deletions

View File

@@ -120,6 +120,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
for note_detail in note_details:
if note_detail is not None:
await xhs_store.update_xhs_note(note_detail)
await self.get_notice_media(note_detail)
note_id_list.append(note_detail.get("note_id"))
page += 1
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
@@ -171,6 +172,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
for note_detail in note_details:
if note_detail is not None:
await xhs_store.update_xhs_note(note_detail)
await self.get_notice_media(note_detail)
await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -276,4 +278,63 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def close(self):
"""Close browser context"""
await self.browser_context.close()
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
async def get_notice_media(self, note_detail: Dict):
if not config.ENABLE_GET_IMAGES:
utils.logger.info(f"[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled")
return
await self.get_note_images(note_detail)
await self.get_notice_video(note_detail)
async def get_note_images(self, note_item: Dict):
"""
get note images. please use get_notice_media
:param note_item:
:return:
"""
if not config.ENABLE_GET_IMAGES:
return
note_id = note_item.get("note_id")
image_list: List[Dict] = note_item.get("image_list", [])
for img in image_list:
if img.get('url_default') != '':
img.update({'url': img.get('url_default')})
if not image_list:
return
picNum = 0
for pic in image_list:
url = pic.get("url")
if not url:
continue
content = await self.xhs_client.get_note_media(url)
if content is None:
continue
extension_file_name = f"{picNum}.jpg"
picNum += 1
await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
async def get_notice_video(self, note_item: Dict):
"""
get note images. please use get_notice_media
:param note_item:
:return:
"""
if not config.ENABLE_GET_IMAGES:
return
note_id = note_item.get("note_id")
videos = xhs_store.get_video_url_arr(note_item)
if not videos:
return
videoNum = 0
for url in videos:
content = await self.xhs_client.get_note_media(url)
if content is None:
continue
extension_file_name = f"{videoNum}.mp4"
videoNum += 1
await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)