mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-06 01:47:26 +08:00
feat: 抖音支持指定视频列表爬取
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import asyncio
|
||||
import copy
|
||||
import urllib.parse
|
||||
from typing import Callable, Dict, Optional
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
import execjs
|
||||
import httpx
|
||||
@@ -129,7 +129,7 @@ class DOUYINClient:
|
||||
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
||||
return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)
|
||||
|
||||
async def get_video_by_id(self, aweme_id: str):
|
||||
async def get_video_by_id(self, aweme_id: str) -> Any:
|
||||
"""
|
||||
DouYin Video Detail API
|
||||
:param aweme_id:
|
||||
@@ -139,9 +139,10 @@ class DOUYINClient:
|
||||
"aweme_id": aweme_id
|
||||
}
|
||||
headers = copy.copy(self.headers)
|
||||
headers["Cookie"] = "s_v_web_id=verify_leytkxgn_kvO5kOmO_SdMs_4t1o_B5ml_BUqtWM1mP6BF;"
|
||||
# headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
|
||||
del headers["Origin"]
|
||||
return await self.get("/aweme/v1/web/aweme/detail/", params, headers)
|
||||
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
|
||||
return res.get("aweme_detail", {})
|
||||
|
||||
async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
|
||||
"""get note comments
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import asyncio
|
||||
import os
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (BrowserContext, BrowserType, Page,
|
||||
async_playwright)
|
||||
@@ -11,7 +11,7 @@ from base.base_crawler import AbstractCrawler
|
||||
from base.proxy_account_pool import AccountPool
|
||||
from models import douyin
|
||||
from tools import utils
|
||||
from var import request_keyword_var
|
||||
from var import crawler_type_var
|
||||
|
||||
from .client import DOUYINClient
|
||||
from .exception import DataFetchError
|
||||
@@ -64,20 +64,19 @@ class DouYinCrawler(AbstractCrawler):
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.dy_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
crawler_type_var.set(self.crawler_type)
|
||||
if self.crawler_type == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
elif self.crawler_type == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_notes()
|
||||
await self.get_specified_awemes()
|
||||
|
||||
utils.logger.info("Douyin Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
|
||||
utils.logger.info("Begin search douyin keywords")
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
request_keyword_var.set(keyword)
|
||||
utils.logger.info(f"Current keyword: {keyword}")
|
||||
aweme_list: List[str] = []
|
||||
dy_limit_count = 10
|
||||
@@ -101,10 +100,29 @@ class DouYinCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
|
||||
await self.batch_get_note_comments(aweme_list)
|
||||
|
||||
async def get_specified_notes(self):
|
||||
async def get_specified_awemes(self):
|
||||
"""Get the information and comments of the specified post"""
|
||||
# todo douyin support
|
||||
pass
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST
|
||||
]
|
||||
aweme_details = await asyncio.gather(*task_list)
|
||||
for aweme_detail in aweme_details:
|
||||
if aweme_detail is not None:
|
||||
await douyin.update_douyin_aweme(aweme_detail)
|
||||
await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
|
||||
|
||||
async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
|
||||
"""Fetch one aweme's detail via dy_client.get_video_by_id while holding the semaphore; log and return None on DataFetchError or KeyError."""
|
||||
async with semaphore:
|
||||
try:
|
||||
return await self.dy_client.get_video_by_id(aweme_id)
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"Get aweme detail error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"have not fund note detail aweme_id:{aweme_id}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
|
||||
task_list: List[Task] = []
|
||||
|
||||
@@ -74,13 +74,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
await login_obj.begin()
|
||||
await self.xhs_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
crawler_type_var.set(self.crawler_type)
|
||||
if self.crawler_type == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
crawler_type_var.set("search")
|
||||
await self.search()
|
||||
elif self.crawler_type == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
crawler_type_var.set("detail")
|
||||
await self.get_specified_notes()
|
||||
else:
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user