mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-04-21 11:17:38 +08:00
feat: 小红书笔记搜索、评论获取 done
docs: update docs Create .gitattributes Update README.md
This commit is contained in:
1
media_platform/douyin/__init__.py
Normal file
1
media_platform/douyin/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .core import DouYinCrawler
|
||||
42
media_platform/douyin/client.py
Normal file
42
media_platform/douyin/client.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from typing import Optional, Dict
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import Page
|
||||
|
||||
|
||||
class DOUYINClient:
    """Async HTTP client for the Douyin (抖音) web API.

    Thin wrapper around ``httpx`` that also keeps a reference to the
    Playwright page and browser cookies used by the crawler session.
    Several methods are still stubs at this stage of the project.
    """

    def __init__(
            self,
            timeout: int = 10,
            proxies: Optional[Dict] = None,
            headers: Optional[Dict] = None,
            playwright_page: Optional[Page] = None,
            cookie_dict: Optional[Dict] = None
    ):
        # Per-request timeout in seconds, forwarded to httpx on every call.
        self.timeout = timeout
        # Proxy mapping passed straight through to httpx.AsyncClient.
        self.proxies = proxies
        # Default request headers (currently stored only; request() does not
        # apply them itself — callers pass headers via **kwargs).
        self.headers = headers
        self._host = "https://www.douyin.com"
        # Live Playwright page, presumably kept for in-browser param signing
        # later on — TODO confirm once _pre_params is implemented.
        self.playwright_page = playwright_page
        # Cookies captured from the logged-in browser session.
        self.cookie_dict = cookie_dict

    async def _pre_params(self, url: str, data=None):
        """Placeholder for request-signing / common-parameter logic (TODO)."""
        pass

    async def request(self, method: str, url: str, **kwargs):
        """Send an HTTP request and unwrap Douyin's JSON response envelope.

        :param method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
        :param url: absolute URL to request.
        :param kwargs: extra arguments forwarded to ``httpx`` (params, data,
            headers, ...).
        :return: the ``"data"`` payload when the envelope reports success
            (falling back to the ``"success"`` flag itself), otherwise
            ``None``.
        """
        async with httpx.AsyncClient(proxies=self.proxies) as client:
            response = await client.request(
                method, url, timeout=self.timeout,
                **kwargs
            )
            data = response.json()
            # .get() instead of data["success"]: an error page or unexpected
            # envelope without the key must not raise KeyError here.
            if data.get("success"):
                return data.get("data", data.get("success"))
            # API-level failure: swallowed for now, explicit None so the
            # best-effort contract is visible to callers (TODO: raise).
            return None

    async def get(self, uri: str, params=None):
        """GET helper (TODO: delegate to :meth:`request`)."""
        pass

    async def post(self, uri: str, data: dict):
        """POST helper (TODO: delegate to :meth:`request`)."""
        pass
|
||||
61
media_platform/douyin/core.py
Normal file
61
media_platform/douyin/core.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import sys
|
||||
import asyncio
|
||||
from typing import Optional, List, Dict
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
from playwright.async_api import Page
|
||||
from playwright.async_api import Cookie
|
||||
from playwright.async_api import BrowserContext
|
||||
|
||||
import utils
|
||||
from .client import DOUYINClient
|
||||
from base_crawler import Crawler
|
||||
|
||||
|
||||
class DouYinCrawler(Crawler):
    """Playwright-driven crawler for douyin.com using a QR-code login flow."""

    def __init__(self):
        # Runtime options, populated later by init_config().
        self.keywords: Optional[str] = None
        self.scan_qrcode_time: Optional[int] = None
        # Browser session state, populated by start().
        self.browser_context: Optional[BrowserContext] = None
        self.context_page: Optional[Page] = None
        self.cookies: Optional[List[Cookie]] = None
        self.proxy: Optional[Dict] = None
        self.user_agent = utils.get_user_agent()
        # API client — created after login elsewhere in the flow.
        self.dy_client: Optional[DOUYINClient] = None

    def init_config(self, **kwargs):
        """Store the launcher-supplied options on the crawler instance."""
        self.keywords = kwargs.get("keywords")
        self.scan_qrcode_time = kwargs.get("scan_qrcode_time")

    async def start(self):
        """Launch a visible Chromium, log in, then park this coroutine."""
        async with async_playwright() as driver:
            launched_browser = await driver.chromium.launch(headless=False)
            self.browser_context = await launched_browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=self.user_agent,
                proxy=self.proxy
            )
            # execute JS to bypass anti automation/crawler detection
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto("https://www.douyin.com")

            # scan qrcode login
            await self.login()
            await self.update_cookies()

            # keep the coroutine (and thus the browser) alive indefinitely
            await asyncio.Event().wait()

    async def update_cookies(self):
        """Snapshot the current browser-context cookies onto the crawler."""
        self.cookies = await self.browser_context.cookies()

    async def login(self):
        """QR-code login step (not implemented yet)."""
        pass

    def search_posts(self):
        """Keyword search for posts (not implemented yet)."""
        pass

    def get_comments(self, item_id: str):
        """Fetch comments for a single post (not implemented yet)."""
        pass
|
||||
Reference in New Issue
Block a user