Mirror of https://github.com/NanmiCoder/MediaCrawler.git (synced 2026-06-07 18:37:30 +08:00)
refactor: 优化代码 (optimize code)
This commit is contained in:
@@ -1,38 +1,38 @@
|
||||
import os
|
||||
import asyncio
|
||||
import os
|
||||
from asyncio import Task
|
||||
from argparse import Namespace
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
from playwright.async_api import BrowserType
|
||||
from playwright.async_api import BrowserContext
|
||||
from playwright.async_api import Page
|
||||
from playwright.async_api import (BrowserContext, BrowserType, Page,
|
||||
async_playwright)
|
||||
|
||||
import config
|
||||
from tools import utils
|
||||
from .client import DOUYINClient
|
||||
from .exception import DataFetchError
|
||||
from .login import DouYinLogin
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from base.proxy_account_pool import AccountPool
|
||||
from models import douyin
|
||||
from tools import utils
|
||||
|
||||
from .client import DOUYINClient
|
||||
from .exception import DataFetchError
|
||||
from .login import DouYinLogin
|
||||
|
||||
|
||||
class DouYinCrawler(AbstractCrawler):
|
||||
platform: str
|
||||
login_type: str
|
||||
context_page: Page
|
||||
dy_client: DOUYINClient
|
||||
account_pool: AccountPool
|
||||
browser_context: BrowserContext
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.browser_context: Optional[BrowserContext] = None # type: ignore
|
||||
self.context_page: Optional[Page] = None # type: ignore
|
||||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
|
||||
self.index_url = "https://www.douyin.com"
|
||||
self.command_args: Optional[Namespace] = None # type: ignore
|
||||
self.account_pool: Optional[AccountPool] = None # type: ignore
|
||||
|
||||
def init_config(self, **kwargs):
|
||||
for key, value in kwargs.items():
|
||||
setattr(self, key, value)
|
||||
def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
|
||||
self.platform = platform
|
||||
self.login_type = login_type
|
||||
self.account_pool = account_pool
|
||||
|
||||
async def start(self) -> None:
|
||||
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
|
||||
@@ -53,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
self.dy_client = await self.create_douyin_client(httpx_proxy)
|
||||
if not await self.dy_client.ping(browser_context=self.browser_context):
|
||||
login_obj = DouYinLogin(
|
||||
login_type=self.command_args.lt, # type: ignore
|
||||
login_type=self.login_type,
|
||||
login_phone=account_phone,
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
@@ -63,25 +63,25 @@ class DouYinCrawler(AbstractCrawler):
|
||||
await self.dy_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
# search_posts
|
||||
await self.search_posts()
|
||||
await self.search()
|
||||
|
||||
utils.logger.info("Douyin Crawler finished ...")
|
||||
|
||||
async def search_posts(self) -> None:
|
||||
async def search(self) -> None:
|
||||
utils.logger.info("Begin search douyin keywords")
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
utils.logger.info(f"Current keyword: {keyword}")
|
||||
aweme_list: List[str] = []
|
||||
max_note_len = config.MAX_PAGE_NUM
|
||||
dy_limite_count = 10 # douyin fixed limit page 10
|
||||
page = 0
|
||||
while max_note_len > 0:
|
||||
while (page + 1) * dy_limite_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
try:
|
||||
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword, offset=page * 10)
|
||||
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
|
||||
offset=page * dy_limite_count)
|
||||
except DataFetchError:
|
||||
utils.logger.error(f"search douyin keyword: {keyword} failed")
|
||||
break
|
||||
page += 1
|
||||
max_note_len -= 10
|
||||
for post_item in posts_res.get("data"):
|
||||
try:
|
||||
aweme_info: Dict = post_item.get("aweme_info") or \
|
||||
@@ -93,15 +93,15 @@ class DouYinCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
|
||||
await self.batch_get_note_comments(aweme_list)
|
||||
|
||||
async def batch_get_note_comments(self, aweme_list: List[str]):
|
||||
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
|
||||
task_list: List[Task] = []
|
||||
_semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
for aweme_id in aweme_list:
|
||||
task = asyncio.create_task(self.get_comments(aweme_id, _semaphore), name=aweme_id)
|
||||
task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
|
||||
task_list.append(task)
|
||||
await asyncio.wait(task_list)
|
||||
|
||||
async def get_comments(self, aweme_id: str, semaphore: "asyncio.Semaphore"):
|
||||
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
|
||||
async with semaphore:
|
||||
try:
|
||||
await self.dy_client.get_aweme_all_comments(
|
||||
@@ -155,7 +155,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
"""Launch browser and create browser context"""
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
||||
config.USER_DATA_DIR % self.command_args.platform) # type: ignore
|
||||
config.USER_DATA_DIR % self.platform) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
@@ -173,7 +173,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
)
|
||||
return browser_context
|
||||
|
||||
async def close(self):
|
||||
async def close(self) -> None:
|
||||
"""Close browser context"""
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("Browser context closed ...")
|
||||
|
||||
Reference in New Issue
Block a user