refactor: 优化代码

This commit is contained in:
Relakkes
2023-07-29 15:35:40 +08:00
parent febbb133d7
commit 4ff2cf8661
17 changed files with 133 additions and 140 deletions

View File

@@ -1,17 +1,17 @@
import copy
import asyncio
from typing import Optional, Dict, Callable
import httpx
import execjs
import copy
import urllib.parse
from playwright.async_api import Page
from playwright.async_api import BrowserContext
from typing import Callable, Dict, Optional
import execjs
import httpx
from playwright.async_api import BrowserContext, Page
from .field import *
from .exception import *
from tools import utils
from .exception import *
from .field import *
class DOUYINClient:
def __init__(

View File

@@ -1,38 +1,38 @@
import os
import asyncio
import os
from asyncio import Task
from argparse import Namespace
from typing import Optional, List, Dict, Tuple
from typing import Dict, List, Optional, Tuple
from playwright.async_api import async_playwright
from playwright.async_api import BrowserType
from playwright.async_api import BrowserContext
from playwright.async_api import Page
from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright)
import config
from tools import utils
from .client import DOUYINClient
from .exception import DataFetchError
from .login import DouYinLogin
from base.base_crawler import AbstractCrawler
from base.proxy_account_pool import AccountPool
from models import douyin
from tools import utils
from .client import DOUYINClient
from .exception import DataFetchError
from .login import DouYinLogin
class DouYinCrawler(AbstractCrawler):
platform: str
login_type: str
context_page: Page
dy_client: DOUYINClient
account_pool: AccountPool
browser_context: BrowserContext
def __init__(self) -> None:
self.browser_context: Optional[BrowserContext] = None # type: ignore
self.context_page: Optional[Page] = None # type: ignore
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.index_url = "https://www.douyin.com"
self.command_args: Optional[Namespace] = None # type: ignore
self.account_pool: Optional[AccountPool] = None # type: ignore
def init_config(self, **kwargs):
for key, value in kwargs.items():
setattr(self, key, value)
def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
self.platform = platform
self.login_type = login_type
self.account_pool = account_pool
async def start(self) -> None:
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -53,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
self.dy_client = await self.create_douyin_client(httpx_proxy)
if not await self.dy_client.ping(browser_context=self.browser_context):
login_obj = DouYinLogin(
login_type=self.command_args.lt, # type: ignore
login_type=self.login_type,
login_phone=account_phone,
browser_context=self.browser_context,
context_page=self.context_page,
@@ -63,25 +63,25 @@ class DouYinCrawler(AbstractCrawler):
await self.dy_client.update_cookies(browser_context=self.browser_context)
# search_posts
await self.search_posts()
await self.search()
utils.logger.info("Douyin Crawler finished ...")
async def search_posts(self) -> None:
async def search(self) -> None:
utils.logger.info("Begin search douyin keywords")
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"Current keyword: {keyword}")
aweme_list: List[str] = []
max_note_len = config.MAX_PAGE_NUM
dy_limite_count = 10 # douyin fixed limit page 10
page = 0
while max_note_len > 0:
while (page + 1) * dy_limite_count <= config.CRAWLER_MAX_NOTES_COUNT:
try:
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword, offset=page * 10)
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
offset=page * dy_limite_count)
except DataFetchError:
utils.logger.error(f"search douyin keyword: {keyword} failed")
break
page += 1
max_note_len -= 10
for post_item in posts_res.get("data"):
try:
aweme_info: Dict = post_item.get("aweme_info") or \
@@ -93,15 +93,15 @@ class DouYinCrawler(AbstractCrawler):
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
async def batch_get_note_comments(self, aweme_list: List[str]):
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
task_list: List[Task] = []
_semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
for aweme_id in aweme_list:
task = asyncio.create_task(self.get_comments(aweme_id, _semaphore), name=aweme_id)
task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
task_list.append(task)
await asyncio.wait(task_list)
async def get_comments(self, aweme_id: str, semaphore: "asyncio.Semaphore"):
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
async with semaphore:
try:
await self.dy_client.get_aweme_all_comments(
@@ -155,7 +155,7 @@ class DouYinCrawler(AbstractCrawler):
"""Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.command_args.platform) # type: ignore
config.USER_DATA_DIR % self.platform) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,
@@ -173,7 +173,7 @@ class DouYinCrawler(AbstractCrawler):
)
return browser_context
async def close(self):
async def close(self) -> None:
"""Close browser context"""
await self.browser_context.close()
utils.logger.info("Browser context closed ...")

View File

@@ -1,22 +1,17 @@
import sys
import asyncio
import functools
import sys
from typing import Optional
import aioredis
from tenacity import (
retry,
stop_after_attempt,
wait_fixed,
retry_if_result,
RetryError
)
from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError
from playwright.async_api import BrowserContext
from playwright.async_api import BrowserContext, Page
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
import config
from tools import utils
from base.base_crawler import AbstractLogin
from tools import utils
class DouYinLogin(AbstractLogin):

View File

@@ -1,16 +1,16 @@
import json
import asyncio
from typing import Optional, Dict
import json
from typing import Dict, Optional
import httpx
from playwright.async_api import Page
from playwright.async_api import BrowserContext
from playwright.async_api import BrowserContext, Page
from .help import sign, get_search_id
from .field import SearchSortType, SearchNoteType
from .exception import DataFetchError, IPBlockError
from tools import utils
from .exception import DataFetchError, IPBlockError
from .field import SearchNoteType, SearchSortType
from .help import get_search_id, sign
class XHSClient:
def __init__(

View File

@@ -1,41 +1,41 @@
import asyncio
import os
import random
import asyncio
from asyncio import Task
from typing import Optional, List, Dict, Tuple
from argparse import Namespace
from typing import Dict, List, Optional, Tuple
from playwright.async_api import Page
from playwright.async_api import BrowserContext
from playwright.async_api import async_playwright
from playwright.async_api import BrowserType
from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright)
import config
from tools import utils
from .exception import *
from .login import XHSLogin
from .client import XHSClient
from models import xiaohongshu as xhs_model
from base.base_crawler import AbstractCrawler
from base.proxy_account_pool import AccountPool
from models import xiaohongshu as xhs_model
from tools import utils
from .client import XHSClient
from .exception import DataFetchError
from .login import XHSLogin
class XiaoHongShuCrawler(AbstractCrawler):
platform: str
login_type: str
context_page: Page
browser_context: BrowserContext
xhs_client: XHSClient
account_pool: AccountPool
browser_context: BrowserContext
def __init__(self):
def __init__(self) -> None:
self.index_url = "https://www.xiaohongshu.com"
self.command_args: Optional[Namespace] = None # type: ignore
self.user_agent = utils.get_user_agent()
def init_config(self, **kwargs):
for key, value in kwargs.items():
setattr(self, key, value)
def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
self.platform = platform
self.login_type = login_type
self.account_pool = account_pool
async def start(self):
async def start(self) -> None:
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
async with async_playwright() as playwright:
# Launch a browser context.
@@ -62,7 +62,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.xhs_client = await self.create_xhs_client(httpx_proxy)
if not await self.xhs_client.ping():
login_obj = XHSLogin(
login_type=self.command_args.lt,
login_type=self.login_type,
login_phone=account_phone,
browser_context=self.browser_context,
context_page=self.context_page,
@@ -72,28 +72,27 @@ class XiaoHongShuCrawler(AbstractCrawler):
await self.xhs_client.update_cookies(browser_context=self.browser_context)
# Search for notes and retrieve their comment information.
await self.search_posts()
await self.search()
utils.logger.info("Xhs Crawler finished ...")
async def search_posts(self) -> None:
async def search(self) -> None:
"""Search for notes and retrieve their comment information."""
utils.logger.info("Begin search xiaohongshu keywords")
xhs_limit_count = 20 # xhs limit page fixed value
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"Current keyword: {keyword}")
max_note_len = config.MAX_PAGE_NUM
page = 1
while max_note_len > 0:
while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
note_id_list: List[str] = []
posts_res = await self.xhs_client.get_note_by_keyword(
notes_res = await self.xhs_client.get_note_by_keyword(
keyword=keyword,
page=page,
)
_semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail(post_item.get("id"), _semaphore)
for post_item in posts_res.get("items", {})
self.get_note_detail(post_item.get("id"), semaphore)
for post_item in notes_res.get("items", {})
]
note_details = await asyncio.gather(*task_list)
for note_detail in note_details:
@@ -101,11 +100,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
await xhs_model.update_xhs_note(note_detail)
note_id_list.append(note_detail.get("note_id"))
page += 1
max_note_len -= 20
utils.logger.info(f"Note details: {note_details}")
await self.batch_get_note_comments(note_id_list)
async def get_note_detail(self, note_id: str, semaphore: "asyncio.Semaphore") -> Optional[Dict]:
async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
"""Get note detail"""
async with semaphore:
try:
@@ -117,14 +115,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def batch_get_note_comments(self, note_list: List[str]):
"""Batch get note comments"""
utils.logger.info(f"Begin batch get note comments, note list: {note_list}")
_semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
for note_id in note_list:
task = asyncio.create_task(self.get_comments(note_id, _semaphore), name=note_id)
task = asyncio.create_task(self.get_comments(note_id, semaphore), name=note_id)
task_list.append(task)
await asyncio.gather(*task_list)
async def get_comments(self, note_id: str, semaphore: "asyncio.Semaphore"):
async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
"""Get note comments"""
async with semaphore:
utils.logger.info(f"Begin get note id comments {note_id}")
@@ -147,7 +145,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
return phone, playwright_proxy, httpx_proxy
async def create_xhs_client(self, httpx_proxy: str) -> XHSClient:
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient:
"""Create xhs client"""
utils.logger.info("Begin create xiaohongshu API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
@@ -177,18 +175,19 @@ class XiaoHongShuCrawler(AbstractCrawler):
if config.SAVE_LOGIN_STATE:
# feat issue #14
# we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform) # type: ignore
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,
headless=headless,
proxy=playwright_proxy, # type: ignore
proxy=playwright_proxy, # type: ignore
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
)
return browser_context
else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent=user_agent

View File

@@ -1,21 +1,16 @@
import sys
import asyncio
import functools
import sys
from typing import Optional
import aioredis
from tenacity import (
retry,
stop_after_attempt,
wait_fixed,
retry_if_result,
RetryError
)
from playwright.async_api import Page
from playwright.async_api import BrowserContext
from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
import config
from tools import utils
from base.base_crawler import AbstractLogin
from tools import utils
class XHSLogin(AbstractLogin):
@@ -24,7 +19,7 @@ class XHSLogin(AbstractLogin):
login_type: str,
browser_context: BrowserContext,
context_page: Page,
login_phone: str = "",
login_phone: Optional[str] = "",
cookie_str: str = ""
):
self.login_type = login_type