refactor: 优化抖音Crawler部分代码

fix: 日志初始化错误修复
This commit is contained in:
Relakkes
2023-07-15 21:30:12 +08:00
parent dad8d56ab5
commit 2398a17e21
10 changed files with 186 additions and 152 deletions

View File

@@ -6,9 +6,11 @@ import httpx
import execjs
import urllib.parse
from playwright.async_api import Page
from playwright.async_api import BrowserContext
from .field import *
from .exception import *
from tools import utils
class DOUYINClient:
@@ -33,7 +35,6 @@ class DOUYINClient:
headers = headers or self.headers
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage")
douyin_js_obj = execjs.compile(open('libs/douyin.js').read())
# douyin_js_obj = execjs.compile(open('libs/X-Bogus.js').read())
common_params = {
"device_platform": "webapp",
"aid": "6383",
@@ -82,6 +83,17 @@ class DOUYINClient:
headers = headers or self.headers
return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers)
@staticmethod
async def ping(browser_context: BrowserContext) -> bool:
    """Tell whether the cookies of ``browser_context`` mark the session as logged in.

    The check is cookie-based only: it is true exactly when the converted
    cookie dict maps ``LOGIN_STATUS`` to the string ``"1"``.

    Args:
        browser_context: Playwright browser context whose cookies are inspected.

    Returns:
        ``True`` when the ``LOGIN_STATUS`` cookie equals ``"1"``, else ``False``.
    """
    browser_cookies = await browser_context.cookies()
    _cookie_str, cookie_dict = utils.convert_cookies(browser_cookies)
    # TODO: send a real API request to verify the login status.
    login_status = cookie_dict.get("LOGIN_STATUS")
    return login_status == "1"
async def update_cookies(self, browser_context: BrowserContext):
    """Re-read the cookies of ``browser_context`` and store them on this client.

    The cookie string replaces the ``Cookie`` entry of ``self.headers`` and
    the cookie dict replaces ``self.cookie_dict``.

    Args:
        browser_context: Playwright browser context to read the cookies from.
    """
    fresh_cookies = await browser_context.cookies()
    converted = utils.convert_cookies(fresh_cookies)
    cookie_str, cookie_dict = converted
    # Header first, dict second -- same assignment order as before.
    self.headers["Cookie"] = cookie_str
    self.cookie_dict = cookie_dict
async def search_info_by_keyword(
self,
keyword: str,

View File

@@ -1,13 +1,14 @@
import logging
import os
import asyncio
import logging
from asyncio import Task
from argparse import Namespace
from typing import Optional, List, Dict, Tuple
from playwright.async_api import async_playwright
from playwright.async_api import Page
from playwright.async_api import Cookie
from playwright.async_api import BrowserType
from playwright.async_api import BrowserContext
from playwright.async_api import Page
import config
from tools import utils
@@ -21,12 +22,11 @@ from models import douyin
class DouYinCrawler(AbstractCrawler):
def __init__(self):
self.cookies: Optional[List[Cookie]] = None
self.browser_context: Optional[BrowserContext] = None
self.context_page: Optional[Page] = None
self.proxy: Optional[Dict] = None
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.dy_client: Optional[DOUYINClient] = None
self.index_url = "https://www.douyin.com"
self.command_args: Optional[Namespace] = None
self.account_pool: Optional[AccountPool] = None
@@ -34,94 +34,45 @@ class DouYinCrawler(AbstractCrawler):
for key, value in kwargs.items():
setattr(self, key, value)
def create_proxy_info(self) -> Tuple[str, Dict, str]:
    """Build the login phone plus proxy settings for playwright and httpx.

    Returns:
        A ``(phone, playwright_proxy, httpx_proxy)`` tuple, where
        ``playwright_proxy`` is a dict with ``server``/``username``/``password``
        keys and ``httpx_proxy`` is a single URL string with the credentials
        embedded.
    """
    # Example values -- phone: 13012345671, ip_proxy: 111.122.xx.xx1:8888
    # The phone number and the IP proxy both come from the account pool and
    # are bound to each other as a fixed pair.
    account = self.account_pool.get_account()
    phone, ip_proxy = account

    playwright_proxy = dict(
        server=f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
        username=config.IP_PROXY_USER,
        password=config.IP_PROXY_PASSWORD,
    )
    credentials = f"{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}"
    httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{credentials}@{ip_proxy}"
    return phone, playwright_proxy, httpx_proxy
async def start(self):
# phone: 1340xxxx, ip_proxy: 47.xxx.xxx.xxx:8888
account_phone, ip_proxy = self.account_pool.get_account()
# 抖音平台如果开启代理登录的话,会被风控,所以这里不开启代理
playwright_proxy = None
# playwright_proxy = {
# "server": f"{config.ip_proxy_protocol}{ip_proxy}",
# "username": config.ip_proxy_user,
# "password": config.ip_proxy_password,
# }
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
if not config.ENABLE_IP_PROXY:
playwright_proxy = None
httpx_proxy = None
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
async with async_playwright() as playwright:
# Launch a browser context.
chromium = playwright.chromium
browser = await chromium.launch(headless=config.HEADLESS, proxy=playwright_proxy)
self.browser_context = await browser.new_context(
viewport={"width": 1800, "height": 900},
user_agent=self.user_agent,
self.browser_context = await self.launch_browser(
chromium,
playwright_proxy,
self.user_agent,
headless=config.HEADLESS
)
# execute JS to bypass anti automation/crawler detection
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto("https://www.douyin.com", wait_until="domcontentloaded")
await asyncio.sleep(3)
await self.context_page.goto(self.index_url)
# begin login
login_obj = DouYinLogin(
login_type=self.command_args.lt,
login_phone=account_phone,
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES
)
await login_obj.begin()
# update cookies
await self.update_cookies()
# init request client
cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
self.dy_client = DOUYINClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
"Cookie": cookie_str,
"Host": "www.douyin.com",
"Origin": "https://www.douyin.com/",
"Referer": "https://www.douyin.com/",
"Content-Type": "application/json;charset=UTF-8"
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
self.dy_client = await self.create_douyin_client(httpx_proxy)
if not await self.dy_client.ping(browser_context=self.browser_context):
login_obj = DouYinLogin(
login_type=self.command_args.lt,
login_phone=account_phone,
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES
)
await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context)
# search_posts
await self.search_posts()
# block main crawler coroutine
await asyncio.Event().wait()
async def update_cookies(self):
self.cookies = await self.browser_context.cookies()
utils.logger.info("Douyin Crawler finished ...")
async def search_posts(self):
logging.info("Begin search douyin keywords")
utils.logger.info("Begin search douyin keywords")
for keyword in config.KEYWORDS.split(","):
logging.info(f"Current keyword: {keyword}")
utils.logger.info(f"Current keyword: {keyword}")
aweme_list: List[str] = []
max_note_len = 20
max_note_len = config.MAX_PAGE_NUM
page = 0
while max_note_len > 0:
try:
@@ -139,8 +90,8 @@ class DouYinCrawler(AbstractCrawler):
continue
aweme_list.append(aweme_info.get("aweme_id"))
await douyin.update_douyin_aweme(aweme_item=aweme_info)
print(f"keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
# await self.batch_get_note_comments(aweme_list)
async def batch_get_note_comments(self, aweme_list: List[str]):
task_list: List[Task] = []
@@ -155,6 +106,71 @@ class DouYinCrawler(AbstractCrawler):
aweme_id=aweme_id,
callback=douyin.batch_update_dy_aweme_comments
)
print(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
except DataFetchError as e:
logging.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
    """Create the login phone and the proxy info for playwright and httpx.

    The phone number and the IP proxy are handed out together by the account
    pool. The phone is needed by the login flow (``start`` passes it on as
    ``login_phone``) independently of whether an IP proxy is used, so
    disabling the proxy only drops the two proxy values, not the phone.

    Returns:
        A ``(phone, playwright_proxy, httpx_proxy)`` tuple.

        * Proxy enabled: ``playwright_proxy`` is a dict with
          ``server``/``username``/``password`` keys and ``httpx_proxy`` is a
          URL string with the credentials embedded.
        * Proxy disabled: both proxy values are ``None``; ``phone`` is still
          taken from the account pool when a pool is set, otherwise ``None``.
    """
    if not config.ENABLE_IP_PROXY:
        if self.account_pool is None:
            # No pool to ask: keep the previous all-None result.
            return None, None, None
        # Only the phone half of the (phone, ip_proxy) pair is needed here.
        phone, _unused_ip_proxy = self.account_pool.get_account()
        return phone, None, None

    # Example values -- phone: 13012345671  ip_proxy: 111.122.xx.xx1:8888
    phone, ip_proxy = self.account_pool.get_account()
    playwright_proxy = {
        "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
        "username": config.IP_PROXY_USER,
        "password": config.IP_PROXY_PASSWORD,
    }
    httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
    return phone, playwright_proxy, httpx_proxy
async def create_douyin_client(self, httpx_proxy: str) -> DOUYINClient:
    """Build a ``DOUYINClient`` from the current browser-context cookies.

    Args:
        httpx_proxy: Proxy value forwarded to the client as ``proxies``.

    Returns:
        A client bound to ``self.context_page`` whose request headers carry
        this crawler's user agent and the current cookie string.
    """
    browser_cookies = await self.browser_context.cookies()
    cookie_str, cookie_dict = utils.convert_cookies(browser_cookies)
    request_headers = {
        "User-Agent": self.user_agent,
        "Cookie": cookie_str,
        "Host": "www.douyin.com",
        "Origin": "https://www.douyin.com/",
        "Referer": "https://www.douyin.com/",
        "Content-Type": "application/json;charset=UTF-8",
    }
    return DOUYINClient(
        proxies=httpx_proxy,
        headers=request_headers,
        playwright_page=self.context_page,
        cookie_dict=cookie_dict,
    )
async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True
) -> BrowserContext:
    """Start chromium and return a browser context for it.

    When ``config.SAVE_LOGIN_STATE`` is truthy a persistent context is
    launched on a user-data directory under ``<cwd>/browser_data``; otherwise
    a regular browser is launched and a fresh context is created on it.

    Args:
        chromium: Playwright browser type used for launching.
        playwright_proxy: Proxy settings passed to the launch call, or ``None``.
        user_agent: User agent applied to the context.
        headless: Whether to launch without a visible window.

    Returns:
        The created browser context (1920x1080 viewport in both modes).
    """
    viewport_size = {"width": 1920, "height": 1080}

    if not config.SAVE_LOGIN_STATE:
        # Non-persistent mode: plain browser plus a new context.
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)
        return await browser.new_context(
            viewport=viewport_size,
            user_agent=user_agent
        )

    # Persistent mode: the profile directory name is derived from the platform argument.
    profile_name = config.USER_DATA_DIR % self.command_args.platform
    user_data_dir = os.path.join(os.getcwd(), "browser_data", profile_name)
    return await chromium.launch_persistent_context(
        user_data_dir=user_data_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,
        viewport=viewport_size,
        user_agent=user_agent
    )
async def close(self):
    """Close the crawler's browser context, then log that it was closed."""
    context = self.browser_context
    await context.close()
    utils.logger.info("Browser context closed ...")

View File

@@ -8,13 +8,14 @@ from tenacity import (
retry,
stop_after_attempt,
wait_fixed,
retry_if_result
retry_if_result,
RetryError
)
from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError
from playwright.async_api import BrowserContext
import config
from tools import utils, easing
from tools import utils
from base.base_crawler import AbstractLogin
@@ -54,21 +55,22 @@ class DouYinLogin(AbstractLogin):
raise ValueError("Invalid Login Type Currently only supported qrcode or phone ...")
# 如果页面重定向到滑动验证码页面,需要再次滑动滑块
await asyncio.sleep(3)
await asyncio.sleep(6)
current_page_title = await self.context_page.title()
if "验证码中间页" in current_page_title:
await self.check_page_display_slider(move_step=3, slider_level="hard")
# check login state
logging.info(f"login finished then check login state ...")
login_flag: bool = await self.check_login_state()
if not login_flag:
logging.info("login failed please confirm ...")
utils.logger.info(f"login finished then check login state ...")
try:
await self.check_login_state()
except RetryError:
utils.logger.info("login failed please confirm ...")
sys.exit()
# wait for redirect
wait_redirect_seconds = 5
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
@@ -88,31 +90,31 @@ class DouYinLogin(AbstractLogin):
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
except Exception as e:
logging.error(f"login dialog box does not pop up automatically, error: {e}")
logging.info("login dialog box does not pop up automatically, we will manually click the login button")
utils.logger.info("login dialog box does not pop up automatically, we will manually click the login button")
login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
await login_button_ele.click()
await asyncio.sleep(0.5)
async def login_by_qrcode(self):
logging.info("Begin login douyin by qrcode...")
utils.logger.info("Begin login douyin by qrcode...")
qrcode_img_selector = "xpath=//article[@class='web-login']//img"
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
logging.info("login qrcode not found please confirm ...")
utils.logger.info("login qrcode not found please confirm ...")
sys.exit()
# show login qrcode
# utils.show_qrcode(base64_qrcode_img)
# utils.show_qrcode(base64_qrcode_img)
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
utils.show_qrcode(base64_qrcode_img)
await asyncio.sleep(2)
async def login_by_mobile(self):
logging.info("Begin login douyin by mobile ...")
utils.logger.info("Begin login douyin by mobile ...")
mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
await mobile_tap_ele.click()
await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
@@ -128,7 +130,7 @@ class DouYinLogin(AbstractLogin):
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
while max_get_sms_code_time > 0:
logging.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"dy_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key)
@@ -170,20 +172,20 @@ class DouYinLogin(AbstractLogin):
# 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮
page_content = await self.context_page.content()
if "操作过慢" in page_content or "提示重新操作" in page_content:
logging.info("slider verify failed, retry ...")
utils.logger.info("slider verify failed, retry ...")
await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
continue
# 滑动成功后,等待滑块消失
await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
# 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码
logging.info("slider verify success ...")
utils.logger.info("slider verify success ...")
slider_verify_success = True
except Exception as e:
logging.error(f"slider verify failed, error: {e}")
await asyncio.sleep(1)
max_slider_try_times -= 1
logging.info(f"remaining slider try times: {max_slider_try_times}")
utils.logger.info(f"remaining slider try times: {max_slider_try_times}")
continue
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
@@ -240,7 +242,7 @@ class DouYinLogin(AbstractLogin):
await self.context_page.mouse.up()
async def login_by_cookies(self):
logging.info("Begin login douyin by cookie ...")
utils.logger.info("Begin login douyin by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,