feat: 知乎支持创作者主页数据爬取(回答、文章、视频)

This commit is contained in:
Relakkes
2024-10-16 21:02:27 +08:00
parent af9d2d8e84
commit da8f1c62b8
8 changed files with 511 additions and 66 deletions

View File

@@ -5,18 +5,19 @@ from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode
import httpx
from httpx import Response
from playwright.async_api import BrowserContext, Page
from tenacity import retry, stop_after_attempt, wait_fixed
import config
from base.base_crawler import AbstractApiClient
from constant import zhihu as zhihu_constant
from model.m_zhihu import ZhihuComment, ZhihuContent
from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
from tools import utils
from .exception import DataFetchError, ForbiddenError
from .field import SearchSort, SearchTime, SearchType
from .help import ZhiHuJsonExtractor, sign
from .help import ZhihuExtractor, sign
class ZhiHuClient(AbstractApiClient):
@@ -33,7 +34,7 @@ class ZhiHuClient(AbstractApiClient):
self.timeout = timeout
self.default_headers = headers
self.cookie_dict = cookie_dict
self._extractor = ZhiHuJsonExtractor()
self._extractor = ZhihuExtractor()
async def _pre_headers(self, url: str) -> Dict:
"""
@@ -95,7 +96,7 @@ class ZhiHuClient(AbstractApiClient):
raise DataFetchError(response.text)
async def get(self, uri: str, params=None) -> Dict:
async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, str]:
"""
GET请求对请求头签名
Args:
@@ -109,7 +110,7 @@ class ZhiHuClient(AbstractApiClient):
if isinstance(params, dict):
final_uri += '?' + urlencode(params)
headers = await self._pre_headers(final_uri)
return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers)
return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers, **kwargs)
async def pong(self) -> bool:
"""
@@ -194,7 +195,7 @@ class ZhiHuClient(AbstractApiClient):
}
search_res = await self.get(uri, params)
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
return self._extractor.extract_contents(search_res)
return self._extractor.extract_contents_from_search(search_res)
async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10,
order_by: str = "sort") -> Dict:
@@ -317,3 +318,170 @@ class ZhiHuClient(AbstractApiClient):
all_sub_comments.extend(sub_comments)
await asyncio.sleep(crawl_interval)
return all_sub_comments
async def get_creator_info(self, url_token: str) -> Optional[ZhihuCreator]:
"""
获取创作者信息
Args:
url_token:
Returns:
"""
uri = f"/people/{url_token}"
html_content: str = await self.get(uri, return_response=True)
return self._extractor.extract_creator(url_token, html_content)
async def get_creator_answers(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
"""
获取创作者的回答
Args:
url_token:
offset:
limit:
Returns:
"""
uri = f"/api/v4/members/{url_token}/answers"
params = {
"include":"data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,excerpt,paid_info,reaction_instruction,is_labeled,label_info,relationship.is_authorized,voting,is_author,is_thanked,is_nothelp;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;data[*].question.has_publishing_draft,relationship",
"offset": offset,
"limit": limit,
"order_by": "created"
}
return await self.get(uri, params)
async def get_creator_articles(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
"""
获取创作者的文章
Args:
url_token:
offset:
limit:
Returns:
"""
uri = f"/api/v4/members/{url_token}/articles"
params = {
"include":"data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,reaction_instruction,is_labeled,label_info;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;",
"offset": offset,
"limit": limit,
"order_by": "created"
}
return await self.get(uri, params)
async def get_creator_videos(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
"""
获取创作者的视频
Args:
url_token:
offset:
limit:
Returns:
"""
uri = f"/api/v4/members/{url_token}/zvideos"
params = {
"include":"similar_zvideo,creation_relationship,reaction_instruction",
"offset": offset,
"limit": limit,
"similar_aggregation": "true"
}
return await self.get(uri, params)
async def get_all_anwser_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0,
callback: Optional[Callable] = None) -> List[ZhihuContent]:
"""
获取创作者的所有回答
Args:
creator: 创作者信息
crawl_interval: 爬取一次笔记的延迟单位(秒)
callback: 一次笔记爬取结束后
Returns:
"""
all_contents: List[ZhihuContent] = []
is_end: bool = False
offset: int = 0
limit: int = 20
while not is_end:
res = await self.get_creator_answers(creator.url_token, offset, limit)
if not res:
break
utils.logger.info(f"[ZhiHuClient.get_all_anwser_by_creator] Get creator {creator.url_token} answers: {res}")
paging_info = res.get("paging", {})
is_end = paging_info.get("is_end")
contents = self._extractor.extract_content_list_from_creator(res.get("data"))
if callback:
await callback(contents)
all_contents.extend(contents)
offset += limit
await asyncio.sleep(crawl_interval)
return all_contents
async def get_all_articles_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0,
callback: Optional[Callable] = None) -> List[ZhihuContent]:
"""
获取创作者的所有文章
Args:
creator:
crawl_interval:
callback:
Returns:
"""
all_contents: List[ZhihuContent] = []
is_end: bool = False
offset: int = 0
limit: int = 20
while not is_end:
res = await self.get_creator_articles(creator.url_token, offset, limit)
if not res:
break
paging_info = res.get("paging", {})
is_end = paging_info.get("is_end")
contents = self._extractor.extract_content_list_from_creator(res.get("data"))
if callback:
await callback(contents)
all_contents.extend(contents)
offset += limit
await asyncio.sleep(crawl_interval)
return all_contents
async def get_all_videos_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0,
callback: Optional[Callable] = None) -> List[ZhihuContent]:
"""
获取创作者的所有视频
Args:
creator:
crawl_interval:
callback:
Returns:
"""
all_contents: List[ZhihuContent] = []
is_end: bool = False
offset: int = 0
limit: int = 20
while not is_end:
res = await self.get_creator_videos(creator.url_token, offset, limit)
if not res:
break
paging_info = res.get("paging", {})
is_end = paging_info.get("is_end")
contents = self._extractor.extract_content_list_from_creator(res.get("data"))
if callback:
await callback(contents)
all_contents.extend(contents)
offset += limit
await asyncio.sleep(crawl_interval)
return all_contents

View File

@@ -10,7 +10,7 @@ from playwright.async_api import (BrowserContext, BrowserType, Page,
import config
from base.base_crawler import AbstractCrawler
from model.m_zhihu import ZhihuContent
from model.m_zhihu import ZhihuContent, ZhihuCreator
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import zhihu as zhihu_store
from tools import utils
@@ -18,7 +18,7 @@ from var import crawler_type_var, source_keyword_var
from .client import ZhiHuClient
from .exception import DataFetchError
from .help import ZhiHuJsonExtractor
from .help import ZhihuExtractor
from .login import ZhiHuLogin
@@ -31,7 +31,7 @@ class ZhihuCrawler(AbstractCrawler):
self.index_url = "https://www.zhihu.com"
# self.user_agent = utils.get_user_agent()
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
self._extractor = ZhiHuJsonExtractor()
self._extractor = ZhihuExtractor()
async def start(self) -> None:
"""
@@ -74,7 +74,7 @@ class ZhihuCrawler(AbstractCrawler):
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
# 知乎的搜索接口需要打开搜索页面之后cookies才能访问API单独的首页不行
utils.logger.info("[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies过程需要5秒左右")
utils.logger.info("[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies过程需要5秒左右")
await self.context_page.goto(f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content")
await asyncio.sleep(5)
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
@@ -88,7 +88,7 @@ class ZhihuCrawler(AbstractCrawler):
raise NotImplementedError
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments
raise NotImplementedError
await self.get_creators_and_notes()
else:
pass
@@ -169,6 +169,53 @@ class ZhihuCrawler(AbstractCrawler):
callback=zhihu_store.batch_update_zhihu_note_comments
)
async def get_creators_and_notes(self) -> None:
"""
Get creator's information and their notes and comments
Returns:
"""
utils.logger.info("[ZhihuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
for user_link in config.ZHIHU_CREATOR_URL_LIST:
utils.logger.info(f"[ZhihuCrawler.get_creators_and_notes] Begin get creator {user_link}")
user_url_token = user_link.split("/")[-1]
# get creator detail info from web html content
createor_info: ZhihuCreator = await self.zhihu_client.get_creator_info(url_token=user_url_token)
if not createor_info:
utils.logger.info(f"[ZhihuCrawler.get_creators_and_notes] Creator {user_url_token} not found")
continue
utils.logger.info(f"[ZhihuCrawler.get_creators_and_notes] Creator info: {createor_info}")
await zhihu_store.save_creator(creator=createor_info)
# 默认只提取回答信息,如果需要文章和视频,把下面的注释打开即可
# Get all anwser information of the creator
all_content_list = await self.zhihu_client.get_all_anwser_by_creator(
creator=createor_info,
crawl_interval=random.random(),
callback=zhihu_store.batch_update_zhihu_contents
)
# Get all articles of the creator's contents
# all_content_list = await self.zhihu_client.get_all_articles_by_creator(
# creator=createor_info,
# crawl_interval=random.random(),
# callback=zhihu_store.batch_update_zhihu_contents
# )
# Get all videos of the creator's contents
# all_content_list = await self.zhihu_client.get_all_videos_by_creator(
# creator=createor_info,
# crawl_interval=random.random(),
# callback=zhihu_store.batch_update_zhihu_contents
# )
# Get all comments of the creator's contents
await self.batch_get_content_comments(all_content_list)
@staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx"""

View File

@@ -1,8 +1,10 @@
# -*- coding: utf-8 -*-
from typing import Dict, List
import json
from typing import Dict, List, Optional
from urllib.parse import parse_qs, urlparse
import execjs
from parsel import Selector
from constant import zhihu as zhihu_constant
from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
@@ -29,11 +31,11 @@ def sign(url: str, cookies: str) -> Dict:
return ZHIHU_SGIN_JS.call("get_sign", url, cookies)
class ZhiHuJsonExtractor:
class ZhihuExtractor:
def __init__(self):
pass
def extract_contents(self, json_data: Dict) -> List[ZhihuContent]:
def extract_contents_from_search(self, json_data: Dict) -> List[ZhihuContent]:
"""
extract zhihu contents
Args:
@@ -45,21 +47,34 @@ class ZhiHuJsonExtractor:
if not json_data:
return []
result: List[ZhihuContent] = []
search_result: List[Dict] = json_data.get("data", [])
search_result = [s_item for s_item in search_result if s_item.get("type") in ['search_result', 'zvideo']]
for sr_item in search_result:
sr_object: Dict = sr_item.get("object", {})
if sr_object.get("type") == zhihu_constant.ANSWER_NAME:
result.append(self._extract_answer_content(sr_object))
elif sr_object.get("type") == zhihu_constant.ARTICLE_NAME:
result.append(self._extract_article_content(sr_object))
elif sr_object.get("type") == zhihu_constant.VIDEO_NAME:
result.append(self._extract_zvideo_content(sr_object))
return self._extract_content_list([sr_item.get("object") for sr_item in search_result if sr_item.get("object")])
def _extract_content_list(self, content_list: List[Dict]) -> List[ZhihuContent]:
"""
extract zhihu content list
Args:
content_list:
Returns:
"""
if not content_list:
return []
res: List[ZhihuContent] = []
for content in content_list:
if content.get("type") == zhihu_constant.ANSWER_NAME:
res.append(self._extract_answer_content(content))
elif content.get("type") == zhihu_constant.ARTICLE_NAME:
res.append(self._extract_article_content(content))
elif content.get("type") == zhihu_constant.VIDEO_NAME:
res.append(self._extract_zvideo_content(content))
else:
continue
return result
return res
def _extract_answer_content(self, answer: Dict) -> ZhihuContent:
"""
@@ -72,22 +87,23 @@ class ZhiHuJsonExtractor:
res = ZhihuContent()
res.content_id = answer.get("id")
res.content_type = answer.get("type")
res.content_text = extract_text_from_html(answer.get("content"))
res.content_text = extract_text_from_html(answer.get("content", ""))
res.question_id = answer.get("question").get("id")
res.content_url = f"{zhihu_constant.ZHIHU_URL}/question/{res.question_id}/answer/{res.content_id}"
res.title = extract_text_from_html(answer.get("title"))
res.desc = extract_text_from_html(answer.get("description"))
res.title = extract_text_from_html(answer.get("title", ""))
res.desc = extract_text_from_html(answer.get("description", "") or answer.get("excerpt", ""))
res.created_time = answer.get("created_time")
res.updated_time = answer.get("updated_time")
res.voteup_count = answer.get("voteup_count")
res.comment_count = answer.get("comment_count")
res.voteup_count = answer.get("voteup_count", 0)
res.comment_count = answer.get("comment_count", 0)
# extract author info
author_info = self._extract_author(answer.get("author"))
author_info = self._extract_content_or_comment_author(answer.get("author"))
res.user_id = author_info.user_id
res.user_link = author_info.user_link
res.user_nickname = author_info.user_nickname
res.user_avatar = author_info.user_avatar
res.user_url_token = author_info.url_token
return res
def _extract_article_content(self, article: Dict) -> ZhihuContent:
@@ -106,17 +122,18 @@ class ZhiHuJsonExtractor:
res.content_url = f"{zhihu_constant.ZHIHU_URL}/p/{res.content_id}"
res.title = extract_text_from_html(article.get("title"))
res.desc = extract_text_from_html(article.get("excerpt"))
res.created_time = article.get("created_time")
res.updated_time = article.get("updated_time")
res.voteup_count = article.get("voteup_count")
res.comment_count = article.get("comment_count")
res.created_time = article.get("created_time", 0) or article.get("created", 0)
res.updated_time = article.get("updated_time", 0) or article.get("updated", 0)
res.voteup_count = article.get("voteup_count", 0)
res.comment_count = article.get("comment_count", 0)
# extract author info
author_info = self._extract_author(article.get("author"))
author_info = self._extract_content_or_comment_author(article.get("author"))
res.user_id = author_info.user_id
res.user_link = author_info.user_link
res.user_nickname = author_info.user_nickname
res.user_avatar = author_info.user_avatar
res.user_url_token = author_info.url_token
return res
def _extract_zvideo_content(self, zvideo: Dict) -> ZhihuContent:
@@ -129,25 +146,34 @@ class ZhiHuJsonExtractor:
"""
res = ZhihuContent()
res.content_id = zvideo.get("zvideo_id")
if "video" in zvideo and isinstance(zvideo.get("video"), dict): # 说明是从创作者主页的视频列表接口来的
res.content_id = zvideo.get("video").get("video_id")
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
res.created_time = zvideo.get("published_at")
res.updated_time = zvideo.get("updated_at")
else:
res.content_id = zvideo.get("zvideo_id")
res.content_url = zvideo.get("video_url")
res.created_time = zvideo.get("created_at")
res.content_type = zvideo.get("type")
res.content_url = zvideo.get("video_url")
res.title = extract_text_from_html(zvideo.get("title"))
res.desc = extract_text_from_html(zvideo.get("description"))
res.created_time = zvideo.get("created_at")
res.voteup_count = zvideo.get("voteup_count")
res.comment_count = zvideo.get("comment_count")
# extract author info
author_info = self._extract_author(zvideo.get("author"))
author_info = self._extract_content_or_comment_author(zvideo.get("author"))
res.user_id = author_info.user_id
res.user_link = author_info.user_link
res.user_nickname = author_info.user_nickname
res.user_avatar = author_info.user_avatar
res.user_url_token = author_info.url_token
return res
@staticmethod
def _extract_author(author: Dict) -> ZhihuCreator:
def _extract_content_or_comment_author(author: Dict) -> ZhihuCreator:
"""
extract zhihu author
Args:
@@ -165,6 +191,7 @@ class ZhiHuJsonExtractor:
res.user_link = f"{zhihu_constant.ZHIHU_URL}/people/{author.get('url_token')}"
res.user_nickname = author.get("name")
res.user_avatar = author.get("avatar_url")
res.url_token = author.get("url_token")
return res
def extract_comments(self, page_content: ZhihuContent, comments: List[Dict]) -> List[ZhihuComment]:
@@ -209,7 +236,7 @@ class ZhiHuJsonExtractor:
res.content_type = page_content.content_type
# extract author info
author_info = self._extract_author(comment.get("author"))
author_info = self._extract_content_or_comment_author(comment.get("author"))
res.user_id = author_info.user_id
res.user_link = author_info.user_link
res.user_nickname = author_info.user_nickname
@@ -254,3 +281,80 @@ class ZhiHuJsonExtractor:
query_params = parse_qs(parsed_url.query)
offset = query_params.get('offset', [""])[0]
return offset
@staticmethod
def _foramt_gender_text(gender: int) -> str:
"""
format gender text
Args:
gender:
Returns:
"""
if gender == 1:
return ""
elif gender == 0:
return ""
else:
return "未知"
def extract_creator(self, user_url_token: str, html_content: str) -> Optional[ZhihuCreator]:
"""
extract zhihu creator
Args:
user_url_token : zhihu creator url token
html_content: zhihu creator html content
Returns:
"""
if not html_content:
return None
js_init_data = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="").strip()
if not js_init_data:
return None
js_init_data_dict: Dict = json.loads(js_init_data)
users_info: Dict = js_init_data_dict.get("initialState", {}).get("entities", {}).get("users", {})
if not users_info:
return None
creator_info: Dict = users_info.get(user_url_token)
if not creator_info:
return None
res = ZhihuCreator()
res.user_id = creator_info.get("id")
res.user_link = f"{zhihu_constant.ZHIHU_URL}/people/{user_url_token}"
res.user_nickname = creator_info.get("name")
res.user_avatar = creator_info.get("avatarUrl")
res.url_token = creator_info.get("urlToken") or user_url_token
res.gender = self._foramt_gender_text(creator_info.get("gender"))
res.ip_location = creator_info.get("ipInfo")
res.follows = creator_info.get("followingCount")
res.fans = creator_info.get("followerCount")
res.anwser_count = creator_info.get("answerCount")
res.video_count = creator_info.get("zvideoCount")
res.question_count = creator_info.get("questionCount")
res.article_count = creator_info.get("articlesCount")
res.column_count = creator_info.get("columnsCount")
res.get_voteup_count = creator_info.get("voteupCount")
return res
def extract_content_list_from_creator(self, anwser_list: List[Dict]) -> List[ZhihuContent]:
"""
extract content list from creator
Args:
anwser_list:
Returns:
"""
if not anwser_list:
return []
return self._extract_content_list(anwser_list)