mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-03-16 08:20:50 +08:00
feat: 百度贴吧done
This commit is contained in:
@@ -1,17 +1,15 @@
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext
|
||||
from tenacity import (RetryError, retry, stop_after_attempt,
|
||||
wait_fixed)
|
||||
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from model.m_baidu_tieba import TiebaNote, TiebaComment
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
||||
from proxy.proxy_ip_pool import ProxyIpPool
|
||||
from tools import utils
|
||||
|
||||
@@ -103,7 +101,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
return res
|
||||
|
||||
utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
|
||||
raise e
|
||||
raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
|
||||
|
||||
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
|
||||
"""
|
||||
@@ -248,28 +246,44 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
|
||||
|
||||
all_sub_comments: List[TiebaComment] = []
|
||||
for comment in comments:
|
||||
if comment.sub_comment_count == 0:
|
||||
for parment_comment in comments:
|
||||
if parment_comment.sub_comment_count == 0:
|
||||
continue
|
||||
|
||||
current_page = 1
|
||||
max_sub_page_num = comment.sub_comment_count // 10 + 1
|
||||
max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
|
||||
while max_sub_page_num >= current_page:
|
||||
params = {
|
||||
"tid": comment.note_id, # 帖子ID
|
||||
"pid": comment.comment_id, # 父级评论ID
|
||||
"fid": comment.tieba_id, # 贴吧ID
|
||||
"tid": parment_comment.note_id, # 帖子ID
|
||||
"pid": parment_comment.comment_id, # 父级评论ID
|
||||
"fid": parment_comment.tieba_id, # 贴吧ID
|
||||
"pn": current_page # 页码
|
||||
}
|
||||
page_content = await self.get(uri, params=params, return_ori_content=True)
|
||||
sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
|
||||
parent_comment=comment)
|
||||
parent_comment=parment_comment)
|
||||
|
||||
if not sub_comments:
|
||||
break
|
||||
if callback:
|
||||
await callback(comment.note_id, sub_comments)
|
||||
await callback(parment_comment.note_id, sub_comments)
|
||||
all_sub_comments.extend(sub_comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
current_page += 1
|
||||
return all_sub_comments
|
||||
|
||||
|
||||
|
||||
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
    """
    Fetch one thread-list page of a tieba (forum) and parse it into notes.

    Args:
        tieba_name: Name of the tieba; sent as the ``kw`` query parameter.
        page_num: Value sent as the ``pn`` query parameter.
            NOTE(review): this looks like a thread offset rather than a
            page index - confirm against the caller.

    Returns:
        The notes produced by ``self._page_extractor.extract_tieba_note_list``
        from the fetched page content.
    """
    # Percent-encode the query instead of interpolating raw values, so a name
    # containing spaces, "&", "#" or "=" cannot corrupt the URL or smuggle in
    # extra query parameters.
    query = urlencode({"kw": tieba_name, "pn": page_num})
    page_content = await self.get(f"/f?{query}", return_ori_content=True)
    return self._page_extractor.extract_tieba_note_list(page_content)
|
||||
|
||||
@@ -53,6 +53,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
await self.get_specified_tieba_notes()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_notes()
|
||||
@@ -92,7 +93,7 @@ class TieBaCrawler(AbstractCrawler):
|
||||
if not notes_list:
|
||||
utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty")
|
||||
break
|
||||
utils.logger.info(f"[BaiduTieBaCrawler.search] Note List: {notes_list}")
|
||||
utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}")
|
||||
await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list])
|
||||
page += 1
|
||||
except Exception as ex:
|
||||
@@ -100,6 +101,34 @@ class TieBaCrawler(AbstractCrawler):
|
||||
f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}")
|
||||
break
|
||||
|
||||
async def get_specified_tieba_notes(self):
    """
    Crawl the thread lists of every tieba named in ``config.TIEBA_NAME_LIST``.

    For each tieba, list pages are requested through
    ``self.tieba_client.get_notes_by_tieba_name`` and every note id found is
    handed to ``self.get_specified_notes``. Paging for a tieba stops when a
    page yields no notes or when the offset reaches
    ``config.CRAWLER_MAX_NOTES_COUNT``.

    Returns:
        None
    """
    # Step added to the request offset after each list page.
    # NOTE(review): presumably the number of threads Tieba serves per list
    # page - confirm.
    tieba_limit_count = 50

    # The configured limit is raised (globally, on the config module) to at
    # least one step.
    if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count

    for tieba_name in config.TIEBA_NAME_LIST:
        utils.logger.info(
            f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}")
        page_number = 0
        # Strict "<": with "<=" one more list page was requested after the
        # offset had already reached the configured limit (e.g. a limit of 50
        # requested offsets 0 and 50).
        while page_number < config.CRAWLER_MAX_NOTES_COUNT:
            note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name(
                tieba_name=tieba_name,
                page_num=page_number
            )
            if not note_list:
                utils.logger.info(
                    f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty")
                break

            utils.logger.info(
                f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}")
            await self.get_specified_notes([note.note_id for note in note_list])
            page_number += tieba_limit_count
|
||||
|
||||
async def get_specified_notes(self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST):
|
||||
"""
|
||||
Get the information and comments of the specified post
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import json
|
||||
import html
|
||||
from typing import List, Dict, Tuple
|
||||
import json
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from parsel import Selector
|
||||
|
||||
from model.m_baidu_tieba import TiebaNote, TiebaComment
|
||||
from constant import baidu_tieba as const
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
||||
from tools import utils
|
||||
|
||||
|
||||
@@ -43,6 +43,42 @@ class TieBaExtractor:
|
||||
result.append(tieba_note)
|
||||
return result
|
||||
|
||||
def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
    """
    Extract the list of notes (threads) from a tieba thread-list page.

    Args:
        page_content: HTML source of the thread-list page.

    Returns:
        One ``TiebaNote`` per ``<li>`` under ``ul#thread_list`` for which
        ``self.extract_data_field_value`` yields a non-empty dict containing
        an ``id``.
    """
    # Remove comment openers so markup that sits behind "<!--" is seen by the
    # parser. NOTE(review): presumably Tieba ships the thread list inside an
    # HTML comment - confirm.
    page_content = page_content.replace('<!--', "")
    content_selector = Selector(text=page_content)

    # Forum-level fields are the same for every thread on the page, so they
    # are resolved once instead of once per post.
    tieba_name = content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
    tieba_link = const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get(
        default='')

    result: List[TiebaNote] = []
    for post_selector in content_selector.xpath("//ul[@id='thread_list']/li"):
        post_field_value: Dict = self.extract_data_field_value(post_selector)
        if not post_field_value:
            continue

        raw_note_id = post_field_value.get("id")
        if raw_note_id is None:
            # str(None) would yield the bogus note id "None" and a dead URL.
            continue
        note_id = str(raw_note_id)

        # The correctly spelled key is tried first; the misspelled
        # "authoer_nickname" that was looked up before is kept as a fallback.
        # NOTE(review): confirm which key Tieba's data field really carries.
        user_nickname = (post_field_value.get("author_nickname")
                         or post_field_value.get("authoer_nickname")
                         or post_field_value.get("author_name"))

        tieba_note = TiebaNote(
            note_id=note_id,
            title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
            desc=post_selector.xpath(".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(
                default='').strip(),
            note_url=const.TIEBA_URL + f"/p/{note_id}",
            user_link=const.TIEBA_URL + post_selector.xpath(
                ".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
            user_nickname=user_nickname,
            tieba_name=tieba_name,
            tieba_link=tieba_link,
            total_replay_num=post_field_value.get("reply_num", 0)
        )
        result.append(tieba_note)
    return result
|
||||
|
||||
def extract_note_detail(self, page_content: str) -> TiebaNote:
|
||||
"""
|
||||
提取贴吧帖子详情
|
||||
@@ -124,8 +160,7 @@ class TieBaExtractor:
|
||||
result.append(tieba_comment)
|
||||
return result
|
||||
|
||||
|
||||
def extract_tieba_note_sub_comments(self,page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
|
||||
def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
|
||||
"""
|
||||
提取贴吧帖子二级评论
|
||||
Args:
|
||||
@@ -144,7 +179,8 @@ class TieBaExtractor:
|
||||
if not comment_value:
|
||||
continue
|
||||
comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0]
|
||||
content = utils.extract_text_from_html(comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
|
||||
content = utils.extract_text_from_html(
|
||||
comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
|
||||
comment = TiebaComment(
|
||||
comment_id=str(comment_value.get("spid")),
|
||||
content=content,
|
||||
@@ -227,6 +263,7 @@ def test_extract_tieba_note_parment_comments():
|
||||
result = extractor.extract_tieba_note_parment_comments(content, "123456")
|
||||
print(result)
|
||||
|
||||
|
||||
def test_extract_tieba_note_sub_comments():
|
||||
with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
@@ -244,11 +281,21 @@ def test_extract_tieba_note_sub_comments():
|
||||
tieba_id="tieba_id",
|
||||
tieba_name="tieba_name",
|
||||
)
|
||||
result = extractor.extract_tieba_note_sub_comments(content,fake_parment_comment)
|
||||
result = extractor.extract_tieba_note_sub_comments(content, fake_parment_comment)
|
||||
print(result)
|
||||
|
||||
|
||||
def test_extract_tieba_note_list():
    """
    Manual smoke check: parse the saved thread-list fixture and print the
    notes the extractor returns.
    """
    fixture_path = "test_data/tieba_note_list.html"
    with open(fixture_path, "r", encoding="utf-8") as fixture:
        html_text = fixture.read()
    note_list = TieBaExtractor().extract_tieba_note_list(html_text)
    print(note_list)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual entry point: run the enabled extractor smoke checks in order.
    # Currently disabled: test_extract_search_note_list,
    # test_extract_note_detail, test_extract_tieba_note_parment_comments.
    for smoke_check in (test_extract_tieba_note_sub_comments, test_extract_tieba_note_list):
        smoke_check()
|
||||
|
||||
3627
media_platform/tieba/test_data/tieba_note_list.html
Normal file
3627
media_platform/tieba/test_data/tieba_note_list.html
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user