temp commit

This commit is contained in:
Relakkes
2024-08-06 19:21:34 +08:00
parent d347cf5a2c
commit 1b585cb215
4 changed files with 7644 additions and 7 deletions

View File

@@ -32,7 +32,6 @@ class TieBaExtractor:
author = post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip()
author_link = post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default='')
date = post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip()
result.append({
"note_id": post_id,
"title": title,
@@ -47,6 +46,25 @@ class TieBaExtractor:
return result
@staticmethod
def extract_note_detail(page_content: str) -> Dict:
    """
    Extract the details of a single Tieba post from its detail page.

    Args:
        page_content: HTML source of the post detail page.

    Returns:
        A dict with the keys ``note_id``, ``title``, ``desc`` and
        ``note_url``. A field whose node is not found in the page is an
        empty string (``note_url`` is then just ``"/p/"``).
    """
    content_selector = Selector(text=page_content)
    # The "only view the thread starter" link looks like
    # /p/9117905169?see_lz=1, so the post id is the last path segment
    # once the query string has been cut off.
    only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip()
    note_id = only_view_author_link.split("?")[0].split("/")[-1]
    # Select the text node / attribute value explicitly: selecting the
    # element itself would make .get() return its serialized markup.
    title = content_selector.xpath("//*[@id='j_core_title_wrap']/h3/text()").get(default='').strip()
    desc = content_selector.xpath("//meta[@name='description']/@content").get(default='').strip()
    note_url = f"/p/{note_id}"
    return {
        "note_id": note_id,
        "title": title,
        "desc": desc,
        "note_url": note_url,
    }
@staticmethod
def extract_tieba_note_comments(page_content: str) -> List[Dict]:
    """
    Extract the comments listed on a Tieba post page.

    Args:
        page_content: HTML source of the post detail page.

    Returns:
        One dict per matched comment element, with the keys
        ``comment_id``, ``author``, ``author_link``, ``content`` and
        ``time``. A field whose node is not found is an empty string.
        The list is empty when no comment element matches.
    """
    # Each comment is a div inside the element with id "j_p_postlist";
    # the class attribute is matched as an exact string.
    xpath_selector = "//div[@id='j_p_postlist']/div[@class='l_post l_post_bright j_l_post clearfix']"
    comment_list = Selector(text=page_content).xpath(xpath_selector)
    result = []
    for comment in comment_list:
        # First data-pid attribute found on the comment node or below it.
        comment_id = comment.xpath(".//@data-pid").get(default='').strip()
        author = comment.xpath(".//a[@data-field]/text()").get(default='').strip()
        author_link = comment.xpath(".//a[@data-field]/@href").get(default='')
        # Only the first text node of the content div is taken.
        content = comment.xpath(".//div[@class='d_post_content j_d_post_content ']/text()").get(default='').strip()
        date = comment.xpath(".//span[@class='tail-info']/text()").get(default='').strip()
        result.append({
            "comment_id": comment_id,
            "author": author,
            "author_link": author_link,
            "content": content,
            "time": date,
        })
    return result
if __name__ == '__main__':