mirror of
https://github.com/fengxxc/wechatmp2markdown.git
synced 2026-02-20 22:10:45 +08:00
168 lines
4.1 KiB
Go
168 lines
4.1 KiB
Go
package parse
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
"io/ioutil"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
func parseSection(s *goquery.Selection) Paragraph {
|
|
var piece []Piece
|
|
s.Children().Each(func(i int, s *goquery.Selection) {
|
|
var p Piece
|
|
attr := make(map[string]string)
|
|
if s.Is("span") {
|
|
p = Piece{NORMAL_TEXT, s.Text(), nil}
|
|
} else if s.Is("a") {
|
|
attr["href"], _ = s.Attr("href")
|
|
p = Piece{LINK, removeBrAndBlank(s.Text()), attr}
|
|
} else if s.Is("img") {
|
|
attr["src"], _ = s.Attr("src")
|
|
attr["alt"], _ = s.Attr("alt")
|
|
attr["title"], _ = s.Attr("title")
|
|
p = Piece{IMAGE, "", attr}
|
|
} else if s.Is("ol") {
|
|
// TODO
|
|
} else if s.Is("ul") {
|
|
// TODO
|
|
} else {
|
|
p = Piece{NORMAL_TEXT, s.Text(), nil}
|
|
// TODO
|
|
}
|
|
// fmt.Printf("%+v\n", t)
|
|
piece = append(piece, p)
|
|
})
|
|
return Paragraph{piece}
|
|
}
|
|
|
|
func parseHeader(s *goquery.Selection) Paragraph {
|
|
var level int
|
|
switch {
|
|
case s.Is("h1"):
|
|
level = 1
|
|
case s.Is("h2"):
|
|
level = 2
|
|
case s.Is("h3"):
|
|
level = 3
|
|
case s.Is("h4"):
|
|
level = 4
|
|
case s.Is("h5"):
|
|
level = 5
|
|
case s.Is("h6"):
|
|
level = 6
|
|
}
|
|
attr := map[string]string{"level": strconv.Itoa(level)}
|
|
p := Piece{HEADER, removeBrAndBlank(s.Text()), attr}
|
|
return Paragraph{[]Piece{p}}
|
|
}
|
|
|
|
func parsePre(s *goquery.Selection) Paragraph {
|
|
var codeRows []string
|
|
s.Find("code").Each(func(i int, s *goquery.Selection) {
|
|
codeRows = append(codeRows, s.Text())
|
|
})
|
|
p := Piece{CODE_BLOCK, codeRows, nil}
|
|
return Paragraph{[]Piece{p}}
|
|
}
|
|
|
|
func ParseFromReader(r io.Reader) Article {
|
|
var article Article
|
|
doc, err := goquery.NewDocumentFromReader(r)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
var mainContent *goquery.Selection = doc.Find("#img-content")
|
|
|
|
// 标题
|
|
title := mainContent.Find("#activity-name").Text()
|
|
attr := map[string]string{"level": "1"}
|
|
article.Title = Piece{HEADER, removeBrAndBlank(title), attr}
|
|
|
|
// meta 细节待完善
|
|
meta := mainContent.Find("#meta_content").Text()
|
|
meta = removeBrAndBlank(meta)
|
|
article.Meta = meta
|
|
|
|
// tags 细节待完善
|
|
tags := mainContent.Find("#js_tags").Text()
|
|
tags = removeBrAndBlank(tags)
|
|
article.Tags = tags
|
|
|
|
// content
|
|
// section[style="line-height: 1.5em;"]>span,a => 一般段落(含文本和超链接)
|
|
// p[style="line-height: 1.5em;"] => 项目列表(有序/无序)
|
|
// section[style=".*text-align:center"]>img => 居中段落(图片)
|
|
content := mainContent.Find("#js_content")
|
|
var sections []Paragraph
|
|
content.Children().Each(func(i int, s *goquery.Selection) {
|
|
var paragraph Paragraph
|
|
if s.Is("pre") || s.Is("section.code-snippet__fix") {
|
|
// 代码块
|
|
paragraph = parsePre(s)
|
|
} else if s.Is("p") || s.Is("section") {
|
|
paragraph = parseSection(s)
|
|
} else if s.Is("h1") || s.Is("h2") || s.Is("h3") || s.Is("h4") || s.Is("h5") || s.Is("h6") {
|
|
paragraph = parseHeader(s)
|
|
} else if s.Is("ol") {
|
|
// TODO
|
|
} else if s.Is("ul") {
|
|
// TODO
|
|
}
|
|
// sections[i] = block
|
|
sections = append(sections, paragraph)
|
|
})
|
|
article.Content = sections
|
|
|
|
return article
|
|
}
|
|
|
|
func ParseFromHTMLString(s string) Article {
|
|
return ParseFromReader(strings.NewReader(s))
|
|
}
|
|
|
|
func ParseFromHTMLFile(filepath string) Article {
|
|
file, err := os.Open(filepath)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
defer file.Close()
|
|
content, err2 := ioutil.ReadAll(file)
|
|
if err2 != nil {
|
|
panic(err)
|
|
}
|
|
return ParseFromReader(bytes.NewReader(content))
|
|
}
|
|
|
|
func ParseFromURL(url string) Article {
|
|
res, err := http.Get(url)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
defer res.Body.Close()
|
|
if res.StatusCode != 200 {
|
|
log.Fatalf("get from url %s error: %d %s", url, res.StatusCode, res.Status)
|
|
}
|
|
return ParseFromReader(res.Body)
|
|
}
|
|
|
|
// multiBlankPattern matches a run of two or more whitespace characters. It is
// compiled once at package initialisation rather than on every call.
var multiBlankPattern = regexp.MustCompile("\\s{2,}")

// removeBrAndBlank normalises whitespace in s for single-line output.
//
// Every run of two or more whitespace characters is reduced to the first
// character of that run, and afterwards every remaining newline is replaced
// by a space. Single whitespace characters other than newline (for example a
// lone tab) are left as they are, and leading/trailing whitespace is not
// trimmed.
func removeBrAndBlank(s string) string {
	collapsed := multiBlankPattern.ReplaceAllStringFunc(s, func(run string) string {
		// \s only matches the single-byte ASCII whitespace characters
		// [\t\n\f\r ], so the first byte is the first whitespace character.
		return run[:1]
	})
	return strings.Replace(collapsed, "\n", " ", -1)
}
|