From 3d2ffb4ba9147cb48f54d8c25da147f2e6ed9e8b Mon Sep 17 00:00:00 2001
From: fengxxc
Date: Sun, 28 Nov 2021 19:11:49 +0800
Subject: [PATCH] first commit

---
 go.mod         |   8 +++
 go.sum         |  12 +++++
 main.go        |   7 +++
 parse/model.go |  41 +++++++++++++++
 parse/parse.go | 136 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 204 insertions(+)
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 main.go
 create mode 100644 parse/model.go
 create mode 100644 parse/parse.go

diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..b1eba92
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,8 @@
+module github.com/fengxxc/wechatmp2markdown
+
+go 1.16
+
+require (
+	github.com/PuerkitoBio/goquery v1.8.0
+	golang.org/x/net v0.0.0-20211123203042-d83791d6bcd9 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..582ec38
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,12 @@
+github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
+github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
+github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
+github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
+golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20211123203042-d83791d6bcd9 h1:0qxwC5n+ttVOINCBeRHO0nq9X7uy8SDsPoi5OaCdIEI=
+golang.org/x/net v0.0.0-20211123203042-d83791d6bcd9/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..b578fa1
--- /dev/null
+++ b/main.go
@@ -0,0 +1,7 @@
+package main
+
+import "github.com/fengxxc/wechatmp2markdown/parse"
+
+func main() {
+	parse.Test()
+}
diff --git a/parse/model.go b/parse/model.go
new file mode 100644
index 0000000..56c7d48
--- /dev/null
+++ b/parse/model.go
@@ -0,0 +1,41 @@
+package parse
+
+// Article holds what ParseFromReader extracts from an article page: its title,
+// meta line, tags and body blocks.
+type Article struct {
+	title   string
+	meta    string
+	tags    string
+	content []Block
+}
+
+// Block is one paragraph-level element of the article body, stored as the
+// tokens produced from its child elements.
+type Block struct {
+	tokens []Token
+}
+
+// Token is one inline piece of a Block: its kind, its text, and any attributes
+// (such as "href" or "src") that go with it.
+type Token struct {
+	ttype TokenType
+	text  string
+	attrs map[string]string
+}
+
+// TokenType identifies the kind of content a Token holds.
+type TokenType int32
+
+// Token kinds.
+const (
+	TITLE       TokenType = iota // heading
+	LINK                         // hyperlink
+	NORMAL_TEXT                  // plain text
+	STRONG_TEXT                  // strong (emphasised) text
+	ITALIC_TEXT                  // italic text
+	IMAGE                        // image
+	TABLE                        // table
+	CODE_INLINE                  // inline code
+	CODE_BLOCK                   // code block
+	CITE                         // quotation
+)
diff --git a/parse/parse.go b/parse/parse.go
new file mode 100644
index 0000000..9bb0ee2
--- /dev/null
+++ b/parse/parse.go
@@ -0,0 +1,136 @@
+package parse
+
+import (
+	"fmt"
+	"io"
+	"log"
+	"regexp"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// multiSpace matches a run of two or more whitespace characters.
+var multiSpace = regexp.MustCompile(`\s{2,}`)
+
+// parseSection converts the element children of s into tokens, in document
+// order, and returns them as one Block.
+//
+// <a> children become LINK tokens carrying an "href" attribute, <img> children
+// become IMAGE tokens carrying a "src" attribute, and every other child
+// (including <span>) becomes a NORMAL_TEXT token holding the child's text.
+// Text placed directly inside s, outside any child element, is not captured.
+func parseSection(s *goquery.Selection) Block {
+	var tokens []Token
+	s.Children().Each(func(_ int, child *goquery.Selection) {
+		switch {
+		case child.Is("a"):
+			href, _ := child.Attr("href") // a missing href is stored as ""
+			tokens = append(tokens, Token{LINK, removeBrAndBlank(child.Text()), map[string]string{"href": href}})
+		case child.Is("img"):
+			src, _ := child.Attr("src") // a missing src is stored as ""
+			tokens = append(tokens, Token{IMAGE, "", map[string]string{"src": src}})
+		default:
+			// <span>, plus every element that has no dedicated handling yet.
+			tokens = append(tokens, Token{NORMAL_TEXT, child.Text(), nil})
+		}
+	})
+	return Block{tokens}
+}
+
+// ParseFromReader parses the HTML of an article page read from r.
+//
+// It reads the title (#activity-name), meta line (#meta_content) and tags
+// (#js_tags) from inside #img-content, then turns every <section> and <p>
+// found under #js_content into a Block. Each extracted value is also printed
+// to standard output.
+//
+// If the document cannot be read or parsed, the process is terminated through
+// log.Fatal.
+func ParseFromReader(r io.Reader) Article {
+	doc, err := goquery.NewDocumentFromReader(r)
+	if err != nil {
+		log.Fatal(err)
+	}
+	mainContent := doc.Find("#img-content")
+
+	var article Article
+
+	// Title.
+	article.title = mainContent.Find("#activity-name").Text()
+	fmt.Println(article.title)
+
+	// Meta line; finer-grained extraction is still to be done.
+	article.meta = removeBrAndBlank(mainContent.Find("#meta_content").Text())
+	fmt.Println(article.meta)
+
+	// Tags; finer-grained extraction is still to be done.
+	article.tags = removeBrAndBlank(mainContent.Find("#js_tags").Text())
+	fmt.Println(article.tags)
+
+	// Body. Shapes noted so far:
+	//   section[style="line-height: 1.5em;"] > span, a  => ordinary paragraph (text and links)
+	//   p[style="line-height: 1.5em;"]                  => list item (ordered/unordered)
+	//   section[style=".*text-align:center"] > img      => centred paragraph (image)
+	// The selector only ever yields <section> or <p>, and both are tokenised
+	// the same way for now.
+	mainContent.Find("#js_content").Find("section,p").Each(func(_ int, s *goquery.Selection) {
+		fmt.Println(s.Text())
+		article.content = append(article.content, parseSection(s))
+	})
+
+	return article
+}
+
+// ParseFromHTMLString parses the article page held in the string s.
+func ParseFromHTMLString(s string) Article {
+	return ParseFromReader(strings.NewReader(s))
+}
+
+// removeBrAndBlank collapses every run of two or more whitespace characters
+// in s to the first character of that run, then replaces each remaining
+// newline with a space.
+func removeBrAndBlank(s string) string {
+	collapsed := multiSpace.ReplaceAllStringFunc(s, func(run string) string {
+		return run[:1] // \s only matches single-byte ASCII whitespace
+	})
+	return strings.ReplaceAll(collapsed, "\n", " ")
+}
+
+// NOTE(review): as it stands this fixture holds no element markup (none of
+// #img-content, #activity-name, #js_content, ...), so Test yields an empty
+// Article; presumably the original HTML was lost and must be restored — confirm.
+var testHTML = `
+
+
+

终于有人喷我了!

+
+ + + 闪客sun + + + + 低并发编程 + + + 2021-11-26
+ +
+

写公众号一年了,一直盼着能有人喷喷我,今天终于被我碰到了!

+
‍‍
+
还是我的一位读者发现的,在推特上,于是分享给了我。
+
图片
+
+ 这是喷我最近的一个新系列, + + 你管这破玩意叫操作系统源码 + + ,正愁找不到借口推广一波呢,这不就给我来素材了。
+
+
+
+
+`
+
+// Test parses the built-in fixture and prints the resulting Article.
+func Test() {
+	res := ParseFromHTMLString(testHTML)
+	fmt.Println("---------------------------------------")
+	fmt.Printf("%+v\n", res)
+}