From c266359a7646fed7e5b4e4b5ad722cf101bcfb68 Mon Sep 17 00:00:00 2001 From: fengxxc Date: Sun, 27 Aug 2023 11:31:04 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E5=88=97=E8=A1=A8=E9=A1=B9=E6=AE=B5?= =?UTF-8?q?=E8=90=BD=E3=80=81=E5=BC=95=E7=94=A8=E6=AE=B5=E8=90=BD=E6=8D=A2?= =?UTF-8?q?=E8=A1=8C=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- format/format.go | 12 +++++++----- parse/model.go | 32 ++++++++++++++++++++++++++++++++ parse/parse.go | 43 ++++++++++++++++++++++++++++--------------- 4 files changed, 68 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 721b0f0..fc1643a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ BUILD_ENV := CGO_ENABLED=0 APP=wechatmp2markdown -VERSION=v1.1.3 +VERSION=v1.1.4 # linux or mac 环境编译 # make [cmd] diff --git a/format/format.go b/format/format.go index 56ad708..01085b2 100644 --- a/format/format.go +++ b/format/format.go @@ -189,6 +189,8 @@ func formatContent(pieces []parse.Piece, depth int) (string, map[string][]byte) // TODO case parse.BR: pieceMdStr = " \n" + case parse.NULL: + continue } contentMdStr += pieceMdStr util.MergeMap(saveImageBytes, patchSaveImageBytes) @@ -240,25 +242,25 @@ func formatCodeBlock(piece parse.Piece) string { // 图片地址为本身src func formatImageInline(piece parse.Piece) string { - return "![" + piece.Attrs["alt"] + "](" + piece.Attrs["src"] + " \"" + piece.Attrs["title"] + "\")" + return "![" + piece.Attrs["alt"] + "](" + piece.Attrs["src"] + " \"" + piece.Attrs["title"] + "\") \n" } // 图片地址为本地引用 func formatImageFileReferInline(alt string, refName string) string { - return "![" + alt + "](" + refName + ")" + return "![" + alt + "](" + refName + ") \n" } // 图片转成base64并插在原地 func formatImageBase64Inline(piece parse.Piece) string { - return "![" + piece.Attrs["alt"] + "](data:image/png;base64," + piece.Val.(string) + ")" + return "![" + piece.Attrs["alt"] + "](data:image/png;base64," + piece.Val.(string) + ") \n" } // 图片地址为markdown内引用(用于base64) func formatImageRefer(piece parse.Piece, index int) string { - return "![" + piece.Attrs["alt"] + "][" + strconv.Itoa(index) + "]" + return "![" + piece.Attrs["alt"] + "][" + strconv.Itoa(index) + "] \n" } func formatLink(piece parse.Piece) string { - var linkMdStr string = "[" + piece.Val.(string) + "](" + piece.Attrs["href"] + ")" + var linkMdStr string = "[" + piece.Val.(string) + "](" + piece.Attrs["href"] + ") \n" return linkMdStr } diff --git a/parse/model.go b/parse/model.go index 84e36ef..2dd5cdb 100644 --- a/parse/model.go +++ b/parse/model.go @@ -1,5 +1,10 @@ package parse +import ( + "strconv" + "strings" +) + type Article struct { Title Piece Meta []string @@ -7,6 +12,32 @@ type Article struct { Content []Piece } +func (article Article) ToString() string { + return ToString(article.Content) +} + +func ToString(pieces []Piece) string { + var res []string + for _, p := range pieces { + var val = "[null]" + + switch p.Val.(type) { + case string: + val = p.Val.(string) + if len(val) > 90 { + val = val[:90] + } + res = append(res, "type: "+strconv.Itoa(int(p.Type))+", value: "+val+"\n") + case []Piece: + res = append(res, ToString(p.Val.([]Piece))) + default: + res = append(res, "type: "+strconv.Itoa(int(p.Type))+", value: "+val+"\n") + } + // fmt.Printf("%+v %+v\n", p.Type, val) + } + return strings.Join(res, "") +} + type Header struct { Level int Text string @@ -39,4 +70,5 @@ const ( U_LIST // 13 无序列表 HR // 14 分隔线 BR // 15 换行 + NULL // 无 ) diff --git a/parse/parse.go b/parse/parse.go index 8e6f8f0..61df768 100644 --- a/parse/parse.go +++ b/parse/parse.go @@ -15,9 +15,14 @@ import ( "github.com/PuerkitoBio/goquery" ) -func parseSection(s *goquery.Selection, imagePolicy ImagePolicy) []Piece { +func parseSection(s *goquery.Selection, imagePolicy ImagePolicy, lastPieceType PieceType) []Piece { var pieces []Piece - pieces = append(pieces, Piece{BR, nil, nil}) + if lastPieceType == O_LIST || lastPieceType == U_LIST || lastPieceType == NULL || lastPieceType == BLOCK_QUOTES { + // pieces = append(pieces, Piece{NULL, nil, nil}) + } else { + pieces = append(pieces, Piece{BR, nil, nil}) + } + var _lastPieceType PieceType = NULL s.Contents().Each(func(i int, sc *goquery.Selection) { attr := make(map[string]string) if sc.Is("a") { @@ -29,15 +34,15 @@ func parseSection(s *goquery.Selection, imagePolicy ImagePolicy) []Piece { attr["title"], _ = sc.Attr("title") switch imagePolicy { case IMAGE_POLICY_URL: - pieces = append(pieces, Piece{IMAGE, nil, attr}, Piece{BR, nil, nil}) + pieces = append(pieces, Piece{IMAGE, nil, attr}) case IMAGE_POLICY_SAVE: image := fetchImgFile(attr["src"]) - pieces = append(pieces, Piece{IMAGE, image, attr}, Piece{BR, nil, nil}) + pieces = append(pieces, Piece{IMAGE, image, attr}) case IMAGE_POLICY_BASE64: fallthrough default: base64Image := img2base64(fetchImgFile(attr["src"])) - pieces = append(pieces, Piece{IMAGE_BASE64, base64Image, attr}, Piece{BR, nil, nil}) + pieces = append(pieces, Piece{IMAGE_BASE64, base64Image, attr}) } } else if sc.Is("ol") { pieces = append(pieces, parseList(sc, O_LIST, imagePolicy)...) @@ -46,8 +51,13 @@ func parseSection(s *goquery.Selection, imagePolicy ImagePolicy) []Piece { } else if sc.Is("pre") || sc.Is("section.code-snippet__fix") { // 代码块 pieces = append(pieces, parsePre(sc)...) - } else if sc.Is("p") || sc.Is("section") || sc.Is("span") { - pieces = append(pieces, parseSection(sc, imagePolicy)...) + } else if sc.Is("span") { + pieces = append(pieces, parseSection(sc, imagePolicy, _lastPieceType)...) + } else if sc.Is("p") || sc.Is("section") { + pieces = append(pieces, parseSection(sc, imagePolicy, _lastPieceType)...) + if removeBrAndBlank(sc.Text()) != "" && len(pieces) > 0 && pieces[len(pieces)-1].Type != BR { + pieces = append(pieces, Piece{BR, nil, nil}) + } } else if sc.Is("h1") || sc.Is("h2") || sc.Is("h3") || sc.Is("h4") || sc.Is("h5") || sc.Is("h6") { pieces = append(pieces, parseHeader(sc)...) } else if sc.Is("blockquote") { @@ -55,10 +65,14 @@ func parseSection(s *goquery.Selection, imagePolicy ImagePolicy) []Piece { } else if sc.Is("strong") { pieces = append(pieces, parseStrong(sc)...) } else { - pieces = append(pieces, Piece{NORMAL_TEXT, sc.Text(), nil}) + if sc.Text() != "" { + pieces = append(pieces, Piece{NORMAL_TEXT, sc.Text(), nil}) + } + } + if len(pieces) > 0 { + _lastPieceType = pieces[len(pieces)-1].Type } }) - pieces = append(pieces, Piece{BR, nil, nil}) return pieces } @@ -80,7 +94,7 @@ func parseHeader(s *goquery.Selection) []Piece { } attr := map[string]string{"level": strconv.Itoa(level)} p := Piece{HEADER, removeBrAndBlank(s.Text()), attr} - return []Piece{p, {BR, nil, nil}} + return []Piece{p} } func parsePre(s *goquery.Selection) []Piece { @@ -89,22 +103,21 @@ func parsePre(s *goquery.Selection) []Piece { codeRows = append(codeRows, sc.Text()) }) p := Piece{CODE_BLOCK, codeRows, nil} - return []Piece{p, {BR, nil, nil}} + return []Piece{p} } func parseList(s *goquery.Selection, ptype PieceType, imagePolicy ImagePolicy) []Piece { var list []Piece s.Find("li").Each(func(i int, sc *goquery.Selection) { - list = append(list, Piece{ptype, parseSection(sc, imagePolicy), nil}) + list = append(list, Piece{ptype, parseSection(sc, imagePolicy, ptype), nil}) }) - list = append(list, Piece{BR, nil, nil}) return list } func parseBlockQuote(s *goquery.Selection, imagePolicy ImagePolicy) []Piece { var bq []Piece s.Contents().Each(func(i int, sc *goquery.Selection) { - bq = append(bq, Piece{BLOCK_QUOTES, parseSection(sc, imagePolicy), nil}) + bq = append(bq, Piece{BLOCK_QUOTES, parseSection(sc, imagePolicy, BLOCK_QUOTES), nil}) }) bq = append(bq, Piece{BR, nil, nil}) return bq @@ -170,7 +183,7 @@ func ParseFromReader(r io.Reader, imagePolicy ImagePolicy) Article { // p[style="line-height: 1.5em;"] => 项目列表(有序/无序) // section[style=".*text-align:center"]>img => 居中段落(图片) content := mainContent.Find("#js_content") - pieces := parseSection(content, imagePolicy) + pieces := parseSection(content, imagePolicy, NULL) article.Content = pieces return article