fix: 列表项段落、引用段落换行的bug

This commit is contained in:
fengxxc
2023-08-27 11:31:04 +08:00
parent aa0e847749
commit c266359a76
4 changed files with 68 additions and 21 deletions

View File

@@ -1,6 +1,6 @@
BUILD_ENV := CGO_ENABLED=0 BUILD_ENV := CGO_ENABLED=0
APP=wechatmp2markdown APP=wechatmp2markdown
VERSION=v1.1.3 VERSION=v1.1.4
# linux or mac 环境编译 # linux or mac 环境编译
# make [cmd] # make [cmd]

View File

@@ -189,6 +189,8 @@ func formatContent(pieces []parse.Piece, depth int) (string, map[string][]byte)
// TODO // TODO
case parse.BR: case parse.BR:
pieceMdStr = " \n" pieceMdStr = " \n"
case parse.NULL:
continue
} }
contentMdStr += pieceMdStr contentMdStr += pieceMdStr
util.MergeMap(saveImageBytes, patchSaveImageBytes) util.MergeMap(saveImageBytes, patchSaveImageBytes)
@@ -240,25 +242,25 @@ func formatCodeBlock(piece parse.Piece) string {
// 图片地址为本身src // 图片地址为本身src
func formatImageInline(piece parse.Piece) string { func formatImageInline(piece parse.Piece) string {
return "![" + piece.Attrs["alt"] + "](" + piece.Attrs["src"] + " \"" + piece.Attrs["title"] + "\")" return "![" + piece.Attrs["alt"] + "](" + piece.Attrs["src"] + " \"" + piece.Attrs["title"] + "\") \n"
} }
// 图片地址为本地引用 // 图片地址为本地引用
func formatImageFileReferInline(alt string, refName string) string { func formatImageFileReferInline(alt string, refName string) string {
return "![" + alt + "](" + refName + ")" return "![" + alt + "](" + refName + ") \n"
} }
// 图片转成base64并插在原地 // 图片转成base64并插在原地
func formatImageBase64Inline(piece parse.Piece) string { func formatImageBase64Inline(piece parse.Piece) string {
return "![" + piece.Attrs["alt"] + "](data:image/png;base64," + piece.Val.(string) + ")" return "![" + piece.Attrs["alt"] + "](data:image/png;base64," + piece.Val.(string) + ") \n"
} }
// 图片地址为markdown内引用用于base64 // 图片地址为markdown内引用用于base64
func formatImageRefer(piece parse.Piece, index int) string { func formatImageRefer(piece parse.Piece, index int) string {
return "![" + piece.Attrs["alt"] + "][" + strconv.Itoa(index) + "]" return "![" + piece.Attrs["alt"] + "][" + strconv.Itoa(index) + "] \n"
} }
func formatLink(piece parse.Piece) string { func formatLink(piece parse.Piece) string {
var linkMdStr string = "[" + piece.Val.(string) + "](" + piece.Attrs["href"] + ")" var linkMdStr string = "[" + piece.Val.(string) + "](" + piece.Attrs["href"] + ") \n"
return linkMdStr return linkMdStr
} }

View File

@@ -1,5 +1,10 @@
package parse package parse
import (
"strconv"
"strings"
)
type Article struct { type Article struct {
Title Piece Title Piece
Meta []string Meta []string
@@ -7,6 +12,32 @@ type Article struct {
Content []Piece Content []Piece
} }
// ToString returns a debug listing of the article's Content pieces by
// delegating to the package-level ToString.
func (article Article) ToString() string {
	content := article.Content
	return ToString(content)
}
// ToString renders pieces as a flat, human-readable debug listing.
//
// Every piece whose Val is not a []Piece yields one line of the form
// "type: <numeric Type>, value: <val>\n". A string Val is shown shortened to
// at most 90 bytes; any other non-slice Val (including nil) is shown as
// "[null]". A piece whose Val is a []Piece produces no line of its own: its
// children are rendered recursively in its place.
func ToString(pieces []Piece) string {
	// Upper bound, in bytes, on how much of a string value is printed.
	const maxValueLen = 90

	var sb strings.Builder
	for _, p := range pieces {
		val := "[null]"
		switch v := p.Val.(type) {
		case []Piece:
			// Nested pieces are flattened into the same listing.
			sb.WriteString(ToString(v))
			continue
		case string:
			val = truncateOnRuneBoundary(v, maxValueLen)
		}
		sb.WriteString("type: " + strconv.Itoa(int(p.Type)) + ", value: " + val + "\n")
	}
	return sb.String()
}

// truncateOnRuneBoundary returns s unchanged when it is at most maxLen bytes
// long; otherwise it returns the longest prefix of s that is at most maxLen
// bytes and does not end in the middle of a multi-byte UTF-8 character.
func truncateOnRuneBoundary(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	cut := 0
	// Ranging over a string yields the byte offset at which each rune starts,
	// so the last offset not exceeding maxLen is a safe place to cut.
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	return s[:cut]
}
type Header struct { type Header struct {
Level int Level int
Text string Text string
@@ -39,4 +70,5 @@ const (
U_LIST // 13 无序列表 U_LIST // 13 无序列表
HR // 14 分隔线 HR // 14 分隔线
BR // 15 换行 BR // 15 换行
NULL // 无
) )

View File

@@ -15,9 +15,14 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
) )
func parseSection(s *goquery.Selection, imagePolicy ImagePolicy) []Piece { func parseSection(s *goquery.Selection, imagePolicy ImagePolicy, lastPieceType PieceType) []Piece {
var pieces []Piece var pieces []Piece
pieces = append(pieces, Piece{BR, nil, nil}) if lastPieceType == O_LIST || lastPieceType == U_LIST || lastPieceType == NULL || lastPieceType == BLOCK_QUOTES {
// pieces = append(pieces, Piece{NULL, nil, nil})
} else {
pieces = append(pieces, Piece{BR, nil, nil})
}
var _lastPieceType PieceType = NULL
s.Contents().Each(func(i int, sc *goquery.Selection) { s.Contents().Each(func(i int, sc *goquery.Selection) {
attr := make(map[string]string) attr := make(map[string]string)
if sc.Is("a") { if sc.Is("a") {
@@ -29,15 +34,15 @@ func parseSection(s *goquery.Selection, imagePolicy ImagePolicy) []Piece {
attr["title"], _ = sc.Attr("title") attr["title"], _ = sc.Attr("title")
switch imagePolicy { switch imagePolicy {
case IMAGE_POLICY_URL: case IMAGE_POLICY_URL:
pieces = append(pieces, Piece{IMAGE, nil, attr}, Piece{BR, nil, nil}) pieces = append(pieces, Piece{IMAGE, nil, attr})
case IMAGE_POLICY_SAVE: case IMAGE_POLICY_SAVE:
image := fetchImgFile(attr["src"]) image := fetchImgFile(attr["src"])
pieces = append(pieces, Piece{IMAGE, image, attr}, Piece{BR, nil, nil}) pieces = append(pieces, Piece{IMAGE, image, attr})
case IMAGE_POLICY_BASE64: case IMAGE_POLICY_BASE64:
fallthrough fallthrough
default: default:
base64Image := img2base64(fetchImgFile(attr["src"])) base64Image := img2base64(fetchImgFile(attr["src"]))
pieces = append(pieces, Piece{IMAGE_BASE64, base64Image, attr}, Piece{BR, nil, nil}) pieces = append(pieces, Piece{IMAGE_BASE64, base64Image, attr})
} }
} else if sc.Is("ol") { } else if sc.Is("ol") {
pieces = append(pieces, parseList(sc, O_LIST, imagePolicy)...) pieces = append(pieces, parseList(sc, O_LIST, imagePolicy)...)
@@ -46,8 +51,13 @@ func parseSection(s *goquery.Selection, imagePolicy ImagePolicy) []Piece {
} else if sc.Is("pre") || sc.Is("section.code-snippet__fix") { } else if sc.Is("pre") || sc.Is("section.code-snippet__fix") {
// 代码块 // 代码块
pieces = append(pieces, parsePre(sc)...) pieces = append(pieces, parsePre(sc)...)
} else if sc.Is("p") || sc.Is("section") || sc.Is("span") { } else if sc.Is("span") {
pieces = append(pieces, parseSection(sc, imagePolicy)...) pieces = append(pieces, parseSection(sc, imagePolicy, _lastPieceType)...)
} else if sc.Is("p") || sc.Is("section") {
pieces = append(pieces, parseSection(sc, imagePolicy, _lastPieceType)...)
if removeBrAndBlank(sc.Text()) != "" && len(pieces) > 0 && pieces[len(pieces)-1].Type != BR {
pieces = append(pieces, Piece{BR, nil, nil})
}
} else if sc.Is("h1") || sc.Is("h2") || sc.Is("h3") || sc.Is("h4") || sc.Is("h5") || sc.Is("h6") { } else if sc.Is("h1") || sc.Is("h2") || sc.Is("h3") || sc.Is("h4") || sc.Is("h5") || sc.Is("h6") {
pieces = append(pieces, parseHeader(sc)...) pieces = append(pieces, parseHeader(sc)...)
} else if sc.Is("blockquote") { } else if sc.Is("blockquote") {
@@ -55,10 +65,14 @@ func parseSection(s *goquery.Selection, imagePolicy ImagePolicy) []Piece {
} else if sc.Is("strong") { } else if sc.Is("strong") {
pieces = append(pieces, parseStrong(sc)...) pieces = append(pieces, parseStrong(sc)...)
} else { } else {
pieces = append(pieces, Piece{NORMAL_TEXT, sc.Text(), nil}) if sc.Text() != "" {
pieces = append(pieces, Piece{NORMAL_TEXT, sc.Text(), nil})
}
}
if len(pieces) > 0 {
_lastPieceType = pieces[len(pieces)-1].Type
} }
}) })
pieces = append(pieces, Piece{BR, nil, nil})
return pieces return pieces
} }
@@ -80,7 +94,7 @@ func parseHeader(s *goquery.Selection) []Piece {
} }
attr := map[string]string{"level": strconv.Itoa(level)} attr := map[string]string{"level": strconv.Itoa(level)}
p := Piece{HEADER, removeBrAndBlank(s.Text()), attr} p := Piece{HEADER, removeBrAndBlank(s.Text()), attr}
return []Piece{p, {BR, nil, nil}} return []Piece{p}
} }
func parsePre(s *goquery.Selection) []Piece { func parsePre(s *goquery.Selection) []Piece {
@@ -89,22 +103,21 @@ func parsePre(s *goquery.Selection) []Piece {
codeRows = append(codeRows, sc.Text()) codeRows = append(codeRows, sc.Text())
}) })
p := Piece{CODE_BLOCK, codeRows, nil} p := Piece{CODE_BLOCK, codeRows, nil}
return []Piece{p, {BR, nil, nil}} return []Piece{p}
} }
func parseList(s *goquery.Selection, ptype PieceType, imagePolicy ImagePolicy) []Piece { func parseList(s *goquery.Selection, ptype PieceType, imagePolicy ImagePolicy) []Piece {
var list []Piece var list []Piece
s.Find("li").Each(func(i int, sc *goquery.Selection) { s.Find("li").Each(func(i int, sc *goquery.Selection) {
list = append(list, Piece{ptype, parseSection(sc, imagePolicy), nil}) list = append(list, Piece{ptype, parseSection(sc, imagePolicy, ptype), nil})
}) })
list = append(list, Piece{BR, nil, nil})
return list return list
} }
func parseBlockQuote(s *goquery.Selection, imagePolicy ImagePolicy) []Piece { func parseBlockQuote(s *goquery.Selection, imagePolicy ImagePolicy) []Piece {
var bq []Piece var bq []Piece
s.Contents().Each(func(i int, sc *goquery.Selection) { s.Contents().Each(func(i int, sc *goquery.Selection) {
bq = append(bq, Piece{BLOCK_QUOTES, parseSection(sc, imagePolicy), nil}) bq = append(bq, Piece{BLOCK_QUOTES, parseSection(sc, imagePolicy, BLOCK_QUOTES), nil})
}) })
bq = append(bq, Piece{BR, nil, nil}) bq = append(bq, Piece{BR, nil, nil})
return bq return bq
@@ -170,7 +183,7 @@ func ParseFromReader(r io.Reader, imagePolicy ImagePolicy) Article {
// p[style="line-height: 1.5em;"] => 项目列表(有序/无序) // p[style="line-height: 1.5em;"] => 项目列表(有序/无序)
// section[style=".*text-align:center"]>img => 居中段落(图片) // section[style=".*text-align:center"]>img => 居中段落(图片)
content := mainContent.Find("#js_content") content := mainContent.Find("#js_content")
pieces := parseSection(content, imagePolicy) pieces := parseSection(content, imagePolicy, NULL)
article.Content = pieces article.Content = pieces
return article return article