feat: 标题、链接、图片、文本(加粗、倾斜)、代码块的解析和格式化

This commit is contained in:
fengxxc
2021-11-29 19:11:39 +08:00
parent ab6c8488d5
commit 79605c2817
5 changed files with 274 additions and 49 deletions

94
format/format.go Normal file
View File

@@ -0,0 +1,94 @@
package format
import (
	"strconv"
	"strings"

	"github.com/fengxxc/wechatmp2markdown/parse"
)
// Format renders a parsed article as Markdown text. The result is the title,
// the meta line, the tags line and the body content, concatenated in that
// order.
func Format(article parse.Article) string {
	title := formatTitle(article.Title)
	meta := formatMeta(article.Meta)
	tags := formatTags(article.Tags)
	body := formatContent(article.Content)
	return title + meta + tags + body
}
// formatTitle renders a HEADER piece as an ATX-style Markdown heading,
// terminated by a space and a newline.
//
// The heading depth is read from the piece's "level" attribute. A missing,
// non-numeric or negative level yields no leading '#' characters. The heading
// text is expected in piece.Val as a string; any other dynamic type makes the
// type assertion panic.
func formatTitle(piece parse.Piece) string {
	level, err := strconv.Atoi(piece.Attrs["level"])
	if err != nil || level < 0 {
		// Fall back to an empty marker rather than failing the whole document.
		level = 0
	}
	// An ATX heading has to sit on a single line, so whitespace and line
	// breaks surrounding the text are stripped before it follows the marker.
	text := strings.TrimSpace(piece.Val.(string))
	return strings.Repeat("#", level) + " " + text + " \n"
}
// formatMeta renders the article's meta line: the text is passed through
// unchanged and terminated by a space and a newline.
// TODO: meta formatting is still a placeholder.
func formatMeta(meta string) string {
	const lineEnd = " \n"
	return meta + lineEnd
}
// formatTags renders the article's tags line: the text is passed through
// unchanged and terminated by a space and a newline.
// TODO: tag formatting is still a placeholder.
func formatTags(tags string) string {
	const lineEnd = " \n"
	return tags + lineEnd
}
// formatContent renders the article body as Markdown.
//
// The pieces of each paragraph are rendered in order and joined without a
// separator; every paragraph is then terminated by a space and a newline.
// Piece types that have no renderer yet (tables, inline code, block quotes,
// lists and horizontal rules) contribute no output. Text-like pieces are
// expected to carry a string in Val; any other dynamic type makes the type
// assertion panic.
func formatContent(blocks []parse.Paragraph) string {
	// A strings.Builder keeps the cost linear in the size of the output;
	// string += inside the nested loops would recopy everything written so far.
	var md strings.Builder
	for _, block := range blocks {
		for _, piece := range block.Pieces {
			switch piece.Type {
			case parse.HEADER:
				md.WriteString(formatTitle(piece))
			case parse.LINK:
				md.WriteString(formatLink(piece))
			case parse.NORMAL_TEXT:
				md.WriteString(piece.Val.(string))
			case parse.BOLD_TEXT:
				md.WriteString("**" + piece.Val.(string) + "**")
			case parse.ITALIC_TEXT:
				md.WriteString("*" + piece.Val.(string) + "*")
			case parse.BOLD_ITALIC_TEXT:
				md.WriteString("***" + piece.Val.(string) + "***")
			case parse.IMAGE:
				md.WriteString(formatImage(piece))
			case parse.CODE_BLOCK:
				md.WriteString(formatCodeBlock(piece))
			case parse.TABLE, parse.CODE_INLINE, parse.BLOCK_QUOTES,
				parse.O_LIST, parse.U_LIST, parse.HR:
				// TODO: not rendered yet; these pieces produce no output.
			}
		}
		md.WriteString(" \n")
	}
	return md.String()
}
// formatCodeBlock renders a CODE_BLOCK piece as a fenced Markdown code block.
//
// piece.Val is expected to hold the code as a []string with one entry per
// row; any other dynamic type makes the type assertion panic. Each row is
// written on its own line between the opening and closing fences. Rows are
// inserted verbatim, so a row that itself contains a fence is not escaped.
func formatCodeBlock(piece parse.Piece) string {
	// Builder instead of string += so long listings are assembled in one pass.
	var md strings.Builder
	md.WriteString("```\n")
	for _, row := range piece.Val.([]string) {
		md.WriteString(row)
		md.WriteString("\n")
	}
	md.WriteString("``` \n")
	return md.String()
}
// formatImage renders an IMAGE piece as a Markdown image, using the "alt"
// attribute as alternative text, "src" as destination and "title" as the
// quoted title. Attribute values are inserted verbatim, and the title quotes
// are always written, even when the title is empty.
func formatImage(piece parse.Piece) string {
	alt := piece.Attrs["alt"]
	src := piece.Attrs["src"]
	title := piece.Attrs["title"]
	return "![" + alt + "](" + src + " \"" + title + "\")"
}
// formatLink renders a LINK piece as an inline Markdown link. The link text
// comes from piece.Val, which must hold a string (the type assertion panics
// otherwise), and the destination from the "href" attribute. Both are
// inserted verbatim.
func formatLink(piece parse.Piece) string {
	text := piece.Val.(string)
	href := piece.Attrs["href"]
	return "[" + text + "](" + href + ")"
}

View File

@@ -1,33 +1,45 @@
package parse
type Article struct {
title string
meta string
tags string
content []Block
Title Piece
Meta string
Tags string
Content []Paragraph
}
type Block struct {
tokens []Token
type Header struct {
Level int
Text string
}
type Token struct {
ttype TokenType
text string
attrs map[string]string
type Paragraph struct {
Pieces []Piece
}
type TokenType int32
// Go has no generics here, so Value is an empty interface and readers of Piece.Val must type-assert it.
type Value interface{}
type Piece struct {
Type PieceType
Val Value
Attrs map[string]string
}
type PieceType int32
const (
TITLE TokenType = iota // 标题
LINK // 链接
NORMAL_TEXT // 文字
STRONG_TEXT // 强调文字
ITALIC_TEXT // 斜体文字
IMAGE // 图片
TABLE // 表格
CODE_INLINE // 代码 内联
CODE_BLOCK // 代码
CITE // 引用
HEADER PieceType = iota // 标题
LINK // 链接
NORMAL_TEXT // 文字
BOLD_TEXT // 粗体文字
ITALIC_TEXT // 斜体文字
BOLD_ITALIC_TEXT // 粗斜体
IMAGE // 图片
TABLE // 表格
CODE_INLINE // 代码 内联
CODE_BLOCK // 代码 块
BLOCK_QUOTES // 引用
O_LIST // 有序列表
U_LIST // 无序列表
HR // 分隔线
)

View File

@@ -8,37 +8,75 @@ import (
"log"
"os"
"regexp"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
)
func parseSection(s *goquery.Selection) Block {
func parseSection(s *goquery.Selection) Paragraph {
// fmt.Printf("s.Length() = %d\n", s.Length())
// fmt.Printf("s.Size() = %d\n", s.Size())
// var tokens = make([]Token, s.Size())
var tokens []Token
// var piece = make([]Token, s.Size())
var piece []Piece
s.Children().Each(func(i int, s *goquery.Selection) {
var t Token
var p Piece
attr := make(map[string]string)
if s.Is("span") {
t = Token{NORMAL_TEXT, s.Text(), nil}
p = Piece{NORMAL_TEXT, s.Text(), nil}
} else if s.Is("a") {
attr["href"], _ = s.Attr("href")
t = Token{LINK, removeBrAndBlank(s.Text()), attr}
p = Piece{LINK, removeBrAndBlank(s.Text()), attr}
} else if s.Is("img") {
attr["src"], _ = s.Attr("src")
t = Token{IMAGE, "", attr}
attr["alt"], _ = s.Attr("alt")
attr["title"], _ = s.Attr("title")
p = Piece{IMAGE, "", attr}
} else if s.Is("ol") {
// TODO
} else if s.Is("ul") {
// TODO
} else {
t = Token{NORMAL_TEXT, s.Text(), nil}
p = Piece{NORMAL_TEXT, s.Text(), nil}
// TODO
}
// fmt.Printf("i = %d\n", i)
// fmt.Printf("%+v\n", t)
// tokens[i] = t
tokens = append(tokens, t)
piece = append(piece, p)
})
return Block{tokens}
return Paragraph{piece}
}
// parseHeader converts a heading element into a paragraph that holds a single
// HEADER piece.
//
// The level is chosen by the first of h1..h6 that the selection matches and
// is stored as a decimal string under the piece's "level" attribute; it is 0
// when the selection matches none of them. The heading text is the
// selection's text passed through removeBrAndBlank.
func parseHeader(s *goquery.Selection) Paragraph {
	level := 0
	for i, tag := range [...]string{"h1", "h2", "h3", "h4", "h5", "h6"} {
		if s.Is(tag) {
			level = i + 1
			break
		}
	}
	attr := map[string]string{"level": strconv.Itoa(level)}
	header := Piece{HEADER, removeBrAndBlank(s.Text()), attr}
	return Paragraph{[]Piece{header}}
}
// parsePre converts a code container into a paragraph that holds a single
// CODE_BLOCK piece. The piece's value is a []string with the text of every
// <code> descendant of the selection, in document order; it is a nil slice
// when there are none. No attributes are set on the piece.
func parsePre(s *goquery.Selection) Paragraph {
	var rows []string
	codes := s.Find("code")
	for i := range codes.Nodes {
		rows = append(rows, codes.Eq(i).Text())
	}
	return Paragraph{[]Piece{{CODE_BLOCK, rows, nil}}}
}
func ParseFromReader(r io.Reader) Article {
@@ -52,41 +90,47 @@ func ParseFromReader(r io.Reader) Article {
// 标题
title := mainContent.Find("#activity-name").Text()
fmt.Println(title)
article.title = title
attr := map[string]string{"level": "1"}
article.Title = Piece{HEADER, title, attr}
// meta 细节待完善
meta := mainContent.Find("#meta_content").Text()
meta = removeBrAndBlank(meta)
fmt.Println(meta)
article.meta = meta
article.Meta = meta
// tags 细节待完善
tags := mainContent.Find("#js_tags").Text()
tags = removeBrAndBlank(tags)
fmt.Println(tags)
article.tags = tags
article.Tags = tags
// content
// section[style="line-height: 1.5em;"]>span,a => 一般段落(含文本和超链接)
// p[style="line-height: 1.5em;"] => 项目列表(有序/无序)
// section[style=".*text-align:center"]>img => 居中段落(图片)
content := mainContent.Find("#js_content")
var sections []Block
content.Find("section,p").Each(func(i int, s *goquery.Selection) {
var sections []Paragraph
content.Children().Each(func(i int, s *goquery.Selection) {
fmt.Println(s.Text())
// fmt.Println(s.Attr("style"))
var block Block
if s.Is("p") {
block = parseSection(s)
} else if s.Is("section") {
block = parseSection(s)
} else {
var paragraph Paragraph
if s.Is("pre") || s.Is("section.code-snippet__fix") {
// 代码块
paragraph = parsePre(s)
} else if s.Is("p") || s.Is("section") {
paragraph = parseSection(s)
} else if s.Is("h1") || s.Is("h2") || s.Is("h3") || s.Is("h4") || s.Is("h5") || s.Is("h6") {
paragraph = parseHeader(s)
} else if s.Is("ol") {
// TODO
} else if s.Is("ul") {
// TODO
}
// sections[i] = block
sections = append(sections, block)
sections = append(sections, paragraph)
})
article.content = sections
article.Content = sections
return article
}

View File

@@ -2,12 +2,19 @@ package test
import (
"fmt"
"io/ioutil"
"github.com/fengxxc/wechatmp2markdown/format"
"github.com/fengxxc/wechatmp2markdown/parse"
)
func Test1() {
res := parse.ParseFromHTMLFile("./test/test1.html")
fmt.Println("-------------------test1.html-------------------")
fmt.Printf("%+v\n", res)
var articleStruct parse.Article = parse.ParseFromHTMLFile("./test/test1.html")
fmt.Println("-------------------test1.html parse-------------------")
fmt.Printf("%+v\n", articleStruct)
fmt.Println("-------------------test1.html format-------------------")
var mdString string = format.Format(articleStruct)
fmt.Print(mdString)
ioutil.WriteFile("./test/test1_target.md", []byte(mdString), 0644)
}

View File

@@ -53,14 +53,82 @@
<p data-mpa-powered-by="yiban.io"><span style="font-size: 16px;letter-spacing: 0.5px;background-color: transparent;caret-color: var(--weui-BRAND);">正文第一行</span><br></p>
<section style="line-height: 1.5em;"><span style="letter-spacing: 0.5px;font-size: 16px;"></span></section>
<section style="line-height: 1.5em;"><span style="letter-spacing: 0.5px;font-size: 16px;">正文第二行。<br></span></section>
<section style="line-height: 1.5em;text-align: center;"><img class="rich_pages wxw-img" data-galleryid="" data-ratio="1.0826306913996628" data-s="300,640" data-src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png" data-type="png" data-w="593" style="box-shadow: rgb(210, 210, 210) 0em 0em 0.5em 0px; font-size: 17px; width: 346px !important; height: auto !important; visibility: visible !important;" _width="346px" src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png&amp;tp=webp&amp;wxfrom=5&amp;wx_lazy=1&amp;wx_co=1" crossorigin="anonymous" alt="图片" data-fail="0"></section>
<section style="line-height: 1.5em;text-align: center;"><img class="rich_pages wxw-img" data-galleryid="" data-ratio="1.0826306913996628" data-s="300,640" data-src="<img class="rich_pages wxw-img" data-galleryid="" data-ratio="0.625" data-s="300,640" data-src="https://mmbiz.qpic.cn/mmbiz_jpg/Z6bicxIx5naLuefnXicbRG06t9JMcnk5ZfO77yyhu2Mtv9037eWJnq0H5ALU1jibAUEm21mUsx0cq11B6oGpGE1jQ/640?wx_fmt=jpeg" data-type="jpeg" data-w="1024" style="width: 677px !important; height: auto !important; visibility: visible !important;" _width="677px" src="https://mmbiz.qpic.cn/mmbiz_jpg/Z6bicxIx5naLuefnXicbRG06t9JMcnk5ZfO77yyhu2Mtv9037eWJnq0H5ALU1jibAUEm21mUsx0cq11B6oGpGE1jQ/640?wx_fmt=jpeg&amp;tp=webp&amp;wxfrom=5&amp;wx_lazy=1&amp;wx_co=1" crossorigin="anonymous" alt="图片" data-fail="0">" data-type="png" data-w="593" style="box-shadow: rgb(210, 210, 210) 0em 0em 0.5em 0px; font-size: 17px; width: 346px !important; height: auto !important; visibility: visible !important;" _width="346px" src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png&amp;tp=webp&amp;wxfrom=5&amp;wx_lazy=1&amp;wx_co=1" crossorigin="anonymous" alt="图片" data-fail="0"></section>
<section style="line-height: 1.5em;">
<span style="letter-spacing: 0.5px;font-size: 16px;">正文第三行part1文本</span>
<a target="_blank" href="https://mp.weixin.qq.com/mp/appmsgalbum?__biz=Mzk0MjE3NDE0Ng==&amp;action=getalbum&amp;album_id=2123743679373688834#wechat_redirect" textvalue="你管这破玩意叫操作系统源码" linktype="text" imgurl="" imgdata="null" tab="innerlink" data-linktype="2" style="letter-spacing: 0.5px;font-size: 16px;" wah-hotarea="click">
<a target="_blank" href="https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzIzOTU0NTQ0MA==&action=getalbum&album_id=1391790902901014528#wechat_redirect" textvalue="你管这破玩意叫操作系统源码" linktype="text" imgurl="" imgdata="null" tab="innerlink" data-linktype="2" style="letter-spacing: 0.5px;font-size: 16px;" wah-hotarea="click">
<span style="letter-spacing: 0.5px;font-size: 16px;">正文第三行part2链接</span>
</a>
<span style="letter-spacing: 0.5px;font-size: 16px;">正文第三行part3文本。<br></span>
</section>
<h2 style="line-height: 1.75em;">
<strong><span style="font-size: 15px;color: rgb(255, 106, 0);">&nbsp;JDK层</span></strong>
</h2>
<h3 style="line-height: 1.75em;">
<span style="font-size: 15px;color: rgb(255, 106, 0);">1&nbsp;AbstractQueuedSynchronizer</span>
</h3>
<!-- 代行号的代码块 -->
<section class="code-snippet__fix code-snippet__js">
<ul class="code-snippet__line-index code-snippet__js">
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
</ul>
<pre class="code-snippet__js" data-lang="cs">
<code><span class="code-snippet_outer">代码节选自java.util.concurrent.locks.ReentrantLock.java</span></code>
<code><span class="code-snippet_outer"> /** Synchronizer providing all implementation mechanics */</span></code>
<code><span class="code-snippet_outer"> private final Sync sync;</span></code>
<code><span class="code-snippet_outer"> /**</span></code>
<code><span class="code-snippet_outer"> * Base of synchronization control for this lock. Subclassed</span></code>
<code><span class="code-snippet_outer"> * into fair and nonfair versions below. Uses AQS state to</span></code>
<code><span class="code-snippet_outer"> * represent the number of holds on the lock.</span></code>
<code><span class="code-snippet_outer"> */</span></code>
<code><span class="code-snippet_outer"> abstract static class Sync extends AbstractQueuedSynchronizer {</span></code>
<code><span class="code-snippet_outer">......</span></code>
<code><span class="code-snippet_outer">}</span></code>
<code><span class="code-snippet_outer"><br></span></code>
<code><span class="code-snippet_outer"> public void lock() {</span></code>
<code><span class="code-snippet_outer"> sync.lock();</span></code>
<code><span class="code-snippet_outer"> }</span></code>
</pre>
</section>
<pre>
<section class="code-snippet__fix code-snippet__js">
<ul class="code-snippet__line-index code-snippet__js"><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li></ul>
<pre class="code-snippet__js" data-lang="java">
<code><span class="code-snippet_outer"> /**</span></code>
<code><span class="code-snippet_outer"> * Acquires in exclusive mode, ignoring interrupts. Implemented</span></code>
<code><span class="code-snippet_outer"> * by invoking at least once {@link #tryAcquire},</span></code>
<code><span class="code-snippet_outer"> * returning on success. Otherwise the thread is queued, possibly</span></code>
<code><span class="code-snippet_outer"> * repeatedly blocking and unblocking, invoking {@link</span></code>
<code><span class="code-snippet_outer"> * #tryAcquire} until success. This method can be used</span></code>
<code><span class="code-snippet_outer"> * to implement method {@link Lock#lock}.</span></code>
<code><span class="code-snippet_outer"> *</span></code>
<code><span class="code-snippet_outer"> * @param arg the acquire argument. This value is conveyed to</span></code>
<code><span class="code-snippet_outer"> * {@link #tryAcquire} but is otherwise uninterpreted and</span></code>
<code><span class="code-snippet_outer"> * can represent anything you like.</span></code>
<code><span class="code-snippet_outer"> */</span></code>
<code><span class="code-snippet_outer"> public final void acquire(int arg) {</span></code>
<code><span class="code-snippet_outer"> if (!tryAcquire(arg) &amp;&amp;</span></code>
<code><span class="code-snippet_outer"> acquireQueued(addWaiter(Node.EXCLUSIVE), arg))</span></code>
<code><span class="code-snippet_outer"> selfInterrupt();</span></code>
<code><span class="code-snippet_outer"> }</span></code>
</pre>
</section>
<section style="line-height: 1.75em;"><br></section>
</pre>
</div>
</div>
</div>