mirror of
https://github.com/fengxxc/wechatmp2markdown.git
synced 2026-03-02 10:40:47 +08:00
feat: 标题、链接、图片、文本(加粗、倾斜)、代码块的解析和格式化
This commit is contained in:
94
format/format.go
Normal file
94
format/format.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package format
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
|
||||
"github.com/fengxxc/wechatmp2markdown/parse"
|
||||
)
|
||||
|
||||
func Format(article parse.Article) string {
|
||||
var result string
|
||||
var titleMdStr string = formatTitle(article.Title)
|
||||
result += titleMdStr
|
||||
var metaMdStr string = formatMeta(article.Meta)
|
||||
result += metaMdStr
|
||||
var tagsMdStr string = formatTags(article.Tags)
|
||||
result += tagsMdStr
|
||||
var content string = formatContent(article.Content)
|
||||
result += content
|
||||
return result
|
||||
}
|
||||
|
||||
func formatTitle(piece parse.Piece) string {
|
||||
var prefix string
|
||||
level, _ := strconv.Atoi(piece.Attrs["level"])
|
||||
for i := 0; i < level; i++ {
|
||||
prefix += "#"
|
||||
}
|
||||
return prefix + " " + piece.Val.(string) + " \n"
|
||||
}
|
||||
|
||||
// formatMeta renders the article's meta text as a single output line.
//
// For now the text is passed through unchanged and only the line terminator
// is appended.
func formatMeta(meta string) string {
	// TODO: replace the pass-through with real meta formatting.
	const lineEnd = " \n"
	return meta + lineEnd
}
|
||||
|
||||
// formatTags renders the article's tags text as a single output line.
//
// For now the text is passed through unchanged and only the line terminator
// is appended.
func formatTags(tags string) string {
	// TODO: replace the pass-through with real tag formatting.
	const lineEnd = " \n"
	return tags + lineEnd
}
|
||||
|
||||
func formatContent(blocks []parse.Paragraph) string {
|
||||
var contentMdStr string
|
||||
for _, block := range blocks {
|
||||
for _, piece := range block.Pieces {
|
||||
var pieceMdStr string
|
||||
switch piece.Type {
|
||||
case parse.HEADER:
|
||||
pieceMdStr = formatTitle(piece)
|
||||
case parse.LINK:
|
||||
pieceMdStr = formatLink(piece)
|
||||
case parse.NORMAL_TEXT:
|
||||
pieceMdStr = piece.Val.(string)
|
||||
case parse.BOLD_TEXT:
|
||||
pieceMdStr = "**" + piece.Val.(string) + "**"
|
||||
case parse.ITALIC_TEXT:
|
||||
pieceMdStr = "*" + piece.Val.(string) + "*"
|
||||
case parse.BOLD_ITALIC_TEXT:
|
||||
pieceMdStr = "***" + piece.Val.(string) + "***"
|
||||
case parse.IMAGE:
|
||||
pieceMdStr = formatImage(piece)
|
||||
case parse.TABLE:
|
||||
// TODO
|
||||
case parse.CODE_INLINE:
|
||||
case parse.CODE_BLOCK:
|
||||
pieceMdStr = formatCodeBlock(piece)
|
||||
case parse.BLOCK_QUOTES:
|
||||
case parse.O_LIST:
|
||||
case parse.U_LIST:
|
||||
case parse.HR:
|
||||
}
|
||||
contentMdStr += pieceMdStr
|
||||
}
|
||||
contentMdStr += " \n"
|
||||
}
|
||||
return contentMdStr
|
||||
}
|
||||
|
||||
func formatCodeBlock(piece parse.Piece) string {
|
||||
var codeMdStr string
|
||||
codeMdStr += "```\n"
|
||||
codeRows := piece.Val.([]string)
|
||||
for _, row := range codeRows {
|
||||
codeMdStr += row + "\n"
|
||||
}
|
||||
codeMdStr += "``` \n"
|
||||
return codeMdStr
|
||||
}
|
||||
|
||||
func formatImage(piece parse.Piece) string {
|
||||
return "![" + piece.Attrs["alt"] + "](" + piece.Attrs["src"] + " \"" + piece.Attrs["title"] + "\")"
|
||||
}
|
||||
|
||||
func formatLink(piece parse.Piece) string {
|
||||
var linkMdStr string = "[" + piece.Val.(string) + "](" + piece.Attrs["href"] + ")"
|
||||
return linkMdStr
|
||||
}
|
||||
@@ -1,33 +1,45 @@
|
||||
package parse
|
||||
|
||||
type Article struct {
|
||||
title string
|
||||
meta string
|
||||
tags string
|
||||
content []Block
|
||||
Title Piece
|
||||
Meta string
|
||||
Tags string
|
||||
Content []Paragraph
|
||||
}
|
||||
|
||||
type Block struct {
|
||||
tokens []Token
|
||||
type Header struct {
|
||||
Level int
|
||||
Text string
|
||||
}
|
||||
|
||||
type Token struct {
|
||||
ttype TokenType
|
||||
text string
|
||||
attrs map[string]string
|
||||
type Paragraph struct {
|
||||
Pieces []Piece
|
||||
}
|
||||
|
||||
type TokenType int32
|
||||
// go不资瓷泛型可真是难受...
|
||||
type Value interface{}
|
||||
|
||||
type Piece struct {
|
||||
Type PieceType
|
||||
Val Value
|
||||
Attrs map[string]string
|
||||
}
|
||||
|
||||
type PieceType int32
|
||||
|
||||
const (
|
||||
TITLE TokenType = iota // 标题
|
||||
LINK // 链接
|
||||
NORMAL_TEXT // 文字
|
||||
STRONG_TEXT // 强调文字
|
||||
ITALIC_TEXT // 斜体文字
|
||||
IMAGE // 图片
|
||||
TABLE // 表格
|
||||
CODE_INLINE // 代码 内联
|
||||
CODE_BLOCK // 代码 块
|
||||
CITE // 引用
|
||||
HEADER PieceType = iota // 标题
|
||||
LINK // 链接
|
||||
NORMAL_TEXT // 文字
|
||||
BOLD_TEXT // 粗体文字
|
||||
ITALIC_TEXT // 斜体文字
|
||||
BOLD_ITALIC_TEXT // 粗斜体
|
||||
IMAGE // 图片
|
||||
TABLE // 表格
|
||||
CODE_INLINE // 代码 内联
|
||||
CODE_BLOCK // 代码 块
|
||||
BLOCK_QUOTES // 引用
|
||||
O_LIST // 有序列表
|
||||
U_LIST // 无序列表
|
||||
HR // 分隔线
|
||||
)
|
||||
|
||||
@@ -8,37 +8,75 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
func parseSection(s *goquery.Selection) Block {
|
||||
func parseSection(s *goquery.Selection) Paragraph {
|
||||
// fmt.Printf("s.Length() = %d\n", s.Length())
|
||||
// fmt.Printf("s.Size() = %d\n", s.Size())
|
||||
// var tokens = make([]Token, s.Size())
|
||||
var tokens []Token
|
||||
// var piece = make([]Token, s.Size())
|
||||
var piece []Piece
|
||||
s.Children().Each(func(i int, s *goquery.Selection) {
|
||||
var t Token
|
||||
var p Piece
|
||||
attr := make(map[string]string)
|
||||
if s.Is("span") {
|
||||
t = Token{NORMAL_TEXT, s.Text(), nil}
|
||||
p = Piece{NORMAL_TEXT, s.Text(), nil}
|
||||
} else if s.Is("a") {
|
||||
attr["href"], _ = s.Attr("href")
|
||||
t = Token{LINK, removeBrAndBlank(s.Text()), attr}
|
||||
p = Piece{LINK, removeBrAndBlank(s.Text()), attr}
|
||||
} else if s.Is("img") {
|
||||
attr["src"], _ = s.Attr("src")
|
||||
t = Token{IMAGE, "", attr}
|
||||
attr["alt"], _ = s.Attr("alt")
|
||||
attr["title"], _ = s.Attr("title")
|
||||
p = Piece{IMAGE, "", attr}
|
||||
} else if s.Is("ol") {
|
||||
// TODO
|
||||
} else if s.Is("ul") {
|
||||
// TODO
|
||||
} else {
|
||||
t = Token{NORMAL_TEXT, s.Text(), nil}
|
||||
p = Piece{NORMAL_TEXT, s.Text(), nil}
|
||||
// TODO
|
||||
}
|
||||
// fmt.Printf("i = %d\n", i)
|
||||
// fmt.Printf("%+v\n", t)
|
||||
// tokens[i] = t
|
||||
tokens = append(tokens, t)
|
||||
piece = append(piece, p)
|
||||
})
|
||||
return Block{tokens}
|
||||
return Paragraph{piece}
|
||||
}
|
||||
|
||||
func parseHeader(s *goquery.Selection) Paragraph {
|
||||
var level int
|
||||
switch {
|
||||
case s.Is("h1"):
|
||||
level = 1
|
||||
case s.Is("h2"):
|
||||
level = 2
|
||||
case s.Is("h3"):
|
||||
level = 3
|
||||
case s.Is("h4"):
|
||||
level = 4
|
||||
case s.Is("h5"):
|
||||
level = 5
|
||||
case s.Is("h6"):
|
||||
level = 6
|
||||
}
|
||||
fmt.Println("***********" + strconv.Itoa(level))
|
||||
attr := map[string]string{"level": strconv.Itoa(level)}
|
||||
p := Piece{HEADER, removeBrAndBlank(s.Text()), attr}
|
||||
return Paragraph{[]Piece{p}}
|
||||
}
|
||||
|
||||
func parsePre(s *goquery.Selection) Paragraph {
|
||||
var codeRows []string
|
||||
s.Find("code").Each(func(i int, s *goquery.Selection) {
|
||||
codeRows = append(codeRows, s.Text())
|
||||
})
|
||||
p := Piece{CODE_BLOCK, codeRows, nil}
|
||||
return Paragraph{[]Piece{p}}
|
||||
}
|
||||
|
||||
func ParseFromReader(r io.Reader) Article {
|
||||
@@ -52,41 +90,47 @@ func ParseFromReader(r io.Reader) Article {
|
||||
// 标题
|
||||
title := mainContent.Find("#activity-name").Text()
|
||||
fmt.Println(title)
|
||||
article.title = title
|
||||
attr := map[string]string{"level": "1"}
|
||||
article.Title = Piece{HEADER, title, attr}
|
||||
|
||||
// meta 细节待完善
|
||||
meta := mainContent.Find("#meta_content").Text()
|
||||
meta = removeBrAndBlank(meta)
|
||||
fmt.Println(meta)
|
||||
article.meta = meta
|
||||
article.Meta = meta
|
||||
|
||||
// tags 细节待完善
|
||||
tags := mainContent.Find("#js_tags").Text()
|
||||
tags = removeBrAndBlank(tags)
|
||||
fmt.Println(tags)
|
||||
article.tags = tags
|
||||
article.Tags = tags
|
||||
|
||||
// content
|
||||
// section[style="line-height: 1.5em;"]>span,a => 一般段落(含文本和超链接)
|
||||
// p[style="line-height: 1.5em;"] => 项目列表(有序/无序)
|
||||
// section[style=".*text-align:center"]>img => 居中段落(图片)
|
||||
content := mainContent.Find("#js_content")
|
||||
var sections []Block
|
||||
content.Find("section,p").Each(func(i int, s *goquery.Selection) {
|
||||
var sections []Paragraph
|
||||
content.Children().Each(func(i int, s *goquery.Selection) {
|
||||
fmt.Println(s.Text())
|
||||
// fmt.Println(s.Attr("style"))
|
||||
var block Block
|
||||
if s.Is("p") {
|
||||
block = parseSection(s)
|
||||
} else if s.Is("section") {
|
||||
block = parseSection(s)
|
||||
} else {
|
||||
var paragraph Paragraph
|
||||
if s.Is("pre") || s.Is("section.code-snippet__fix") {
|
||||
// 代码块
|
||||
paragraph = parsePre(s)
|
||||
} else if s.Is("p") || s.Is("section") {
|
||||
paragraph = parseSection(s)
|
||||
} else if s.Is("h1") || s.Is("h2") || s.Is("h3") || s.Is("h4") || s.Is("h5") || s.Is("h6") {
|
||||
paragraph = parseHeader(s)
|
||||
} else if s.Is("ol") {
|
||||
// TODO
|
||||
} else if s.Is("ul") {
|
||||
// TODO
|
||||
}
|
||||
// sections[i] = block
|
||||
sections = append(sections, block)
|
||||
sections = append(sections, paragraph)
|
||||
})
|
||||
article.content = sections
|
||||
article.Content = sections
|
||||
|
||||
return article
|
||||
}
|
||||
|
||||
@@ -2,12 +2,19 @@ package test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
|
||||
"github.com/fengxxc/wechatmp2markdown/format"
|
||||
"github.com/fengxxc/wechatmp2markdown/parse"
|
||||
)
|
||||
|
||||
func Test1() {
|
||||
res := parse.ParseFromHTMLFile("./test/test1.html")
|
||||
fmt.Println("-------------------test1.html-------------------")
|
||||
fmt.Printf("%+v\n", res)
|
||||
var articleStruct parse.Article = parse.ParseFromHTMLFile("./test/test1.html")
|
||||
fmt.Println("-------------------test1.html parse-------------------")
|
||||
fmt.Printf("%+v\n", articleStruct)
|
||||
|
||||
fmt.Println("-------------------test1.html format-------------------")
|
||||
var mdString string = format.Format(articleStruct)
|
||||
fmt.Print(mdString)
|
||||
ioutil.WriteFile("./test/test1_target.md", []byte(mdString), 0644)
|
||||
}
|
||||
|
||||
@@ -53,14 +53,82 @@
|
||||
<p data-mpa-powered-by="yiban.io"><span style="font-size: 16px;letter-spacing: 0.5px;background-color: transparent;caret-color: var(--weui-BRAND);">正文第一行</span><br></p>
|
||||
<section style="line-height: 1.5em;"><span style="letter-spacing: 0.5px;font-size: 16px;"></span></section>
|
||||
<section style="line-height: 1.5em;"><span style="letter-spacing: 0.5px;font-size: 16px;">正文第二行。<br></span></section>
|
||||
<section style="line-height: 1.5em;text-align: center;"><img class="rich_pages wxw-img" data-galleryid="" data-ratio="1.0826306913996628" data-s="300,640" data-src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png" data-type="png" data-w="593" style="box-shadow: rgb(210, 210, 210) 0em 0em 0.5em 0px; font-size: 17px; width: 346px !important; height: auto !important; visibility: visible !important;" _width="346px" src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1" crossorigin="anonymous" alt="图片" data-fail="0"></section>
|
||||
<section style="line-height: 1.5em;text-align: center;"><img class="rich_pages wxw-img" data-galleryid="" data-ratio="1.0826306913996628" data-s="300,640" data-src="<img class="rich_pages wxw-img" data-galleryid="" data-ratio="0.625" data-s="300,640" data-src="https://mmbiz.qpic.cn/mmbiz_jpg/Z6bicxIx5naLuefnXicbRG06t9JMcnk5ZfO77yyhu2Mtv9037eWJnq0H5ALU1jibAUEm21mUsx0cq11B6oGpGE1jQ/640?wx_fmt=jpeg" data-type="jpeg" data-w="1024" style="width: 677px !important; height: auto !important; visibility: visible !important;" _width="677px" src="https://mmbiz.qpic.cn/mmbiz_jpg/Z6bicxIx5naLuefnXicbRG06t9JMcnk5ZfO77yyhu2Mtv9037eWJnq0H5ALU1jibAUEm21mUsx0cq11B6oGpGE1jQ/640?wx_fmt=jpeg&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1" crossorigin="anonymous" alt="图片" data-fail="0">" data-type="png" data-w="593" style="box-shadow: rgb(210, 210, 210) 0em 0em 0.5em 0px; font-size: 17px; width: 346px !important; height: auto !important; visibility: visible !important;" _width="346px" src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1" crossorigin="anonymous" alt="图片" data-fail="0"></section>
|
||||
<section style="line-height: 1.5em;">
|
||||
<span style="letter-spacing: 0.5px;font-size: 16px;">正文第三行,part1,文本</span>
|
||||
<a target="_blank" href="https://mp.weixin.qq.com/mp/appmsgalbum?__biz=Mzk0MjE3NDE0Ng==&action=getalbum&album_id=2123743679373688834#wechat_redirect" textvalue="你管这破玩意叫操作系统源码" linktype="text" imgurl="" imgdata="null" tab="innerlink" data-linktype="2" style="letter-spacing: 0.5px;font-size: 16px;" wah-hotarea="click">
|
||||
<a target="_blank" href="https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzIzOTU0NTQ0MA==&action=getalbum&album_id=1391790902901014528#wechat_redirect" textvalue="你管这破玩意叫操作系统源码" linktype="text" imgurl="" imgdata="null" tab="innerlink" data-linktype="2" style="letter-spacing: 0.5px;font-size: 16px;" wah-hotarea="click">
|
||||
<span style="letter-spacing: 0.5px;font-size: 16px;">正文第三行,part2,链接</span>
|
||||
</a>
|
||||
<span style="letter-spacing: 0.5px;font-size: 16px;">,正文第三行,part3,文本。<br></span>
|
||||
</section>
|
||||
<h2 style="line-height: 1.75em;">
|
||||
<strong><span style="font-size: 15px;color: rgb(255, 106, 0);">一 JDK层</span></strong>
|
||||
</h2>
|
||||
<h3 style="line-height: 1.75em;">
|
||||
<span style="font-size: 15px;color: rgb(255, 106, 0);">1 AbstractQueuedSynchronizer</span>
|
||||
</h3>
|
||||
<!-- 代行号的代码块 -->
|
||||
<section class="code-snippet__fix code-snippet__js">
|
||||
<ul class="code-snippet__line-index code-snippet__js">
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
<li></li>
|
||||
</ul>
|
||||
<pre class="code-snippet__js" data-lang="cs">
|
||||
<code><span class="code-snippet_outer">代码节选自:java.util.concurrent.locks.ReentrantLock.java</span></code>
|
||||
<code><span class="code-snippet_outer"> /** Synchronizer providing all implementation mechanics */</span></code>
|
||||
<code><span class="code-snippet_outer"> private final Sync sync;</span></code>
|
||||
<code><span class="code-snippet_outer"> /**</span></code>
|
||||
<code><span class="code-snippet_outer"> * Base of synchronization control for this lock. Subclassed</span></code>
|
||||
<code><span class="code-snippet_outer"> * into fair and nonfair versions below. Uses AQS state to</span></code>
|
||||
<code><span class="code-snippet_outer"> * represent the number of holds on the lock.</span></code>
|
||||
<code><span class="code-snippet_outer"> */</span></code>
|
||||
<code><span class="code-snippet_outer"> abstract static class Sync extends AbstractQueuedSynchronizer {</span></code>
|
||||
<code><span class="code-snippet_outer">......</span></code>
|
||||
<code><span class="code-snippet_outer">}</span></code>
|
||||
<code><span class="code-snippet_outer"><br></span></code>
|
||||
<code><span class="code-snippet_outer"> public void lock() {</span></code>
|
||||
<code><span class="code-snippet_outer"> sync.lock();</span></code>
|
||||
<code><span class="code-snippet_outer"> }</span></code>
|
||||
</pre>
|
||||
</section>
|
||||
<pre>
|
||||
<section class="code-snippet__fix code-snippet__js">
|
||||
<ul class="code-snippet__line-index code-snippet__js"><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li><li></li></ul>
|
||||
<pre class="code-snippet__js" data-lang="java">
|
||||
<code><span class="code-snippet_outer"> /**</span></code>
|
||||
<code><span class="code-snippet_outer"> * Acquires in exclusive mode, ignoring interrupts. Implemented</span></code>
|
||||
<code><span class="code-snippet_outer"> * by invoking at least once {@link #tryAcquire},</span></code>
|
||||
<code><span class="code-snippet_outer"> * returning on success. Otherwise the thread is queued, possibly</span></code>
|
||||
<code><span class="code-snippet_outer"> * repeatedly blocking and unblocking, invoking {@link</span></code>
|
||||
<code><span class="code-snippet_outer"> * #tryAcquire} until success. This method can be used</span></code>
|
||||
<code><span class="code-snippet_outer"> * to implement method {@link Lock#lock}.</span></code>
|
||||
<code><span class="code-snippet_outer"> *</span></code>
|
||||
<code><span class="code-snippet_outer"> * @param arg the acquire argument. This value is conveyed to</span></code>
|
||||
<code><span class="code-snippet_outer"> * {@link #tryAcquire} but is otherwise uninterpreted and</span></code>
|
||||
<code><span class="code-snippet_outer"> * can represent anything you like.</span></code>
|
||||
<code><span class="code-snippet_outer"> */</span></code>
|
||||
<code><span class="code-snippet_outer"> public final void acquire(int arg) {</span></code>
|
||||
<code><span class="code-snippet_outer"> if (!tryAcquire(arg) &&</span></code>
|
||||
<code><span class="code-snippet_outer"> acquireQueued(addWaiter(Node.EXCLUSIVE), arg))</span></code>
|
||||
<code><span class="code-snippet_outer"> selfInterrupt();</span></code>
|
||||
<code><span class="code-snippet_outer"> }</span></code>
|
||||
</pre>
|
||||
</section>
|
||||
<section style="line-height: 1.75em;"><br></section>
|
||||
</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Reference in New Issue
Block a user