diff --git a/format/format.go b/format/format.go new file mode 100644 index 0000000..cf67422 --- /dev/null +++ b/format/format.go @@ -0,0 +1,94 @@ +package format + +import ( + "strconv" + + "github.com/fengxxc/wechatmp2markdown/parse" +) + +func Format(article parse.Article) string { + var result string + var titleMdStr string = formatTitle(article.Title) + result += titleMdStr + var metaMdStr string = formatMeta(article.Meta) + result += metaMdStr + var tagsMdStr string = formatTags(article.Tags) + result += tagsMdStr + var content string = formatContent(article.Content) + result += content + return result +} + +func formatTitle(piece parse.Piece) string { + var prefix string + level, _ := strconv.Atoi(piece.Attrs["level"]) + for i := 0; i < level; i++ { + prefix += "#" + } + return prefix + " " + piece.Val.(string) + " \n" +} + +func formatMeta(meta string) string { + return meta + " \n" // TODO +} + +func formatTags(tags string) string { + return tags + " \n" // TODO +} + +func formatContent(blocks []parse.Paragraph) string { + var contentMdStr string + for _, block := range blocks { + for _, piece := range block.Pieces { + var pieceMdStr string + switch piece.Type { + case parse.HEADER: + pieceMdStr = formatTitle(piece) + case parse.LINK: + pieceMdStr = formatLink(piece) + case parse.NORMAL_TEXT: + pieceMdStr = piece.Val.(string) + case parse.BOLD_TEXT: + pieceMdStr = "**" + piece.Val.(string) + "**" + case parse.ITALIC_TEXT: + pieceMdStr = "*" + piece.Val.(string) + "*" + case parse.BOLD_ITALIC_TEXT: + pieceMdStr = "***" + piece.Val.(string) + "***" + case parse.IMAGE: + pieceMdStr = formatImage(piece) + case parse.TABLE: + // TODO + case parse.CODE_INLINE: + case parse.CODE_BLOCK: + pieceMdStr = formatCodeBlock(piece) + case parse.BLOCK_QUOTES: + case parse.O_LIST: + case parse.U_LIST: + case parse.HR: + } + contentMdStr += pieceMdStr + } + contentMdStr += " \n" + } + return contentMdStr +} + +func formatCodeBlock(piece parse.Piece) string { + var codeMdStr string + codeMdStr += "```\n" + codeRows := piece.Val.([]string) + for _, row := range codeRows { + codeMdStr += row + "\n" + } + codeMdStr += "``` \n" + return codeMdStr +} + +func formatImage(piece parse.Piece) string { + return "![" + piece.Attrs["alt"] + "](" + piece.Attrs["src"] + " \"" + piece.Attrs["title"] + "\")" +} + +func formatLink(piece parse.Piece) string { + var linkMdStr string = "[" + piece.Val.(string) + "](" + piece.Attrs["href"] + ")" + return linkMdStr +} diff --git a/parse/model.go b/parse/model.go index 56c7d48..2fe0166 100644 --- a/parse/model.go +++ b/parse/model.go @@ -1,33 +1,45 @@ package parse type Article struct { - title string - meta string - tags string - content []Block + Title Piece + Meta string + Tags string + Content []Paragraph } -type Block struct { - tokens []Token +type Header struct { + Level int + Text string } -type Token struct { - ttype TokenType - text string - attrs map[string]string +type Paragraph struct { + Pieces []Piece } -type TokenType int32 +// go不资瓷泛型可真是难受... +type Value interface{} + +type Piece struct { + Type PieceType + Val Value + Attrs map[string]string +} + +type PieceType int32 const ( - TITLE TokenType = iota // 标题 - LINK // 链接 - NORMAL_TEXT // 文字 - STRONG_TEXT // 强调文字 - ITALIC_TEXT // 斜体文字 - IMAGE // 图片 - TABLE // 表格 - CODE_INLINE // 代码 内联 - CODE_BLOCK // 代码 块 - CITE // 引用 + HEADER PieceType = iota // 标题 + LINK // 链接 + NORMAL_TEXT // 文字 + BOLD_TEXT // 粗体文字 + ITALIC_TEXT // 斜体文字 + BOLD_ITALIC_TEXT // 粗斜体 + IMAGE // 图片 + TABLE // 表格 + CODE_INLINE // 代码 内联 + CODE_BLOCK // 代码 块 + BLOCK_QUOTES // 引用 + O_LIST // 有序列表 + U_LIST // 无序列表 + HR // 分隔线 ) diff --git a/parse/parse.go b/parse/parse.go index 3a4b58c..0136dd1 100644 --- a/parse/parse.go +++ b/parse/parse.go @@ -8,37 +8,75 @@ import ( "log" "os" "regexp" + "strconv" "strings" "github.com/PuerkitoBio/goquery" ) -func parseSection(s *goquery.Selection) Block { +func parseSection(s *goquery.Selection) Paragraph { // fmt.Printf("s.Length() = %d\n", s.Length()) // fmt.Printf("s.Size() = %d\n", s.Size()) - // var tokens = make([]Token, s.Size()) - var tokens []Token + // var piece = make([]Token, s.Size()) + var piece []Piece s.Children().Each(func(i int, s *goquery.Selection) { - var t Token + var p Piece attr := make(map[string]string) if s.Is("span") { - t = Token{NORMAL_TEXT, s.Text(), nil} + p = Piece{NORMAL_TEXT, s.Text(), nil} } else if s.Is("a") { attr["href"], _ = s.Attr("href") - t = Token{LINK, removeBrAndBlank(s.Text()), attr} + p = Piece{LINK, removeBrAndBlank(s.Text()), attr} } else if s.Is("img") { attr["src"], _ = s.Attr("src") - t = Token{IMAGE, "", attr} + attr["alt"], _ = s.Attr("alt") + attr["title"], _ = s.Attr("title") + p = Piece{IMAGE, "", attr} + } else if s.Is("ol") { + // TODO + } else if s.Is("ul") { + // TODO } else { - t = Token{NORMAL_TEXT, s.Text(), nil} + p = Piece{NORMAL_TEXT, s.Text(), nil} // TODO } // fmt.Printf("i = %d\n", i) // fmt.Printf("%+v\n", t) // tokens[i] = t - tokens = append(tokens, t) + piece = append(piece, p) }) - return Block{tokens} + return Paragraph{piece} +} + +func parseHeader(s *goquery.Selection) Paragraph { + var level int + switch { + case s.Is("h1"): + level = 1 + case s.Is("h2"): + level = 2 + case s.Is("h3"): + level = 3 + case s.Is("h4"): + level = 4 + case s.Is("h5"): + level = 5 + case s.Is("h6"): + level = 6 + } + fmt.Println("***********" + strconv.Itoa(level)) + attr := map[string]string{"level": strconv.Itoa(level)} + p := Piece{HEADER, removeBrAndBlank(s.Text()), attr} + return Paragraph{[]Piece{p}} +} + +func parsePre(s *goquery.Selection) Paragraph { + var codeRows []string + s.Find("code").Each(func(i int, s *goquery.Selection) { + codeRows = append(codeRows, s.Text()) + }) + p := Piece{CODE_BLOCK, codeRows, nil} + return Paragraph{[]Piece{p}} } func ParseFromReader(r io.Reader) Article { @@ -52,41 +90,47 @@ func ParseFromReader(r io.Reader) Article { // 标题 title := mainContent.Find("#activity-name").Text() fmt.Println(title) - article.title = title + attr := map[string]string{"level": "1"} + article.Title = Piece{HEADER, title, attr} // meta 细节待完善 meta := mainContent.Find("#meta_content").Text() meta = removeBrAndBlank(meta) fmt.Println(meta) - article.meta = meta + article.Meta = meta // tags 细节待完善 tags := mainContent.Find("#js_tags").Text() tags = removeBrAndBlank(tags) fmt.Println(tags) - article.tags = tags + article.Tags = tags // content // section[style="line-height: 1.5em;"]>span,a => 一般段落(含文本和超链接) // p[style="line-height: 1.5em;"] => 项目列表(有序/无序) // section[style=".*text-align:center"]>img => 居中段落(图片) content := mainContent.Find("#js_content") - var sections []Block - content.Find("section,p").Each(func(i int, s *goquery.Selection) { + var sections []Paragraph + content.Children().Each(func(i int, s *goquery.Selection) { fmt.Println(s.Text()) // fmt.Println(s.Attr("style")) - var block Block - if s.Is("p") { - block = parseSection(s) - } else if s.Is("section") { - block = parseSection(s) - } else { + var paragraph Paragraph + if s.Is("pre") || s.Is("section.code-snippet__fix") { + // 代码块 + paragraph = parsePre(s) + } else if s.Is("p") || s.Is("section") { + paragraph = parseSection(s) + } else if s.Is("h1") || s.Is("h2") || s.Is("h3") || s.Is("h4") || s.Is("h5") || s.Is("h6") { + paragraph = parseHeader(s) + } else if s.Is("ol") { + // TODO + } else if s.Is("ul") { // TODO } // sections[i] = block - sections = append(sections, block) + sections = append(sections, paragraph) }) - article.content = sections + article.Content = sections return article } diff --git a/test/test1.go b/test/test1.go index 53331f8..92f513b 100644 --- a/test/test1.go +++ b/test/test1.go @@ -2,12 +2,19 @@ package test import ( "fmt" + "io/ioutil" + "github.com/fengxxc/wechatmp2markdown/format" "github.com/fengxxc/wechatmp2markdown/parse" ) func Test1() { - res := parse.ParseFromHTMLFile("./test/test1.html") - fmt.Println("-------------------test1.html-------------------") - fmt.Printf("%+v\n", res) + var articleStruct parse.Article = parse.ParseFromHTMLFile("./test/test1.html") + fmt.Println("-------------------test1.html parse-------------------") + fmt.Printf("%+v\n", articleStruct) + + fmt.Println("-------------------test1.html format-------------------") + var mdString string = format.Format(articleStruct) + fmt.Print(mdString) + ioutil.WriteFile("./test/test1_target.md", []byte(mdString), 0644) } diff --git a/test/test1.html b/test/test1.html index 3bd0faa..fac4203 100644 --- a/test/test1.html +++ b/test/test1.html @@ -53,14 +53,82 @@

正文第一行

‍‍
正文第二行。
-
图片
+
图片" data-type="png" data-w="593" style="box-shadow: rgb(210, 210, 210) 0em 0em 0.5em 0px; font-size: 17px; width: 346px !important; height: auto !important; visibility: visible !important;" _width="346px" src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1" crossorigin="anonymous" alt="图片" data-fail="0">
正文第三行,part1,文本 - + 正文第三行,part2,链接 ,正文第三行,part3,文本。
+

+ 一 JDK层 +

+

+ 1 AbstractQueuedSynchronizer +

+ +
+ +
+                        代码节选自:java.util.concurrent.locks.ReentrantLock.java
+                         /** Synchronizer providing all implementation mechanics */
+                            private final Sync sync;
+                            /**
+                             * Base of synchronization control for this lock. Subclassed
+                             * into fair and nonfair versions below. Uses AQS state to
+                             * represent the number of holds on the lock.
+                             */
+                            abstract static class Sync extends AbstractQueuedSynchronizer {
+                        ......
+                        }
+                        
+ public void lock() { + sync.lock(); + } +
+
+
+                    
+ +
+                                /**
+                                 * Acquires in exclusive mode, ignoring interrupts.  Implemented
+                                 * by invoking at least once {@link #tryAcquire},
+                                 * returning on success.  Otherwise the thread is queued, possibly
+                                 * repeatedly blocking and unblocking, invoking {@link
+                                 * #tryAcquire} until success.  This method can be used
+                                 * to implement method {@link Lock#lock}.
+                                 *
+                                 * @param arg the acquire argument.  This value is conveyed to
+                                 *        {@link #tryAcquire} but is otherwise uninterpreted and
+                                 *        can represent anything you like.
+                                 */
+                                public final void acquire(int arg) {
+                                    if (!tryAcquire(arg) &&
+                                        acquireQueued(addWaiter(Node.EXCLUSIVE), arg))
+                                        selfInterrupt();
+                                }
+                        
+
+

+