mirror of
https://github.com/fengxxc/wechatmp2markdown.git
synced 2026-04-21 11:17:42 +08:00
feat: 添加从文件读取功能,添加测试用例文件
This commit is contained in:
4
main.go
4
main.go
@@ -1,7 +1,7 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import "github.com/fengxxc/wechatmp2markdown/parse"
|
import "github.com/fengxxc/wechatmp2markdown/test"
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
parse.Test()
|
test.Test1()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
package parse
|
package parse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
|
"os"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -92,6 +95,19 @@ func ParseFromHTMLString(s string) Article {
|
|||||||
return ParseFromReader(strings.NewReader(s))
|
return ParseFromReader(strings.NewReader(s))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ParseFromHTMLFile(filepath string) Article {
|
||||||
|
file, err := os.Open(filepath)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
content, err2 := ioutil.ReadAll(file)
|
||||||
|
if err2 != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
return ParseFromReader(bytes.NewReader(content))
|
||||||
|
}
|
||||||
|
|
||||||
func removeBrAndBlank(s string) string {
|
func removeBrAndBlank(s string) string {
|
||||||
regstr := "\\s{2,}"
|
regstr := "\\s{2,}"
|
||||||
reg, _ := regexp.Compile(regstr)
|
reg, _ := regexp.Compile(regstr)
|
||||||
@@ -104,68 +120,3 @@ func removeBrAndBlank(s string) string {
|
|||||||
}
|
}
|
||||||
return strings.Replace(string(sb), "\n", " ", -1)
|
return strings.Replace(string(sb), "\n", " ", -1)
|
||||||
}
|
}
|
||||||
|
|
||||||
var testHTML = `
|
|
||||||
<div>
|
|
||||||
<div id="img-content" class="rich_media_wrp">
|
|
||||||
<h1 class="rich_media_title" id="activity-name">终于有人喷我了!</h1>
|
|
||||||
<div id="meta_content" class="rich_media_meta_list">
|
|
||||||
<span id="copyright_logo" class="wx_tap_link js_wx_tap_highlight rich_media_meta icon_appmsg_tag appmsg_title_tag weui-wa-hotarea" wah-hotarea="click">原创</span>
|
|
||||||
<span class="rich_media_meta rich_media_meta_text">
|
|
||||||
<span role="link" id="js_author_name" class="wx_tap_link js_wx_tap_highlight weui-wa-hotarea" datarewardsn="" datatimestamp="" datacanreward="0" wah-hotarea="click">闪客sun</span>
|
|
||||||
</span>
|
|
||||||
<span class="rich_media_meta rich_media_meta_nickname" id="profileBt" wah-hotarea="click">
|
|
||||||
<a href="javascript:void(0);" class="wx_tap_link js_wx_tap_highlight weui-wa-hotarea" id="js_name">
|
|
||||||
低并发编程 </a>
|
|
||||||
<div id="js_profile_qrcode" aria-hidden="true" class="profile_container" style="display:none;" wah-hotarea="click">
|
|
||||||
<div class="profile_inner">
|
|
||||||
<strong class="profile_nickname">低并发编程</strong>
|
|
||||||
<img class="profile_avatar" id="js_profile_qrcode_img" src="" alt="">
|
|
||||||
<p class="profile_meta">
|
|
||||||
<label class="profile_meta_label">微信号</label>
|
|
||||||
<span class="profile_meta_value">dibingfa</span>
|
|
||||||
</p>
|
|
||||||
<p class="profile_meta">
|
|
||||||
<label class="profile_meta_label">功能介绍</label>
|
|
||||||
<span class="profile_meta_value">战略上藐视技术,战术上重视技术</span>
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
<span class="profile_arrow_wrp" id="js_profile_arrow_wrp">
|
|
||||||
<i class="profile_arrow arrow_out"></i>
|
|
||||||
<i class="profile_arrow arrow_in"></i>
|
|
||||||
</span>
|
|
||||||
</div>
|
|
||||||
</span>
|
|
||||||
<em id="publish_time" class="rich_media_meta rich_media_meta_text">2021-11-26</em>
|
|
||||||
</div>
|
|
||||||
<div id="js_tags" class="article-tag__list single-tag__wrp js_single js_wx_tap_highlight wx_tap_card" data-len="1" role="link" aria-labelledby="js_article-tag-card__left" aria-describedby="js_article-tag-card__right" wah-hotarea="click">
|
|
||||||
<span aria-hidden="true" id="js_article-tag-card__left" class="article-tag-card__left">
|
|
||||||
<span class="article-tag-card__title">收录于话题</span>
|
|
||||||
<span class="article-tag__item-wrp no-active js_tag" data-url="https://mp.weixin.qq.com/mp/appmsgalbum?__biz=Mzk0MjE3NDE0Ng==&action=getalbum&album_id=1645521656368381958#wechat_redirect" data-tag_id="" data-album_id="1645521656368381958" data-tag_source="4">
|
|
||||||
<span class="article-tag__item">#随便聊聊</span>
|
|
||||||
</span>
|
|
||||||
</span>
|
|
||||||
<span aria-hidden="true" id="js_article-tag-card__right" class="article-tag-card__right">49个<span class="weui-hidden_abs">内容</span></span>
|
|
||||||
</div>
|
|
||||||
<div class="rich_media_content " id="js_content" style="visibility: visible;">
|
|
||||||
<p data-mpa-powered-by="yiban.io"><span style="font-size: 16px;letter-spacing: 0.5px;background-color: transparent;caret-color: var(--weui-BRAND);">写公众号一年了,一直盼着能有人喷喷我,今天终于被我碰到了!</span><br></p>
|
|
||||||
<section style="line-height: 1.5em;"><span style="letter-spacing: 0.5px;font-size: 16px;"></span></section>
|
|
||||||
<section style="line-height: 1.5em;"><span style="letter-spacing: 0.5px;font-size: 16px;">还是我的一位读者发现的,在推特上,于是分享给了我。<br></span></section>
|
|
||||||
<section style="line-height: 1.5em;text-align: center;"><img class="rich_pages wxw-img" data-galleryid="" data-ratio="1.0826306913996628" data-s="300,640" data-src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png" data-type="png" data-w="593" style="box-shadow: rgb(210, 210, 210) 0em 0em 0.5em 0px; font-size: 17px; width: 346px !important; height: auto !important; visibility: visible !important;" _width="346px" src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1" crossorigin="anonymous" alt="图片" data-fail="0"></section>
|
|
||||||
<section style="line-height: 1.5em;">
|
|
||||||
<span style="letter-spacing: 0.5px;font-size: 16px;">这是喷我最近的一个新系列,</span>
|
|
||||||
<a target="_blank" href="https://mp.weixin.qq.com/mp/appmsgalbum?__biz=Mzk0MjE3NDE0Ng==&action=getalbum&album_id=2123743679373688834#wechat_redirect" textvalue="你管这破玩意叫操作系统源码" linktype="text" imgurl="" imgdata="null" tab="innerlink" data-linktype="2" style="letter-spacing: 0.5px;font-size: 16px;" wah-hotarea="click">
|
|
||||||
<span style="letter-spacing: 0.5px;font-size: 16px;">你管这破玩意叫操作系统源码</span>
|
|
||||||
</a>
|
|
||||||
<span style="letter-spacing: 0.5px;font-size: 16px;">,正愁找不到借口推广一波呢,这不就给我来素材了。<br></span>
|
|
||||||
</section>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
`
|
|
||||||
|
|
||||||
func Test() {
|
|
||||||
res := ParseFromHTMLString(testHTML)
|
|
||||||
fmt.Println("---------------------------------------")
|
|
||||||
fmt.Printf("%+v\n", res)
|
|
||||||
}
|
|
||||||
|
|||||||
13
test/test1.go
Normal file
13
test/test1.go
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
package test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/fengxxc/wechatmp2markdown/parse"
|
||||||
|
)
|
||||||
|
|
||||||
|
func Test1() {
|
||||||
|
res := parse.ParseFromHTMLFile("./test/test1.html")
|
||||||
|
fmt.Println("-------------------test1.html-------------------")
|
||||||
|
fmt.Printf("%+v\n", res)
|
||||||
|
}
|
||||||
69
test/test1.html
Normal file
69
test/test1.html
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>test1</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div>
|
||||||
|
<div id="img-content" class="rich_media_wrp">
|
||||||
|
<h1 class="rich_media_title" id="activity-name">这里是文章标题</h1>
|
||||||
|
<div id="meta_content" class="rich_media_meta_list">
|
||||||
|
<span id="copyright_logo" class="wx_tap_link js_wx_tap_highlight rich_media_meta icon_appmsg_tag appmsg_title_tag weui-wa-hotarea" wah-hotarea="click">原创</span>
|
||||||
|
<span class="rich_media_meta rich_media_meta_text">
|
||||||
|
<span role="link" id="js_author_name" class="wx_tap_link js_wx_tap_highlight weui-wa-hotarea" datarewardsn="" datatimestamp="" datacanreward="0" wah-hotarea="click">这里是作者</span>
|
||||||
|
</span>
|
||||||
|
<span class="rich_media_meta rich_media_meta_nickname" id="profileBt" wah-hotarea="click">
|
||||||
|
<a href="javascript:void(0);" class="wx_tap_link js_wx_tap_highlight weui-wa-hotarea" id="js_name">
|
||||||
|
这里是公众号名 </a>
|
||||||
|
<div id="js_profile_qrcode" aria-hidden="true" class="profile_container" style="display:none;" wah-hotarea="click">
|
||||||
|
<div class="profile_inner">
|
||||||
|
<strong class="profile_nickname">这里是公众号名</strong>
|
||||||
|
<img class="profile_avatar" id="js_profile_qrcode_img" src="" alt="">
|
||||||
|
<p class="profile_meta">
|
||||||
|
<label class="profile_meta_label">微信号</label>
|
||||||
|
<span class="profile_meta_value">这里是微信号</span>
|
||||||
|
</p>
|
||||||
|
<p class="profile_meta">
|
||||||
|
<label class="profile_meta_label">功能介绍</label>
|
||||||
|
<span class="profile_meta_value">这里是功能介绍</span>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<span class="profile_arrow_wrp" id="js_profile_arrow_wrp">
|
||||||
|
<i class="profile_arrow arrow_out"></i>
|
||||||
|
<i class="profile_arrow arrow_in"></i>
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</span>
|
||||||
|
<em id="publish_time" class="rich_media_meta rich_media_meta_text">2021-01-01</em>
|
||||||
|
</div>
|
||||||
|
<div id="js_tags" class="article-tag__list single-tag__wrp js_single js_wx_tap_highlight wx_tap_card" data-len="1" role="link" aria-labelledby="js_article-tag-card__left" aria-describedby="js_article-tag-card__right" wah-hotarea="click">
|
||||||
|
<span aria-hidden="true" id="js_article-tag-card__left" class="article-tag-card__left">
|
||||||
|
<span class="article-tag-card__title">收录于话题</span>
|
||||||
|
<span class="article-tag__item-wrp no-active js_tag" data-url="https://mp.weixin.qq.com/mp/appmsgalbum?__biz=Mzk0MjE3NDE0Ng==&action=getalbum&album_id=1645521656368381958#wechat_redirect" data-tag_id="" data-album_id="1645521656368381958" data-tag_source="4">
|
||||||
|
<span class="article-tag__item">#话题1</span>
|
||||||
|
</span>
|
||||||
|
</span>
|
||||||
|
<span aria-hidden="true" id="js_article-tag-card__right" class="article-tag-card__right">69个<span class="weui-hidden_abs">内容</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="rich_media_content " id="js_content" style="visibility: visible;">
|
||||||
|
<p data-mpa-powered-by="yiban.io"><span style="font-size: 16px;letter-spacing: 0.5px;background-color: transparent;caret-color: var(--weui-BRAND);">正文第一行</span><br></p>
|
||||||
|
<section style="line-height: 1.5em;"><span style="letter-spacing: 0.5px;font-size: 16px;"></span></section>
|
||||||
|
<section style="line-height: 1.5em;"><span style="letter-spacing: 0.5px;font-size: 16px;">正文第二行。<br></span></section>
|
||||||
|
<section style="line-height: 1.5em;text-align: center;"><img class="rich_pages wxw-img" data-galleryid="" data-ratio="1.0826306913996628" data-s="300,640" data-src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png" data-type="png" data-w="593" style="box-shadow: rgb(210, 210, 210) 0em 0em 0.5em 0px; font-size: 17px; width: 346px !important; height: auto !important; visibility: visible !important;" _width="346px" src="https://mmbiz.qpic.cn/mmbiz_png/GLeh42uInXRdNibLb2hf6QnMnWgic4Nm0KhCmicJibxESMoGfbuMrXbQB7lrYFSJPlBeGaJyciaavIBN8NLwESxia7cA/640?wx_fmt=png&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1" crossorigin="anonymous" alt="图片" data-fail="0"></section>
|
||||||
|
<section style="line-height: 1.5em;">
|
||||||
|
<span style="letter-spacing: 0.5px;font-size: 16px;">正文第三行,part1,文本</span>
|
||||||
|
<a target="_blank" href="https://mp.weixin.qq.com/mp/appmsgalbum?__biz=Mzk0MjE3NDE0Ng==&action=getalbum&album_id=2123743679373688834#wechat_redirect" textvalue="你管这破玩意叫操作系统源码" linktype="text" imgurl="" imgdata="null" tab="innerlink" data-linktype="2" style="letter-spacing: 0.5px;font-size: 16px;" wah-hotarea="click">
|
||||||
|
<span style="letter-spacing: 0.5px;font-size: 16px;">正文第三行,part2,链接</span>
|
||||||
|
</a>
|
||||||
|
<span style="letter-spacing: 0.5px;font-size: 16px;">,正文第三行,part3,文本。<br></span>
|
||||||
|
</section>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
Reference in New Issue
Block a user