package parse import ( "bytes" "encoding/base64" "io" "log" "net/http" "os" "regexp" "strconv" "strings" "time" "github.com/PuerkitoBio/goquery" ) func parseSection(s *goquery.Selection, imagePolicy ImagePolicy) []Piece { var pieces []Piece pieces = append(pieces, Piece{BR, nil, nil}) s.Contents().Each(func(i int, sc *goquery.Selection) { attr := make(map[string]string) if sc.Is("a") { attr["href"], _ = sc.Attr("href") pieces = append(pieces, Piece{LINK, removeBrAndBlank(sc.Text()), attr}) } else if sc.Is("img") { attr["src"], _ = sc.Attr("data-src") attr["alt"], _ = sc.Attr("alt") attr["title"], _ = sc.Attr("title") switch imagePolicy { case IMAGE_POLICY_URL: pieces = append(pieces, Piece{IMAGE, nil, attr}, Piece{BR, nil, nil}) case IMAGE_POLICY_SAVE: image := fetchImgFile(attr["src"]) pieces = append(pieces, Piece{IMAGE, image, attr}, Piece{BR, nil, nil}) case IMAGE_POLICY_BASE64: fallthrough default: base64Image := img2base64(fetchImgFile(attr["src"])) pieces = append(pieces, Piece{IMAGE_BASE64, base64Image, attr}, Piece{BR, nil, nil}) } } else if sc.Is("ol") { pieces = append(pieces, parseList(sc, O_LIST, imagePolicy)...) } else if sc.Is("ul") { pieces = append(pieces, parseList(sc, U_LIST, imagePolicy)...) } else if sc.Is("pre") || sc.Is("section.code-snippet__fix") { // 代码块 pieces = append(pieces, parsePre(sc)...) } else if sc.Is("p") || sc.Is("section") || sc.Is("span") { pieces = append(pieces, parseSection(sc, imagePolicy)...) } else if sc.Is("h1") || sc.Is("h2") || sc.Is("h3") || sc.Is("h4") || sc.Is("h5") || sc.Is("h6") { pieces = append(pieces, parseHeader(sc)...) } else if sc.Is("blockquote") { pieces = append(pieces, parseBlockQuote(sc, imagePolicy)...) } else if sc.Is("strong") { pieces = append(pieces, parseStrong(sc)...) } else { pieces = append(pieces, Piece{NORMAL_TEXT, sc.Text(), nil}) } }) pieces = append(pieces, Piece{BR, nil, nil}) return pieces } func parseHeader(s *goquery.Selection) []Piece { var level int switch { case s.Is("h1"): level = 1 case s.Is("h2"): level = 2 case s.Is("h3"): level = 3 case s.Is("h4"): level = 4 case s.Is("h5"): level = 5 case s.Is("h6"): level = 6 } attr := map[string]string{"level": strconv.Itoa(level)} p := Piece{HEADER, removeBrAndBlank(s.Text()), attr} return []Piece{p, {BR, nil, nil}} } func parsePre(s *goquery.Selection) []Piece { var codeRows []string s.Find("code").Each(func(i int, sc *goquery.Selection) { codeRows = append(codeRows, sc.Text()) }) p := Piece{CODE_BLOCK, codeRows, nil} return []Piece{p, {BR, nil, nil}} } func parseList(s *goquery.Selection, ptype PieceType, imagePolicy ImagePolicy) []Piece { var list []Piece s.Find("li").Each(func(i int, sc *goquery.Selection) { list = append(list, Piece{ptype, parseSection(sc, imagePolicy), nil}) }) list = append(list, Piece{BR, nil, nil}) return list } func parseBlockQuote(s *goquery.Selection, imagePolicy ImagePolicy) []Piece { var bq []Piece s.Contents().Each(func(i int, sc *goquery.Selection) { bq = append(bq, Piece{BLOCK_QUOTES, parseSection(sc, imagePolicy), nil}) }) bq = append(bq, Piece{BR, nil, nil}) return bq } func parseStrong(s *goquery.Selection) []Piece { var bt []Piece bt = append(bt, Piece{BOLD_TEXT, strings.TrimSpace(s.Text()), nil}) return bt } func parseMeta(s *goquery.Selection) []string { var res []string s.Children().Each(func(i int, sc *goquery.Selection) { if sc.Is("#profileBt") { res = append(res, removeBrAndBlank(sc.Find("#js_name").Text())) } else { style, exists := sc.Attr("style") if !(exists && strings.Contains(style, "display: none;")) { // t := sc.Nodes[0].Data t := strings.TrimSpace(sc.Text()) res = append(res, t) } } }) return res } func ParseFromReader(r io.Reader, imagePolicy ImagePolicy) Article { var article Article doc, err := goquery.NewDocumentFromReader(r) if err != nil { log.Fatal(err) } var mainContent *goquery.Selection = doc.Find("#img-content") // 标题 title := mainContent.Find("#activity-name").Text() attr := map[string]string{"level": "1"} article.Title = Piece{HEADER, removeBrAndBlank(title), attr} // meta meta := mainContent.Find("#meta_content") metastring := parseMeta(meta) article.Meta = metastring // 从js中找到发布时间 re, _ := regexp.Compile("var ct = \"([0-9]+)\"") findstrs := re.FindStringSubmatch(doc.Find("script").Text()) if len(findstrs) > 1 { var createTime string = findstrs[1] timestamp, _ := strconv.Atoi(createTime) time := time.Unix(int64(timestamp), 0) article.Meta = append(article.Meta, time.Format("2006-01-02 15:04")) } // tags 细节待完善 tags := mainContent.Find("#js_tags").Text() tags = removeBrAndBlank(tags) article.Tags = tags // content // section[style="line-height: 1.5em;"]>span,a => 一般段落(含文本和超链接) // p[style="line-height: 1.5em;"] => 项目列表(有序/无序) // section[style=".*text-align:center"]>img => 居中段落(图片) content := mainContent.Find("#js_content") pieces := parseSection(content, imagePolicy) article.Content = pieces return article } func ParseFromHTMLString(s string, imagePolicy ImagePolicy) Article { return ParseFromReader(strings.NewReader(s), imagePolicy) } func ParseFromHTMLFile(filepath string, imagePolicy ImagePolicy) Article { file, err := os.Open(filepath) if err != nil { panic(err) } defer file.Close() content, err2 := io.ReadAll(file) if err2 != nil { panic(err) } return ParseFromReader(bytes.NewReader(content), imagePolicy) } func ParseFromURL(url string, imagePolicy ImagePolicy) Article { res, err := http.Get(url) if err != nil { log.Fatalf(err.Error()) } defer res.Body.Close() if res.StatusCode != 200 { log.Fatalf("get from url %s error: %d %s", url, res.StatusCode, res.Status) } return ParseFromReader(res.Body, imagePolicy) } func removeBrAndBlank(s string) string { regstr := "\\s{2,}" reg, _ := regexp.Compile(regstr) sb := make([]byte, len(s)) copy(sb, s) spc_index := reg.FindStringIndex(string(sb)) //在字符串中搜索 for len(spc_index) > 0 { //找到适配项 sb = append(sb[:spc_index[0]+1], sb[spc_index[1]:]...) //删除多余空格 spc_index = reg.FindStringIndex(string(sb)) //继续在字符串中搜索 } return strings.Replace(string(sb), "\n", " ", -1) } func fetchImgFile(url string) []byte { res, err := http.Get(url) if err != nil { log.Fatalf("get Image from url %s error: %s", url, err.Error()) return nil } defer res.Body.Close() if res.StatusCode != 200 { log.Fatalf("get Image from url %s error: %d %s", url, res.StatusCode, res.Status) } content, err := io.ReadAll(res.Body) if err != nil { log.Fatalf("read image Response error: %s", err.Error()) } return content } func img2base64(content []byte) string { return base64.StdEncoding.EncodeToString(content) } type ImagePolicy int32 const ( IMAGE_POLICY_URL ImagePolicy = iota IMAGE_POLICY_SAVE IMAGE_POLICY_BASE64 ) func ImageArgValue2ImagePolicy(val string) ImagePolicy { var imagePolicy ImagePolicy switch val { case "url": imagePolicy = IMAGE_POLICY_URL case "save": imagePolicy = IMAGE_POLICY_SAVE case "base64": fallthrough default: imagePolicy = IMAGE_POLICY_BASE64 } return imagePolicy }