package parse

import (
	"bytes"
	"encoding/base64"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// parseSection converts the child nodes of s, in document order, into a flat
// slice of pieces.
//
// lastPieceType is the type of the piece emitted immediately before this
// section: a leading BR piece is produced unless it is O_LIST, U_LIST, NULL or
// BLOCK_QUOTES. imagePolicy selects how <img> nodes are represented (URL only,
// fetched file, or base64-encoded data; base64 is the fallback for any other
// value).
func parseSection(s *goquery.Selection, imagePolicy ImagePolicy, lastPieceType PieceType) []Piece {
	var pieces []Piece
	if !(lastPieceType == O_LIST || lastPieceType == U_LIST || lastPieceType == NULL || lastPieceType == BLOCK_QUOTES) {
		pieces = append(pieces, Piece{BR, nil, nil})
	}

	// prevType is the type of the last piece emitted so far; it is handed to
	// recursive calls so that they can decide on their own leading break.
	var prevType PieceType = NULL
	s.Contents().Each(func(_ int, sc *goquery.Selection) {
		switch {
		case sc.Is("a"):
			// A missing href is stored as the empty string.
			href, _ := sc.Attr("href")
			attr := map[string]string{"href": href}
			pieces = append(pieces, Piece{LINK, removeBrAndBlank(sc.Text()), attr})

		case sc.Is("img"):
			// The image location is read from data-src, not src. Missing
			// attributes are stored as empty strings.
			src, _ := sc.Attr("data-src")
			alt, _ := sc.Attr("alt")
			title, _ := sc.Attr("title")
			attr := map[string]string{"src": src, "alt": alt, "title": title}
			switch imagePolicy {
			case IMAGE_POLICY_URL:
				pieces = append(pieces, Piece{IMAGE, nil, attr})
			case IMAGE_POLICY_SAVE:
				pieces = append(pieces, Piece{IMAGE, fetchImgFile(src), attr})
			default:
				// IMAGE_POLICY_BASE64 and every unrecognised policy.
				pieces = append(pieces, Piece{IMAGE_BASE64, img2base64(fetchImgFile(src)), attr})
			}

		case sc.Is("ol"):
			pieces = append(pieces, parseList(sc, O_LIST, imagePolicy)...)

		case sc.Is("ul"):
			pieces = append(pieces, parseList(sc, U_LIST, imagePolicy)...)

		case sc.Is("pre"), sc.Is("section.code-snippet__fix"):
			// Code block.
			pieces = append(pieces, parsePre(sc)...)

		case sc.Is("span"):
			pieces = append(pieces, parseSection(sc, imagePolicy, prevType)...)

		case sc.Is("p"), sc.Is("section"):
			pieces = append(pieces, parseSection(sc, imagePolicy, prevType)...)
			// Terminate a non-empty paragraph with a break unless the last
			// piece already is one.
			if removeBrAndBlank(sc.Text()) != "" && len(pieces) > 0 && pieces[len(pieces)-1].Type != BR {
				pieces = append(pieces, Piece{BR, nil, nil})
			}

		case sc.Is("h1"), sc.Is("h2"), sc.Is("h3"), sc.Is("h4"), sc.Is("h5"), sc.Is("h6"):
			pieces = append(pieces, parseHeader(sc)...)

		case sc.Is("blockquote"):
			pieces = append(pieces, parseBlockQuote(sc, imagePolicy)...)

		case sc.Is("strong"):
			pieces = append(pieces, parseStrong(sc)...)

		case sc.Is("table"):
			pieces = append(pieces, parseTable(sc)...)

		default:
			// Any other node contributes its text, if it has any.
			if text := sc.Text(); text != "" {
				pieces = append(pieces, Piece{NORMAL_TEXT, text, nil})
			}
		}

		if len(pieces) > 0 {
			prevType = pieces[len(pieces)-1].Type
		}
	})
	return pieces
}

// parseHeader returns a single HEADER piece for s. The piece carries the
// trimmed heading text and a "level" attribute holding "1".."6" according to
// the first of h1..h6 that s matches, or "0" when it matches none of them.
func parseHeader(s *goquery.Selection) []Piece {
	level := 0
	for n := 1; n <= 6; n++ {
		if s.Is("h" + strconv.Itoa(n)) {
			level = n
			break
		}
	}
	attr := map[string]string{"level": strconv.Itoa(level)}
	return []Piece{{HEADER, removeBrAndBlank(s.Text()), attr}}
}

// parsePre returns a single CODE_BLOCK piece whose value is the list of code
// rows found in the <code> descendants of s. Within each <code> element a <br>
// ends the current row; the text of all other child nodes is appended to it.
// The value is a nil slice when s has no <code> descendant.
func parsePre(s *goquery.Selection) []Piece {
	var codeRows []string
	s.Find("code").Each(func(_ int, code *goquery.Selection) {
		var line strings.Builder
		code.Contents().Each(func(_ int, node *goquery.Selection) {
			if goquery.NodeName(node) == "br" {
				codeRows = append(codeRows, line.String())
				line.Reset()
				return
			}
			line.WriteString(node.Text())
		})
		// Flush the row after the last <br> (or the only row).
		codeRows = append(codeRows, line.String())
	})
	return []Piece{{CODE_BLOCK, codeRows, nil}}
}

// parseList returns one piece of type ptype per <li> that is a direct child of
// s; the value of each piece is the parsed content of that item.
//
// Only direct children are visited: a nested <ol>/<ul> is reached through
// parseSection when its parent item is parsed, so walking every descendant
// <li> here would emit nested items a second time, flattened into this list.
func parseList(s *goquery.Selection, ptype PieceType, imagePolicy ImagePolicy) []Piece {
	var list []Piece
	s.ChildrenFiltered("li").Each(func(_ int, item *goquery.Selection) {
		list = append(list, Piece{ptype, parseSection(item, imagePolicy, ptype), nil})
	})
	return list
}

// parseBlockQuote returns one BLOCK_QUOTES piece per child node of s, each
// holding the parsed content of that child, followed by a closing BR piece.
func parseBlockQuote(s *goquery.Selection, imagePolicy ImagePolicy) []Piece {
	var bq []Piece
	s.Contents().Each(func(_ int, child *goquery.Selection) {
		bq = append(bq, Piece{BLOCK_QUOTES, parseSection(child, imagePolicy, BLOCK_QUOTES), nil})
	})
	return append(bq, Piece{BR, nil, nil})
}

func parseTable(s *goquery.Selection) []Piece {
	// For now, crudely carry the native markup over as-is.
	var table []Piece
	html, _ := s.Html()
	table = append(table, Piece{TABLE, "