From dfaa0d66052b42e173a21fa0f4543bd66a9c4791 Mon Sep 17 00:00:00 2001 From: fengxxc Date: Mon, 29 Nov 2021 20:18:37 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=A0=87=E9=A2=98?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E9=94=99=E8=AF=AF=EF=BC=8C=E5=8E=BB=E6=8E=89?= =?UTF-8?q?=E9=A6=96=E5=B0=BE=E5=A4=9A=E4=BD=99=E7=9A=84=E7=A9=BA=E6=A0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- parse/parse.go | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/parse/parse.go b/parse/parse.go index 0136dd1..fbc42e4 100644 --- a/parse/parse.go +++ b/parse/parse.go @@ -2,10 +2,10 @@ package parse import ( "bytes" - "fmt" "io" "io/ioutil" "log" + "net/http" "os" "regexp" "strconv" @@ -15,9 +15,6 @@ import ( ) func parseSection(s *goquery.Selection) Paragraph { - // fmt.Printf("s.Length() = %d\n", s.Length()) - // fmt.Printf("s.Size() = %d\n", s.Size()) - // var piece = make([]Token, s.Size()) var piece []Piece s.Children().Each(func(i int, s *goquery.Selection) { var p Piece @@ -40,9 +37,7 @@ func parseSection(s *goquery.Selection) Paragraph { p = Piece{NORMAL_TEXT, s.Text(), nil} // TODO } - // fmt.Printf("i = %d\n", i) // fmt.Printf("%+v\n", t) - // tokens[i] = t piece = append(piece, p) }) return Paragraph{piece} @@ -64,7 +59,6 @@ func parseHeader(s *goquery.Selection) Paragraph { case s.Is("h6"): level = 6 } - fmt.Println("***********" + strconv.Itoa(level)) attr := map[string]string{"level": strconv.Itoa(level)} p := Piece{HEADER, removeBrAndBlank(s.Text()), attr} return Paragraph{[]Piece{p}} @@ -89,20 +83,17 @@ func ParseFromReader(r io.Reader) Article { // 标题 title := mainContent.Find("#activity-name").Text() - fmt.Println(title) attr := map[string]string{"level": "1"} - article.Title = Piece{HEADER, title, attr} + article.Title = Piece{HEADER, removeBrAndBlank(title), attr} // meta 细节待完善 meta := mainContent.Find("#meta_content").Text() meta = removeBrAndBlank(meta) - fmt.Println(meta) article.Meta = meta // tags 细节待完善 tags := mainContent.Find("#js_tags").Text() tags = removeBrAndBlank(tags) - fmt.Println(tags) article.Tags = tags // content @@ -112,8 +103,6 @@ func ParseFromReader(r io.Reader) Article { content := mainContent.Find("#js_content") var sections []Paragraph content.Children().Each(func(i int, s *goquery.Selection) { - fmt.Println(s.Text()) - // fmt.Println(s.Attr("style")) var paragraph Paragraph if s.Is("pre") || s.Is("section.code-snippet__fix") { // 代码块 @@ -152,6 +141,18 @@ func ParseFromHTMLFile(filepath string) Article { return ParseFromReader(bytes.NewReader(content)) } +func ParseFromURL(url string) Article { + res, err := http.Get(url) + if err != nil { + panic(err) + } + defer res.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("get from url %s error: %d %s", url, res.StatusCode, res.Status) + } + return ParseFromReader(res.Body) +} + func removeBrAndBlank(s string) string { regstr := "\\s{2,}" reg, _ := regexp.Compile(regstr)