WIP: douban wish list

This commit is contained in:
Simon Ding
2025-02-10 15:07:32 +08:00
parent 80ad9a2a3b
commit e380a624f5
4 changed files with 182 additions and 45 deletions

View File

@@ -2,29 +2,25 @@ package douban
import (
"fmt"
"io"
"net/http"
"polaris/log"
"polaris/pkg/importlist"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly"
)
// ua is the browser User-Agent string this file sends with its requests: it
// is set as the User-Agent header in doHttpReq and passed to the colly
// collector in GetWishlist.
const ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
func ParseDoulist(doulistUrl string) (*importlist.Response, error) {
if !strings.Contains(doulistUrl, "doulist") {
return nil, fmt.Errorf("not doulist")
}
req, err := http.NewRequest("GET", doulistUrl, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", ua)
res, err := http.DefaultClient.Do(req)
res, err := doHttpReq("GET", doulistUrl, nil)
if err != nil {
return nil, err
}
@@ -85,13 +81,8 @@ func ParseDoulist(doulistUrl string) (*importlist.Response, error) {
func parseDetailPage(url string) (string, error) {
println(url)
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return "", err
}
req.Header.Set("User-Agent", ua)
res, err := http.DefaultClient.Do(req)
res, err := doHttpReq("GET", url, nil)
if err != nil {
return "", err
}
@@ -115,3 +106,80 @@ func parseDetailPage(url string) (string, error) {
_ = doc
return "", nil
}
// DoubanWishlist fetches the movie wish list of a single Douban user.
type DoubanWishlist struct {
	// PersonId is the user identifier substituted into the wish list URL.
	PersonId string
}

// NewDoubanWishlist returns a DoubanWishlist bound to the given person ID.
// The ID is stored as-is; no validation is performed.
func NewDoubanWishlist(personId string) *DoubanWishlist {
	w := new(DoubanWishlist)
	w.PersonId = personId
	return w
}
// wishlistUrl is the fmt template for one page of a user's movie wish list:
// %s receives the person ID and %d the value of the "start" query parameter.
const wishlistUrl = "https://movie.douban.com/people/%s/wish?sort=time&start=%d&mode=grid&tags_sort=count"
// GetWishlist crawls one page of the user's movie wish list and follows every
// movie link found on it. For each page visited that way the title, release
// year and IMDb ID are extracted into an importlist.Item and logged.
//
// page is 1-based and the "start" offset advances by 15 per page; values
// below 1 are treated as page 1 so the offset never becomes negative.
//
// The returned response is currently always nil: the extracted items are only
// logged and not yet collected. The error is the one reported while setting
// up the rate limit or visiting the wish list page itself; failures on the
// individual movie pages are logged and do not abort the crawl.
func (d *DoubanWishlist) GetWishlist(page int) (*importlist.Response, error) {
	if page < 1 {
		page = 1
	}

	c := colly.NewCollector(colly.UserAgent(ua))
	// "*" is a glob, not a regular expression: as DomainRegexp it fails to
	// compile, Limit returns an error and no delay is ever applied.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Delay:       10 * time.Second,
		RandomDelay: 2 * time.Second,
	}); err != nil {
		return nil, fmt.Errorf("set crawl rate limit: %w", err)
	}

	const listPrefix = "https://movie.douban.com/people"

	// Wish list page: queue every linked movie page.
	c.OnHTML("div[class='item comment-item']", func(e *colly.HTMLElement) {
		if !strings.HasPrefix(e.Request.URL.String(), listPrefix) {
			return
		}
		e.DOM.Find("div[class='pic'] a[title]").Each(func(_ int, link *goquery.Selection) {
			href, ok := link.Attr("href")
			if !ok {
				return
			}
			if err := c.Visit(href); err != nil {
				log.Infof("visit douban detail page %s: %v", href, err)
			}
		})
	})

	// Movie page: pull title, year and IMDb ID out of the content area.
	c.OnHTML("#content", func(h *colly.HTMLElement) {
		// The selector is not specific to movie pages, so skip anything
		// served from the wish list URL to avoid logging an empty item.
		if strings.HasPrefix(h.Request.URL.String(), listPrefix) {
			return
		}

		var item importlist.Item
		h.DOM.Find("h1").Each(func(_ int, heading *goquery.Selection) {
			heading.Find("span[property]").Each(func(_ int, s *goquery.Selection) {
				item.Title = s.Text()
			})
			heading.Find("span[class='year']").Each(func(_ int, s *goquery.Selection) {
				item.Year = parseWishlistYear(s.Text())
			})
		})
		h.DOM.Find("#info").Each(func(_ int, s *goquery.Selection) {
			if id := findImdbID(s.Text()); id != "" {
				item.ImdbID = id
			}
		})
		log.Info(item)
	})

	// TODO: collect the parsed items into an importlist.Response instead of
	// only logging them.
	pageURL := fmt.Sprintf(wishlistUrl, d.PersonId, (page-1)*15)
	return nil, c.Visit(pageURL)
}

// parseWishlistYear converts the text of a year element to an int, tolerating
// surrounding whitespace and parentheses such as "(2010)". It returns 0 when
// no number can be parsed.
func parseWishlistYear(text string) int {
	year, err := strconv.Atoi(strings.Trim(text, "() \t\r\n"))
	if err != nil {
		return 0
	}
	return year
}

// findImdbID scans a multi-line info block for a line starting with "imdb"
// (case-insensitive) and returns the first ':'-separated field on it that
// starts with "tt", lower-cased. It returns "" when there is no such field.
func findImdbID(info string) string {
	for _, line := range strings.Split(info, "\n") {
		line = strings.ToLower(strings.TrimSpace(line))
		if !strings.HasPrefix(line, "imdb") {
			continue
		}
		for _, field := range strings.Split(line, ":") {
			field = strings.TrimSpace(field)
			if strings.HasPrefix(field, "tt") {
				return field
			}
		}
	}
	return ""
}
// doHttpReq builds a request with the given method, URL and optional body,
// sets the shared browser User-Agent on it and executes it.
//
// It returns the response exactly as delivered by the HTTP client; the caller
// is responsible for closing the response body and checking the status code.
// An error is returned when the request cannot be constructed or sent, or
// when the exchange exceeds the client timeout.
func doHttpReq(method, url string, body io.Reader) (*http.Response, error) {
	req, err := http.NewRequest(method, url, body)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", ua)

	// http.DefaultClient never times out, so a stalled server would block the
	// caller forever. A client with a nil Transport still uses
	// http.DefaultTransport, so connection reuse is unaffected.
	client := &http.Client{Timeout: 30 * time.Second}
	return client.Do(req)
}

View File

@@ -9,3 +9,10 @@ func TestParseDoulist(t *testing.T) {
r, err := ParseDoulist("https://www.douban.com/doulist/81580/")
log.Info(r, err)
}
// Test111 is a live smoke test for DoubanWishlist.GetWishlist: it crawls the
// first wish list page of a fixed person ID and fails if the crawl reports an
// error. It needs network access, so it is skipped under `go test -short`.
func Test111(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping: fetches the wish list over the network")
	}
	d := NewDoubanWishlist("69894889")
	if _, err := d.GetWishlist(1); err != nil {
		t.Fatalf("GetWishlist(1): %v", err)
	}
}