go-colly
2022-12-08 本文已影响0人
hehehehe
package main
import (
	"fmt"
	"log"
	"regexp"
	"strings"

	"github.com/gocolly/colly"
)
// pageFileRe matches the "<digits>.html" file name inside a listing page URL.
// It is compiled once at package scope instead of once per scraped table row.
var pageFileRe = regexp.MustCompile(`\d+\.html`)

// childPageURL derives the URL of a child listing page from the URL of the
// page currently being scraped (pageURL) and the relative href found in one of
// its rows, by substituting href for the "<digits>.html" file name of pageURL.
// The substitution is literal, so a "$" in href is not interpreted as a
// regexp group reference.
func childPageURL(pageURL, href string) string {
	return pageFileRe.ReplaceAllLiteralString(pageURL, href)
}

// rowFields extracts the code (first cell), the name (second cell) and the
// relative link of the second cell from a table row. The cell text is read
// rather than the anchor text so that rows whose cells are not wrapped in a
// link still yield their code and name; href is "" for such rows.
func rowFields(e *colly.HTMLElement) (code, name, href string) {
	code = e.ChildText("td:nth-child(1)")
	name = e.ChildText("td:nth-child(2)")
	href = e.ChildAttr("td:nth-child(2) a:first-child", "href")
	return code, name, href
}

// visit asks the collector to fetch rawURL and logs the error, if any,
// instead of silently dropping it.
func visit(c *colly.Collector, rawURL string) {
	if err := c.Visit(rawURL); err != nil {
		log.Printf("visit %s: %v", rawURL, err)
	}
}

// main scrapes the 2020 administrative division code listing published on
// www.stats.gov.cn: it starts at the province index, follows links to city
// pages, and prints the code, name and link of every city and county row.
func main() {
	const startURL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html"

	c := colly.NewCollector()
	c.DetectCharset = true

	// Province index: called for every row of province links.
	c.OnHTML("tr[class='provincetr']", func(e *colly.HTMLElement) {
		e.ForEachWithBreak("a", func(_ int, link *colly.HTMLElement) bool {
			provName := link.Text
			href := link.Attr("href")
			provURL := link.Request.URL.String()
			cityURL := strings.ReplaceAll(provURL, "index.html", href)
			fmt.Println(provName, cityURL)
			// Without an href the computed URL is not a real city page.
			if href != "" {
				visit(c, cityURL)
			}
			// Returning false stops the iteration, so only the first link of
			// each province row is followed.
			// NOTE(review): presumably a deliberate limit on the crawl size;
			// confirm before returning true here.
			return false
		})
	})

	// City page: print each city row and follow its link to the county page.
	c.OnHTML("tr[class='citytr']", func(e *colly.HTMLElement) {
		cityCode, cityName, href := rowFields(e)
		if href == "" {
			// No link in this row: there is no child page to derive or visit.
			fmt.Println(cityCode, cityName)
			return
		}
		countyURL := childPageURL(e.Request.URL.String(), href)
		fmt.Println(cityCode, cityName, href, countyURL)
		visit(c, countyURL)
	})

	// County page: print each county row together with its town page URL.
	// Town pages are intentionally not crawled; call visit(c, townURL) here
	// to descend one more level.
	c.OnHTML("tr[class='countytr']", func(e *colly.HTMLElement) {
		countyCode, countyName, href := rowFields(e)
		if href == "" {
			fmt.Println(countyCode, countyName)
			return
		}
		townURL := childPageURL(e.Request.URL.String(), href)
		fmt.Println(countyCode, countyName, href, townURL)
	})

	// Before making a request print "Visiting ...".
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Start scraping at the province index page.
	if err := c.Visit(startURL); err != nil {
		log.Fatalf("visit %s: %v", startURL, err)
	}
}