go-colly

2022-12-08  本文已影响0人  hehehehe
package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "regexp"
    "strings"
)

// main crawls the 2020 administrative-division code pages published on
// stats.gov.cn. It starts at the province index, follows links to the city
// and county pages, and prints every entry it finds. Town-level URLs are
// computed and printed but not visited.
func main() {
    c := colly.NewCollector()
    c.DetectCharset = true

    // Matches the trailing "<digits>.html" page name of the current URL.
    // Child links on these pages are relative to the directory of the page,
    // so swapping the page name for the href yields the child URL.
    // Compiled once here instead of once per table row.
    pageFile := regexp.MustCompile(`\d+\.html$`)

    // childURL replaces the page name of pageURL with href. The literal
    // variant is used so that a '$' in href is never treated as a
    // capture-group reference.
    childURL := func(pageURL, href string) string {
        return pageFile.ReplaceAllLiteralString(pageURL, href)
    }

    // visit queues target on the collector and reports failures instead of
    // dropping them silently.
    visit := func(target string) {
        if err := c.Visit(target); err != nil {
            fmt.Println("visit failed:", target, err)
        }
    }

    // rowLink extracts the code (first cell), the name (second cell) and the
    // href of the second cell's link from a city/county table row. ok is
    // false when the row carries no link to a lower level.
    rowLink := func(e *colly.HTMLElement) (code, name, href string, ok bool) {
        code = e.DOM.Find("td:nth-child(1)").Find("a:first-child").Text()
        link := e.DOM.Find("td:nth-child(2)").Find("a:first-child")
        name = link.Text()
        href, ok = link.Attr("href")
        return code, name, href, ok
    }

    // Province index: each matching row holds province links.
    c.OnHTML("tr[class='provincetr']", func(e *colly.HTMLElement) {
        e.ForEachWithBreak("a", func(i int, element *colly.HTMLElement) bool {
            provName := element.Text
            href := element.Attr("href")
            if href == "" {
                // No target to follow; try the next link in this row.
                return true
            }
            indexURL := element.Request.URL.String()
            cityURL := strings.ReplaceAll(indexURL, "index.html", href)
            fmt.Println(provName, cityURL)
            visit(cityURL)
            // NOTE(review): returning false stops after the first followed
            // link of each row, so only one province per row is crawled.
            // Kept as in the original; return true to crawl every province.
            return false
        })
    })

    // City page: print each city and descend into its county page.
    c.OnHTML("tr[class='citytr']", func(e *colly.HTMLElement) {
        cityCode, cityName, href, ok := rowLink(e)
        if !ok {
            // Row without a link: nothing below it to crawl.
            fmt.Println(cityCode, cityName)
            return
        }
        countyURL := childURL(e.Request.URL.String(), href)
        fmt.Println(cityCode, cityName, href, countyURL)
        visit(countyURL)
    })

    // County page: print each county together with its town-page URL.
    // Town pages are intentionally not visited.
    c.OnHTML("tr[class='countytr']", func(e *colly.HTMLElement) {
        adCode, adName, href, ok := rowLink(e)
        if !ok {
            fmt.Println(adCode, adName)
            return
        }
        townURL := childURL(e.Request.URL.String(), href)
        fmt.Println(adCode, adName, href, townURL)
    })

    // Before making a request print "Visiting ..."
    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL.String())
    })

    // Start scraping at the 2020 province index page.
    visit("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html")
}

上一篇 下一篇

猜你喜欢

热点阅读