golang从零起步

爬虫自动转码,获取城市列表:城市名称+URL

2018-08-24  本文已影响0人  次序

//安装 gopm 包管理工具(用于下载 golang.org/x 下的第三方包)
go get -v github.com/gpmgo/gopm

//自动导入包插件
gopm get -g -v golang.org/x/tools/cmd/goimports

//转码插件
gopm get -g -v golang.org/x/text

//自动检测网页编码(代码中使用其子包 golang.org/x/net/html/charset)
gopm get -g -v golang.org/x/net/html

package main

import (
    "net/http"
    "io/ioutil"
    "fmt"
    "golang.org/x/text/transform"
    "io"
    "golang.org/x/text/encoding"
    "golang.org/x/net/html/charset"
    "bufio"
    "regexp"
)

// main fetches the zhenai.com city index page, converts the response body to
// UTF-8 using the encoding detected from its leading bytes, and prints every
// city name and URL found in it.
//
// Any network or read error aborts the program with a panic; a non-200
// response is reported on stdout and the program exits normally.
func main() {
	resp, err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		fmt.Println("Error:status code", resp.StatusCode)
		return
	}

	// Encoding detection and decoding must share ONE buffered reader.
	// determinEncoding calls bufio.NewReader on its argument; handing it the
	// raw body would pull the first chunk of the page into a throw-away
	// buffer, and the decoder below would then start reading mid-document.
	// bufio.NewReader returns an existing *bufio.Reader unchanged, so the
	// Peek inside determinEncoding leaves these bytes available for decoding.
	bodyReader := bufio.NewReader(resp.Body)
	e := determinEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	all, err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	printCityList(all)
}
// cityListRe matches one city link on the index page. Submatch 1 is the city
// URL (a lowercase alphanumeric slug under /zhenghun/), submatch 2 is the
// link text, i.e. the city name. It is compiled once at package scope rather
// than on every call.
var cityListRe = regexp.MustCompile(`<a href="(http://www\.zhenai\.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`)

// printCityList scans contents (the UTF-8 HTML of the city index page) for
// city links and prints one "City: <name>, URL: <url>" line per link to
// stdout, followed by the total number of links found.
func printCityList(contents []byte) {
	matches := cityListRe.FindAllSubmatch(contents, -1)
	for _, m := range matches {
		// m[0] is the whole anchor tag; m[1] is the URL, m[2] the city name.
		fmt.Printf("City: %s, URL: %s\n", m[2], m[1])
	}
	fmt.Printf("Matches found:%d\n", len(matches))
}

// determinEncoding guesses the character encoding of the document readable
// from r by inspecting up to its first 1024 bytes with
// charset.DetermineEncoding.
//
// r is wrapped with bufio.NewReader before peeking. If r is already a
// *bufio.Reader with a large enough buffer, bufio.NewReader returns it
// unchanged and the peeked bytes stay available to the caller. Otherwise the
// bytes pulled into the temporary buffer are consumed from r and cannot be
// read again.
//
// It panics on any read error other than io.EOF.
func determinEncoding(r io.Reader) encoding.Encoding {
	head, err := bufio.NewReader(r).Peek(1024)
	// Peek reports io.EOF when the stream holds fewer than 1024 bytes, but
	// still returns the bytes it did get; a short document is not a failure.
	if err != nil && err != io.EOF {
		panic(err)
	}
	// The encoding name and the "certain" flag are not needed by callers.
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}

上一篇下一篇

猜你喜欢

热点阅读