Go语言爬取豆瓣电影排名

2018-11-25  本文已影响0人  大学渣PG

前面学习了常用的正则表达式和Go语言的IO,今天将用它们实现一个简单的爬虫,来爬取豆瓣电影排名

1.首先对豆瓣网电影排名网页URL进行分析

这是前三页的URL
https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=

经过分析可以得到一个通用的表达式(每页25部电影,i 为从0开始的页码,start 就是 i*25)

"https://movie.douban.com/top250?start="+strconv.Itoa(i*25)+"&filter="
2.分析网页源码

明确自己需要哪些数据


image.png

写出需要数据的正则表达式

电影名:<span class="title">([\u4E00-\u9FA5]+)</span>
导演:(.+)&nbsp;&nbsp;&nbsp;
国家和类型:&nbsp;/&nbsp;([\u4E00-\u9FA5]+.*)&nbsp;/&nbsp;([\u4E00-\u9FA5]+.*)[^</span>]$
分数:<span class="rating_num" property="v:average">([0-9]+\.[0-9]+)</span>
评价人数:<span>([0-9]+)人评价</span>
//Go语言的regexp包(RE2语法)不支持\uXXXX这种转义,汉字要用Unicode类\p{Han}表示,所以下面的代码会用[\p{Han}]替换[\u4E00-\u9FA5]
3.完成代码
package main

import (
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "regexp"
    "strconv"
    "strings"
)

// httpRsp2String issues a GET request for url and returns the complete
// response body as a string.
//
// It returns an error when the request itself fails, when the server answers
// with a non-2xx status, or when reading the body fails part-way through. On
// error the returned string is always empty.
func httpRsp2String(url string) (string, error) {
	rsp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer rsp.Body.Close()

	// An error page would otherwise be handed to the caller as if it were the
	// requested document.
	if rsp.StatusCode < 200 || rsp.StatusCode > 299 {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, rsp.Status)
	}

	// io.Copy stops on EOF and on any other read error, so a broken
	// connection can no longer keep the read loop spinning forever, and the
	// builder avoids re-copying the accumulated body on every chunk.
	var body strings.Builder
	if _, err := io.Copy(&body, rsp.Body); err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	return body.String(), nil
}

// Patterns that each pull one column out of a Top 250 listing page. They are
// compiled once here instead of once per fetched page.
var (
	filmNameRegex       = regexp.MustCompile(`<img width="100" alt="(?s:(.*?))" src="`)
	directorRegex       = regexp.MustCompile(`导演:\s*(.+)...<br>`)
	scoreRegex          = regexp.MustCompile(`<span class="rating_num" property="v:average">([0-9]+\.[0-9]+)</span>`)
	commentNumRegex     = regexp.MustCompile(`<span>([0-9]+)人评价</span>`)
	countryAndKindRegex = regexp.MustCompile(`&nbsp;/&nbsp;([\p{Han}]+.*)&nbsp;/&nbsp;([\p{Han}]+.*)`)
)

const (
	pageCount  = 10 // number of listing pages to fetch
	pageSize   = 25 // step of the "start" query parameter between pages
	resultPath = "./result.txt"
	rowFormat  = "%-5s|%-50s|%-20s|%-40s|%-40s|%-5s|%-20s\n"
)

// filmColumns accumulates the scraped values column by column. Entries that
// share an index are written out together as one row.
type filmColumns struct {
	names       []string
	countries   []string
	directors   []string
	kinds       []string
	scores      []string
	commentNums []string
}

// collect appends every match found in one listing page to the columns.
func (c *filmColumns) collect(page string) {
	for _, m := range filmNameRegex.FindAllStringSubmatch(page, -1) {
		c.names = append(c.names, m[1])
	}
	for _, m := range directorRegex.FindAllStringSubmatch(page, -1) {
		// The capture runs on past the director's name, so drop the &nbsp;
		// padding and keep only the text before the first "主".
		director := strings.Replace(m[1], `&nbsp;`, "", -1)
		director = strings.Split(director, "主")[0]
		c.directors = append(c.directors, director)
	}
	for _, m := range scoreRegex.FindAllStringSubmatch(page, -1) {
		c.scores = append(c.scores, m[1])
	}
	for _, m := range commentNumRegex.FindAllStringSubmatch(page, -1) {
		c.commentNums = append(c.commentNums, m[1])
	}
	for _, m := range countryAndKindRegex.FindAllStringSubmatch(page, -1) {
		c.countries = append(c.countries, m[1])
		c.kinds = append(c.kinds, m[2])
	}
}

// completeRows returns how many rows have a value in every column, and
// whether all columns have the same length.
func (c *filmColumns) completeRows() (n int, aligned bool) {
	n, aligned = len(c.names), true
	for _, l := range []int{len(c.countries), len(c.directors), len(c.kinds), len(c.scores), len(c.commentNums)} {
		if l != n {
			aligned = false
		}
		if l < n {
			n = l
		}
	}
	return n, aligned
}

// writeRows writes the header line followed by one line per complete row.
func writeRows(file *os.File, cols *filmColumns) error {
	if _, err := fmt.Fprintf(file, rowFormat, "排名", "电影名", "国家", "导演", "类型", "分数", "评价人数"); err != nil {
		return err
	}
	// Each column is filled by its own regex, so the columns can end up with
	// different lengths. Only index as far as the shortest one instead of
	// assuming a fixed row count and panicking on a short column.
	n, aligned := cols.completeRows()
	if !aligned {
		log.Printf("columns have different lengths; writing only the first %d rows", n)
	}
	for i := 0; i < n; i++ {
		_, err := fmt.Fprintf(file, rowFormat, strconv.Itoa(i+1), cols.names[i], cols.countries[i],
			cols.directors[i], cols.kinds[i], cols.scores[i], cols.commentNums[i])
		if err != nil {
			return err
		}
	}
	return nil
}

// crawl fetches every listing page, extracts the columns and writes them to
// resultPath, creating the file or truncating an existing one.
func crawl() error {
	var cols filmColumns
	for page := 0; page < pageCount; page++ {
		pageURL := "https://movie.douban.com/top250?start=" + strconv.Itoa(page*pageSize) + "&filter="
		html, err := httpRsp2String(pageURL)
		if err != nil {
			return fmt.Errorf("fetching %s: %w", pageURL, err)
		}
		cols.collect(html)
	}

	// os.Create already returns a writable handle, so the file is opened
	// exactly once and that handle is always closed.
	file, err := os.Create(resultPath)
	if err != nil {
		return err
	}
	if err := writeRows(file, &cols); err != nil {
		file.Close() // best effort; the write error is the one worth reporting
		return fmt.Errorf("writing %s: %w", resultPath, err)
	}
	return file.Close()
}

// main scrapes the Douban Top 250 listing and writes it as a table to
// ./result.txt. Any failure is logged and ends the program with a non-zero
// exit status.
func main() {
	// log.Fatal is called only here, after crawl has returned, so the exit
	// cannot skip any cleanup inside crawl.
	if err := crawl(); err != nil {
		log.Fatal(err)
	}
}

上一篇下一篇

猜你喜欢

热点阅读