Go语言爬取豆瓣电影排名
2018-11-25 本文已影响0人
大学渣PG
前面学习了常用的正则表达式和Go语言的IO,今天将会使用它们实现一个简单的爬虫,来爬取豆瓣电影排名
1.首先对豆瓣网电影排名网页URL进行分析
这是前三页的URL
https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=
经过分析可以得到一个通用的表达式
"https://movie.douban.com/top250?start="+strconv.Itoa(i*25)+"&filter="
2.分析网页源码
明确自己需要哪些数据
image.png
写出需要数据的正则表达式
电影名:<span class="title">([\u4E00-\u9FA5]+)</span>
导演:(.+)
国家和类型: / ([\u4E00-\u9FA5]+.*) / ([\u4E00-\u9FA5]+.*)[^</span>]$
分数:<span class="rating_num" property="v:average">([0-9]+\.[0-9]+)</span>
评价人数:<span>([0-9]+)人评价</span>
//注意:Go语言的regexp包不支持\uXXXX这种写法,汉字要用[\p{Han}]表示,所以下面的代码中会用[\p{Han}]替换[\u4E00-\u9FA5]
3.完成代码
package main
import (
"fmt"
"io"
"log"
"net/http"
"os"
"regexp"
"strconv"
"strings"
)
// httpRsp2String issues a GET request for url and returns the whole response
// body as a string.
//
// An error is returned when the request itself fails or when the body cannot
// be read to the end; in both cases the returned string is empty.
func httpRsp2String(url string) (string, error) {
	rsp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer rsp.Body.Close()

	// Stream the body into a builder. io.Copy stops cleanly at io.EOF and
	// reports every other read error, so a broken connection surfaces as an
	// error instead of looping forever, and the accumulated text is not
	// re-copied for each chunk that arrives.
	var body strings.Builder
	if _, err := io.Copy(&body, rsp.Body); err != nil {
		return "", err
	}
	return body.String(), nil
}
// main downloads the ten pages of the Douban Top 250 movie list, extracts the
// film name, director, country, kind, score and number of ratings from each
// page with regular expressions, and writes one formatted row per film to
// ./result.txt (the file is created, or truncated if it already exists).
//
// Each column is collected by its own pattern, independently of the others.
// If a pattern misses an entry, that column ends up shorter than expected and
// its later values no longer line up with the other columns; a warning is
// logged and only the rows for which every column has a value are written.
func main() {
	const (
		pageCount = 10
		pageSize  = 25
		rowFormat = "%-5s|%-50s|%-20s|%-40s|%-40s|%-5s|%-20s\n"
	)

	// Compile every pattern once, up front, instead of once per page.
	filmNameRegex := regexp.MustCompile(`<img width="100" alt="(?s:(.*?))" src="`)
	directorRegex := regexp.MustCompile(`导演:\s*(.+)...<br>`)
	scoreRegex := regexp.MustCompile(`<span class="rating_num" property="v:average">([0-9]+\.[0-9]+)</span>`)
	commentNumRegex := regexp.MustCompile(`<span>([0-9]+)人评价</span>`)
	countryAndKindRegex := regexp.MustCompile(` / ([\p{Han}]+.*) / ([\p{Han}]+.*)`)

	var filmNames, directors, commentNums, scores, countrys, kinds []string

	for i := 0; i < pageCount; i++ {
		pageURL := "https://movie.douban.com/top250?start=" + strconv.Itoa(i*pageSize) + "&filter="
		result, err := httpRsp2String(pageURL)
		if err != nil {
			log.Fatal(err)
		}

		for _, v := range filmNameRegex.FindAllStringSubmatch(result, -1) {
			filmNames = append(filmNames, v[1])
		}
		for _, v := range directorRegex.FindAllStringSubmatch(result, -1) {
			// Drop the spaces, then keep only the text before the first "主".
			tmpStr := strings.Replace(v[1], ` `, "", -1)
			tmpStr = strings.Split(tmpStr, "主")[0]
			directors = append(directors, tmpStr)
		}
		for _, v := range scoreRegex.FindAllStringSubmatch(result, -1) {
			scores = append(scores, v[1])
		}
		for _, v := range commentNumRegex.FindAllStringSubmatch(result, -1) {
			commentNums = append(commentNums, v[1])
		}
		for _, v := range countryAndKindRegex.FindAllStringSubmatch(result, -1) {
			countrys = append(countrys, v[1])
			kinds = append(kinds, v[2])
		}
	}

	// Never index past the shortest column: a page whose markup did not match
	// one of the patterns must not crash the program with an out-of-range
	// panic.
	rows := pageCount * pageSize
	for _, column := range [][]string{filmNames, countrys, directors, kinds, scores, commentNums} {
		if len(column) < rows {
			rows = len(column)
		}
	}
	if rows < pageCount*pageSize {
		log.Printf("expected %d films but only %d complete rows were extracted", pageCount*pageSize, rows)
	}

	// os.Create already truncates and opens the file for writing, so a single
	// handle is enough; it is closed explicitly so a close error is reported.
	file, err := os.Create("./result.txt")
	if err != nil {
		log.Fatal(err)
	}

	if _, err := fmt.Fprintf(file, rowFormat, "排名", "电影名", "国家", "导演", "类型", "分数", "评价人数"); err != nil {
		log.Fatal(err)
	}
	for i := 0; i < rows; i++ {
		if _, err := fmt.Fprintf(file, rowFormat, strconv.Itoa(i+1), filmNames[i], countrys[i], directors[i], kinds[i], scores[i], commentNums[i]); err != nil {
			log.Fatal(err)
		}
	}
	if err := file.Close(); err != nil {
		log.Fatal(err)
	}
}