Python爬取豆瓣电影TOP250
2018-08-02 本文已影响0人
3ni
部分代码引用于此:链接
运行环境:macOS 10.13.6,Python 2.7
代码:
# -*- coding:utf-8 -*-
import urllib
import urllib2
# useragent 存放着各个浏览器的User-Agent,自己写的模块,非系统库
import useragent
import BeautifulSoup
import re
import csv
import sys
# Base URL of the Douban Top 250 listing; main() passes it to start().
url = 'https://movie.douban.com/top250'
def get_movie_info(req):
    """Fetch one listing page and parse the movie entries it contains.

    Args:
        req: Anything accepted by ``urllib2.urlopen`` (a ``urllib2.Request``
            in this script).

    Returns:
        A list with one row per ``<li>`` found inside the
        ``<ol class="grid_view">`` element. Each row is
        ``[rank, name, director, starring, year, area, grade, quote]``.
        Fields that cannot be extracted are replaced by a placeholder
        string instead of aborting the whole page.
    """
    res = urllib2.urlopen(req)
    try:
        page = res.read()
    finally:
        # Release the connection even when reading the body fails.
        res.close()

    soup = BeautifulSoup.BeautifulSoup(page)
    data = soup.find('ol', {'class': 'grid_view'})

    record = []
    for item in data.findAll('li'):
        rank = item.find('em').getText()
        name = item.find('img')['alt']
        info = item.find('p').getText()

        # The director/starring patterns contain non-ASCII characters and are
        # plain (byte) string literals under Python 2, so they are matched
        # against the UTF-8 encoded text. Encode once and reuse.
        info_utf8 = info.encode('utf-8')

        directors = re.findall('导演: (.*?)  ', info_utf8)
        director = directors[0] if directors else '佚名'

        stars = re.findall('主演: (.*?) /...', info_utf8)
        starring = stars[0] if stars else '佚名'

        # Year and area previously crashed the crawl (AttributeError /
        # IndexError) when the pattern was absent; fall back to a placeholder
        # like the other optional fields do.
        year_match = re.search(r'\d{4}', info)
        year = year_match.group() if year_match else '未知'

        areas = re.findall('/ (.*?) ', info)
        area = areas[0] if areas else '未知'

        grade = item.findAll('span', {'class': 'rating_num'})[0].getText()

        # Not every entry carries a one-line quote.
        quotes = item.findAll('span', {'class': 'inq'})
        quote = quotes[0].getText() if quotes else '无'

        record.append(
            [rank, name, director, starring, year, area, grade, quote])
    return record
def start(url):
    """Crawl every listing page under *url* and write the rows to a CSV file.

    Requests offsets 0, 25, ..., 225 (ten pages), parses each page with
    ``get_movie_info`` and writes the rows, preceded by a header row, to
    ``doubantop250.csv`` in the current working directory.

    Args:
        url: Base URL of the listing, without a query string.
    """
    head = ['排名', '名字', '导演', '主演', '年份', '地区', '评分', '简介']
    # The User-Agent header is identical for every page, so build it once.
    headers = {'User-Agent': useragent.osx_user_agent}

    # Python 2's csv module expects the file to be opened in binary mode.
    with open('doubantop250.csv', mode='wb') as f:
        fd = csv.writer(f)
        fd.writerow(head)
        for page in range(0, 250, 25):
            # A list of pairs keeps the parameter order deterministic; an
            # empty string yields "filter=" rather than "filter=None".
            query = urllib.urlencode([('start', page), ('filter', '')])
            page_url = url + '?' + query
            # Deliberately no ``data`` argument: supplying one makes urllib2
            # send a POST, whereas paging is driven by the GET query string.
            request = urllib2.Request(url=page_url, headers=headers)
            print(page_url)
            fd.writerows(get_movie_info(request))
def main():
    """Script entry point: set the default encoding, then run the crawl."""
    # Python 2 idiom: reload(sys) is called first so that
    # sys.setdefaultencoding can be used on the next line.
    reload(sys)
    # NOTE(review): presumably needed so rows mixing unicode and UTF-8 byte
    # strings can be written by csv without an encode error -- confirm.
    sys.setdefaultencoding('utf-8')
    start(url)
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()