python数据分析-豆瓣电影Top250
2017-12-30 本文已影响0人
931180482c82
数据爬取源码
详细过程大家可以在其他帖子中看到.
这里采用 Python 2.7 以及标准库 urllib2 和 re 进行爬取.
# coding=utf-8
import urllib2
import re
import time
def get_Request(page):
    """Fetch one page of the Douban Top 250 listing and return its raw body.

    Args:
        page: Offset of the first movie to list; sent as the ``start``
            query parameter.

    Returns:
        The response body exactly as read from the connection.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the
            request fails.
    """
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'
        ),
        'Host': 'movie.douban.com',
    }
    req = urllib2.Request(url + '?start=' + str(page) + '&filter=', headers=headers)
    response = urllib2.urlopen(req)
    # urllib2 responses are not context managers on Python 2, so close the
    # connection explicitly even if read() raises.
    try:
        return response.read()
    finally:
        response.close()
def get_content(res):
    """Extract movie records from the HTML of one Top 250 listing page.

    Args:
        res: HTML text of a listing page.

    Returns:
        A list of dicts, one per successfully parsed ``<div class="item">``
        entry, with the string-valued keys ``name``, ``other_name``,
        ``director``, ``year``, ``country``, ``type`` and ``score``.
        ``other_name`` is ``'0'`` when the entry has only one title span.
        Entries missing any required field are skipped.
    """
    items = re.findall(r'<div class="item">([\s\S]+?)</div>\s+</li>', res, re.S)
    all_data = []
    for item in items:
        title = re.findall(r'<span class="title">(.*?)</span>', item)
        director = re.findall(r'<p class="">\s+导演:\s+(.*?) ', item)
        other = re.findall(r'<br>\s+(\d{4,}) / (.*?) / (.*?)\s+</p>', item)
        score = re.findall(
            r'<span class="rating_num" property="v:average">(\d\.\d+)</span>', item)
        try:
            year, country, genre = other[0]
            data = dict(
                name=title[0],
                # A second title span holds the alternate name prefixed by
                # ' / '; commas are replaced so they cannot clash with the
                # comma-separated output format.
                other_name=(title[1].replace(' / ', '').replace(',', ' ')
                            if len(title) > 1 else '0'),
                director=director[0],
                year=year,
                country=country.replace(' ', '/'),
                type=genre.replace(' ', '/'),
                score=score[0],
            )
        except IndexError:
            # A pattern found nothing for this entry. Skip it rather than
            # re-appending the previous entry's record.
            continue
        all_data.append(data)
    return all_data
def save_data(data):
    """Append movie records to ``data.txt`` as comma-separated lines.

    Each record is written as one line in the fixed field order name,
    other_name, director, year, country, type, score, and the same line is
    echoed to standard output.

    Args:
        data: Iterable of dicts holding a string value for each field.

    Raises:
        KeyError: If a record lacks one of the fields.
    """
    fields = ['name', 'other_name', 'director', 'year', 'country', 'type', 'score']
    # The with-block closes the file, so no explicit close() is needed.
    with open('data.txt', 'a') as f:
        for line in data:
            value = ','.join(line[field] for field in fields)
            f.write(value + '\n')
            print(value + '\n')
def run(page):
    """Fetch, parse and save one listing page.

    Args:
        page: Offset passed to ``get_Request`` to select the page.
    """
    res = get_Request(page)
    data = get_content(res)
    save_data(data)
if __name__ == "__main__":
    # Crawl offsets 0, 25, ..., 225, pausing briefly between requests.
    for page in range(0, 250, 25):
        run(page=page)
        time.sleep(0.5)
    print('finished data crawl')
数据分析
影片类型分析
这里对爬取下来的数据集进行处理,并按"/"对类型字段进行切割.
分割前:
1.jpg