# py爬豆瓣影评2.0
# 2020-01-15 本文已影响0人
# 少年阿蒿
from urllib import request
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
import time
import random
# 全局取消证书验证 (disable HTTPS certificate verification globally)
import ssl
# Swap the ssl module's default HTTPS context factory for the unverified one,
# so HTTPS connections that rely on that default skip certificate checks.
# NOTE(review): this uses private ssl attributes and presumably only affects
# stdlib clients such as urllib, not the `requests` call in get_data — confirm.
ssl._create_default_https_context = ssl._create_unverified_context
# 设置代理 (proxy configuration)
# Candidate proxy endpoints, each written as "host:port".
proxy = [
    '49.77.209.30:9999',
    '111.72.25.193:9999',
    '223.199.30.119:9999',
]
# A single endpoint is drawn at random when the module loads; the resulting
# mapping is what get_data hands to the HTTP client for https traffic.
proxies = dict(https='https://{}'.format(random.choice(proxy)))
# 获取数据 (fetch data)
def get_data(page):
    """Download one page of short comments and append it to ``index2.html``.

    Args:
        page: Comment offset placed in the ``start`` query parameter; the
            URL asks for 20 comments per request.

    The request is sent through the module-level ``proxies`` mapping with a
    browser-like ``User-Agent``. When the server does not answer with status
    200 nothing is written and a short notice is printed instead.
    """
    url = ('https://movie.douban.com/subject/30394535/comments?start='
           + str(page) + '&limit=20&sort=new_score&status=P')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
    }
    print(proxies)
    print(url)

    # requests.get() already performs the HTTP request. The original passed
    # its Response object on to urllib's urlopen(), which cannot accept it,
    # and never sent the headers it had built.
    # A timeout keeps a dead proxy from hanging the whole run.
    response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
    if response.status_code != 200:
        print('skipped page', page, 'status', response.status_code)
        return

    # Decode the raw bytes explicitly as UTF-8, as the original did, rather
    # than relying on the encoding requests guesses from the headers.
    data = response.content.decode('utf-8')

    # Append mode: every fetched page accumulates in the same file, which
    # parse_data later reads in one go.
    with open('index2.html', mode='a', encoding='utf-8') as f:
        f.write(data)
# 处理数据 (parse data)
def parse_data():
    """Parse the saved HTML in ``index2.html`` into a list of comment rows.

    Returns:
        A list of dicts with the keys ``nickname``, ``comment``, ``score``
        and ``cdate``. Comment items missing any of these fields are
        skipped.
    """
    with open('index2.html', mode='r', encoding='utf-8') as f:
        html = f.read()

    bs = BeautifulSoup(html, 'html.parser')
    # Comment items live under <div class="mod-bd" id="comments">.
    divs = bs.select('#comments .comment-item')

    result = []
    # NOTE(review): the first matched item is skipped, exactly as in the
    # original code — confirm that dropping it is intended.
    for div in divs[1:]:
        try:
            nickname = div.select('.comment-item .comment .comment-info a')[0].get_text(strip=True)
            comment = div.select('.comment-item .comment .short')[0].get_text(strip=True)
            # Run the span query once; both values come from ``title``
            # attributes of spans inside the comment-info element.
            spans = div.select('.comment-item .comment .comment-info span')
            score = spans[1]['title']
            cdate = spans[2]['title']
        except (IndexError, KeyError):
            # An expected element or its ``title`` attribute is missing, so
            # the item is incomplete and is left out of the result.
            continue
        result.append({
            'nickname': nickname,
            'comment': comment,
            'score': score,
            'cdate': cdate,
        })
    return result
# 保存数据 (save data)
def save_to_excel(data):
    """Write comment rows to the spreadsheet ``豆瓣评论.xlsx``.

    Args:
        data: Iterable of dicts, each providing the keys ``nickname``,
            ``comment``, ``score`` and ``cdate``.
    """
    book = Workbook()
    # Index 0 places this sheet first in the workbook.
    # NOTE(review): any sheet Workbook() creates by default is left in
    # place after it — confirm whether it should be removed.
    sheet = book.create_sheet('豆瓣短评', 0)
    sheet.append(['用户', '评论内容', '用户评分', '评论时间'])
    for item in data:
        sheet.append([item['nickname'], item['comment'], item['score'], item['cdate']])
    # Save once, after all rows have been appended.
    book.save('豆瓣评论.xlsx')
# Script entry point. The original guard compared an undefined ``name`` with
# 'main'; the underscores of ``__name__`` / ``__main__`` had been lost.
if __name__ == '__main__':
    # Twelve requests at comment offsets 0, 20, ..., 220.
    for start in range(0, 12 * 20, 20):
        get_data(start)
        # Pause between requests.
        time.sleep(10)
    # Parse the accumulated HTML once and export it. The original also made
    # an extra parse_data() call whose result was discarded.
    save_to_excel(parse_data())