以《无双》为例,使用Python爬取豆瓣短评绘制词云
2018-11-03 本文已影响0人
龙小江i
有时候,假的要比真的还要好 ——《无双》
- 全部代码
# 导入所需库
import re,jieba,numpy,requests
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from wordcloud import WordCloud
# Prompt for the Douban movie ID (kept as a string, it is spliced into URLs)
# and for the number of comment pages to collect (converted to int right away)
id = input('输入电影ID:')
pages = int(input('采集多少页评论:'))
# Collection helpers: download Douban short comments and keep only their Chinese text

# NOTE(review): Douban is reported to answer the default python-requests
# User-Agent with an error status, so a browser-like one is sent instead.
_DOUBAN_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
# Runs of CJK unified ideographs; everything else (punctuation, digits,
# latin letters, whitespace) is discarded.
_CHINESE_RUN = re.compile(r'[\u4e00-\u9fa5]+')


def _extract_chinese(texts):
    """Concatenate *texts* and return only the Chinese characters they contain."""
    return ''.join(_CHINESE_RUN.findall(''.join(texts)))


def select_comments(url=None):
    """Fetch one Douban page and return the Chinese text of its short comments.

    Args:
        url: Page to scrape. When omitted, the movie's main page is built from
            the module-level ``id`` (the original, argument-less behaviour).

    Returns:
        A single string made of the Chinese characters of every short comment
        found in the first ``div.mod-bd`` container; ``''`` when the page has
        no such container or no comments.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    if url is None:
        url = 'https://movie.douban.com/subject/' + id + '/'
    response = requests.get(url, headers=_DOUBAN_HEADERS, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were comments
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    containers = soup.find_all('div', 'mod-bd')
    if not containers:
        return ''
    # get_text() never returns None, unlike .string on a span with nested markup
    raw_texts = [span.get_text() for span in containers[0].find_all('span', 'short')]
    # Skip blank entries and one-character comments, as the original filter did
    kept = [text for text in raw_texts if len(text) > 1]
    return _extract_chinese(kept)


def select_all_comments(ID, Name, Page):
    """Collect the short comments of ``Page`` consecutive comment pages.

    Args:
        ID: Douban movie ID, inserted into the comments URL.
        Name: Unused; kept so existing callers keep working.
        Page: Number of pages to fetch (the URL asks for 20 comments per page).

    Returns:
        The cleaned Chinese text of all fetched pages, concatenated in page order.
    """
    url_template = ('https://movie.douban.com/subject/{}/comments'
                    '?start={}&limit=20&sort=new_score&status=P')
    # start = 0, 20, 40, ... : exactly ``Page`` pages, each fetched from its own URL
    page_texts = [
        select_comments(url_template.format(ID, start))
        for start in range(0, Page * 20, 20)
    ]
    return ''.join(page_texts)
# Start collecting: one cleaned string holding the Chinese text of every page
cleaned_comments = select_all_comments(id, id, pages)

# Tokenize the text with jieba and keep the tokens in a one-column DataFrame
segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({'segment': segment})

# Remove stop words (one word per line; quoting=3 is csv.QUOTE_NONE so quote
# characters in the list are treated as ordinary text)
stopwords = pd.read_csv(
    'C:/Users/longxiaojiangi/Desktop/stopwords/chinese.txt',
    sep='\t', quoting=3, names=['stopword'],
)
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# Word frequency, most frequent first.
# groupby().size() replaces SeriesGroupBy.agg({"计数": numpy.size}): renaming
# through a dict was removed in pandas 1.0 and raises SpecificationError there.
words_stat = (
    words_df.groupby('segment')
    .size()
    .reset_index(name='计数')
    .sort_values(by='计数', ascending=False)
)

# Draw the word cloud from the frequency table.
# "%matplotlib inline" is IPython-only syntax; this is its plain-Python
# equivalent and is simply skipped when the script runs outside IPython/Jupyter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# A font with CJK glyphs is required, otherwise the words render as empty boxes
cloud = WordCloud(font_path='C:\\Windows\\Fonts\\simsun.ttc', width=500, height=300)
frequencies = dict(zip(words_stat['segment'], words_stat['计数']))
pic_cloud = cloud.fit_words(frequencies)
plt.imshow(pic_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
- 备注:《无双》在豆瓣的 ID 为 26425063,我这里采集了 20 页的短评数据。停用词表在百度上很容易搜到,这里也给出一个参考链接:中文常用停用词表(哈工大停用词表、百度停用词表等)。
无双.jpg