Python爬虫

以《无双》为例，使用 Python 爬取豆瓣短评并绘制词云

2018-11-03  本文已影响0人  龙小江i

有时候,假的要比真的还要好 ——《无双》

# 导入所需库
import re,jieba,numpy,requests
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Ask for the Douban movie ID and the number of comment pages to collect.
# The name `id` shadows the builtin, but it is kept because select_comments
# reads this module-level variable when building its URL.
id = input('输入电影ID:')
pages = int(input('采集多少页评论:'))

# Scrape a single page of short comments
def select_comments(url=None):
    """Download one Douban page and return its short comments as Chinese text.

    Args:
        url: Address of the page to download. When omitted, the movie's
            subject page is built from the module-level ``id`` variable,
            which is what this function always did before it accepted a URL.

    Returns:
        One string holding every run of characters in the range
        U+4E00-U+9FA5 found in the page's short comments, concatenated
        without separators. An empty string is returned when the page
        contains no ``div.mod-bd`` section.
    """
    if url is None:
        url = 'https://movie.douban.com/subject/' + id + '/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Only the first "mod-bd" section is inspected; bail out instead of
    # raising IndexError when the page has none.
    sections = soup.find_all('div', 'mod-bd')
    if not sections:
        return ''

    pieces = []
    for span in sections[0].find_all('span', 'short'):
        # get_text() always yields a str, whereas .string is None for a tag
        # with several child nodes and would break the length check below.
        text = span.get_text()
        # Skip empty / single-character entries (this also drops a lone '\n').
        if len(text) > 1:
            pieces.append(text.strip())
    comments = ''.join(pieces)

    # Keep only the Chinese characters.
    return ''.join(re.findall(r'[\u4e00-\u9fa5]+', comments))


# Scrape the requested number of comment pages
def select_all_comments(ID, Name, Page):
    """Collect the cleaned short comments from the first ``Page`` pages.

    Args:
        ID: Douban movie ID inserted into the comments URL.
        Name: Unused; kept so existing callers keep working.
        Page: Number of comment pages to download (20 comments per page).
            Zero or a negative number downloads nothing.

    Returns:
        The concatenation of ``select_comments`` results for every page.
    """
    template = ('https://movie.douban.com/subject/{}/comments'
                '?start={}&limit=20&sort=new_score&status=P')
    collected = []
    for index in range(Page):
        # Each page starts 20 comments after the previous one; the URL is
        # handed to select_comments so that every page is actually fetched.
        collected.append(select_comments(template.format(ID, index * 20)))
    return ''.join(collected)

# Start scraping: the movie ID is passed for both the ID and Name arguments
cleaned_comments = select_all_comments(id, id, pages)

# Split the text into words with jieba
segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({'segment': segment})

# Remove stop words (one stop word per line in the file below)
stopwords = pd.read_csv('C:/Users/longxiaojiangi/Desktop/stopwords/chinese.txt', sep='\t', quoting=3, names=['stopword'])
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# Count word frequencies.
# size() + reset_index(name=...) replaces the dict-renaming form
# .agg({"计数": numpy.size}), which pandas 1.0 removed for grouped Series.
words_stat = words_df.groupby('segment').size().reset_index(name='计数')
words_stat = words_stat.sort_values(by=['计数'], ascending=False)

# Draw the word cloud from the frequency table.
# NOTE: the IPython magic "%matplotlib inline" was removed here because it is
# a syntax error outside a notebook; run it in a separate notebook cell if
# inline rendering is needed.
cloud = WordCloud(font_path='C:\\Windows\\Fonts\\simsun.ttc', width=500, height=300)
pic_cloud = dict(zip(words_stat['segment'], words_stat['计数']))
pic_cloud = cloud.fit_words(pic_cloud)
plt.imshow(pic_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
上一篇 | 下一篇

猜你喜欢

热点阅读