词云

2020-02-19  本文已影响0人  dingtom
import requests
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Scrape the short comments ("abstract" fields) of Douban gallery topic 125573
# and append them to want_after.txt, one comment per line.
#
# The endpoint was found with the browser's "Inspect" tool: while scrolling
# the page only the "start" query parameter changes (0, 20, 40, 60, 80, ...),
# so each request below fetches one page of up to 20 items.

# Send browser-like request headers to get past the anti-crawler check.
# They are constant, so build them once instead of once per page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/76.0.3809.100 Safari/537.36',
    'Referer': 'https://www.douban.com/gallery/topic/125573/'
               '?from=gallery_trend&sort=hot',
}

# Open the output file once rather than once per comment.
with open("want_after.txt", "a", encoding='utf-8') as f:
    for i in range(0, 200, 20):
        url = 'https://m.douban.com/rexxar/api/v2/gallery/topic/125573/items?' \
              'sort=new&start={}&count=20&status_full_text=1&guest_only=0&ck=null'.format(i)
        print(url)
        # Send the request; fail fast on a hung connection or an HTTP error
        # instead of trying to parse an error page as JSON.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        html = response.json()
        # Walk every returned item (a page may hold fewer than 20), rather
        # than a hard-coded index range that skipped the last one.
        for item in html.get('items') or []:
            abst = item.get('abstract')
            if not abst:
                # Nothing to store for items without comment text.
                continue
            # Newline-terminate each comment so neighbouring comments are not
            # fused into one token when the file is segmented later.
            f.write(abst + '\n')
            print(abst)

# Build the text in the form WordCloud needs: segment the scraped comments
# with jieba (precise mode) and join the tokens with single spaces.
with open("want_after.txt", "r", encoding='utf-8') as f:
    words = ' '.join(jieba.cut(f.read(), cut_all=False))

# Background image whose shape is used as the word-cloud mask.
backgroud_Image = plt.imread(r'C:\Users\tomding\Videos\图片1.png')
# matplotlib returns PNG pixels as floats in [0, 1], while WordCloud treats
# pixels equal to 255 as "masked out". Convert to 8-bit so the white area of
# the image is actually excluded.
if backgroud_Image.dtype.kind == 'f':
    backgroud_Image = (backgroud_Image * 255).round().astype('uint8')

# Word-cloud parameters.
wcloud = WordCloud(
    background_color='white',
    mask=backgroud_Image,
    # A font with CJK glyphs is required to render Chinese words. In a raw
    # string a single backslash per separator is the correct spelling.
    font_path=r'c:\windows\Fonts\simhei.ttf',
    max_words=200,
    max_font_size=200,
    min_font_size=8,
    random_state=50,
)

# Generate the word cloud from the segmented text, draw it on the current
# matplotlib axes with the axis decorations hidden, and save it as an image.
word_cloud = wcloud.generate(words)
plt.imshow(word_cloud)
plt.axis('off')
word_cloud.to_file('结果.jpg')

# See which words occur most often: count the tokens with the configured
# WordCloud, rank the (word, count) pairs by count in descending order and
# keep the first 50.
process_word = wcloud.process_text(words)
words_sorted = list(process_word.items())
words_sorted.sort(key=lambda pair: pair[1], reverse=True)
sort_after = words_sorted[:50]
print(sort_after)

# Store the ranking as a CSV file; the utf_8_sig encoding is used so the
# Chinese text does not come out garbled.
df = pd.DataFrame(sort_after)
df.to_csv('sort_after.csv', encoding='utf_8_sig')
上一篇 下一篇

猜你喜欢

热点阅读