Text Mining HW3
2018-07-17
在做算法的巨巨
import os
import os.path
import codecs
import jieba
import pandas as pd
import numpy as np

# Walk the sample corpus directory and read every file into memory
filePaths = []
fileContents = []
for root, dirs, files in os.walk("C:/Users/dell/Desktop/datamining/2.1+语料库/2.1/SogouC.mini/Sample"):
    for name in files:
        filePath = os.path.join(root, name)
        filePaths.append(filePath)
        f = codecs.open(filePath, 'r', 'utf-8')
        fileContent = f.read()
        f.close()
        fileContents.append(fileContent)

# One row per file: path plus raw content
corpos = pd.DataFrame({'filePath': filePaths, 'fileContent': fileContents})
corpos
# Segment every document with jieba, remembering which file each token came from
segments = []
filePaths = []
for index, row in corpos.iterrows():
    filePath = row['filePath']
    fileContent = row['fileContent']
    segs = jieba.cut(fileContent)
    for seg in segs:
        segments.append(seg)
        filePaths.append(filePath)
segmentDataFrame = pd.DataFrame({'segment': segments, 'filePath': filePaths})
segmentDataFrame
# Count how many times each token appears (计数 = count) and sort descending
segStat = segmentDataFrame.groupby('segment').size().reset_index(name='计数').sort_values('计数', ascending=False)
segStat
After the jieba segmentation we find a number of stop words getting in the way: spaces, punctuation, and Chinese function words such as prepositions and particles. So we need to load a stopword list, and words from that list should not go into the token array.
One thing to note here: DataFrame no longer has a sort method; a quick search shows the replacement is sort_values, used as .sort_values('column_name', ascending=False).
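As a quick illustration of that sort_values pattern, here is a minimal sketch on a made-up token list (the toy data and variable names are just for illustration, not part of the corpus above):

import pandas as pd
# Toy token list: count each token and sort by the 计数 (count) column, descending
toy = pd.DataFrame({'segment': ['的', '电池', '电池', '发展', '的', '的']})
toyStat = toy.groupby('segment').size().reset_index(name='计数')
print(toyStat.sort_values('计数', ascending=False))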
# Load the stopword list (it has a 'stopword' column)
stopwords = pd.read_csv("C:\\Users\\dell\\Desktop\\datamining\\2.3\\StopwordsCN.txt", encoding='utf-8', index_col=False)
# ~isin(...) keeps only the tokens that are NOT in the stopword list
fSegStat = segStat[~segStat.segment.isin(stopwords.stopword)]
fSegStat
# Re-segment, this time dropping stopwords and single-character tokens on the fly
segments = []
filePaths = []
for index, row in corpos.iterrows():
    filePath = row['filePath']
    fileContent = row['fileContent']
    segs = jieba.cut(fileContent)
    for seg in segs:
        if seg not in stopwords.stopword.values and len(seg.strip()) > 1:
            segments.append(seg)
            filePaths.append(filePath)
segmentDataFrame = pd.DataFrame({'segment': segments, 'filePath': filePaths})
Word cloud
Download the wordcloud wheel from: https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud
pip install wordcloud-1.4.1-cp36-cp36m-win_amd64.whl
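If you would rather not fetch the .whl by hand, installing straight from PyPI should also work (assuming a prebuilt wheel exists for your Python version):

pip install wordcloud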
# Recount token frequencies on the filtered segmentation and drop any remaining stopwords
segStat = segmentDataFrame.groupby('segment').size().reset_index(name='计数').sort_values('计数', ascending=False)
fSegStat = segStat[~segStat.segment.isin(stopwords.stopword)]
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# simhei.ttf is needed so the cloud can render Chinese characters
wordcloud = WordCloud(font_path='C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 词云绘制\\2.4\\simhei.ttf', background_color='black')
words = fSegStat.set_index('segment').to_dict()  # {'计数': {word: count, ...}}
wordcloud.fit_words(words['计数'])                # fit_words expects a {word: frequency} dict
plt.imshow(wordcloud)
plt.show()
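If you also want to keep the picture on disk, WordCloud provides a to_file method (and calling plt.axis('off') before plt.show() hides the axis ticks); a minimal sketch, with the output filename chosen purely as an example:

wordcloud.to_file('wordcloud_sample.png')  # write the rendered cloud to a PNG (example path)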
I found an analysis report online on the state of fuel cell development and converted the text into a txt file.
# Read the fuel-cell report (GBK encoded), segment it with jieba, and filter stopwords
f = codecs.open('C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 词云绘制\\2.4\\fuelcell.txt', 'r', 'gbk')
txt = f.read()
f.close()
txtcontent = jieba.cut(txt)
contents = []
for content in txtcontent:
    if content not in stopwords.stopword.values and len(content.strip()) > 1:
        contents.append(content)
contentDataFrame = pd.DataFrame({'content': contents})
# Token frequency for the report, sorted descending
contentStat = contentDataFrame.groupby('content').size().reset_index(name='计数').sort_values('计数', ascending=False)
# Render the word cloud for the fuel-cell report
wordcloud = WordCloud(font_path='C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 词云绘制\\2.4\\simhei.ttf', background_color='black')
words = contentStat.set_index('content').to_dict()
wordcloud.fit_words(words['计数'])
plt.imshow(wordcloud)
plt.show()