分词
2020-02-06 本文已影响0人
月夜星空下
import jieba
import re
# Prompt on standard input for the text to segment.
word = input("请输入:")
# Build the stop-word list.
def stopwordslist(path='/Users/lilong/Desktop/stop_words'):
    """Load stop words from a UTF-8 text file, one entry per line.

    Args:
        path: File to read. Defaults to the location this script
            originally hard-coded.

    Returns:
        A list with one string per line of the file, each stripped of
        leading and trailing whitespace. Blank lines yield empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even if decoding fails midway.
    with open(path, encoding='UTF-8') as stop_file:
        return [line.strip() for line in stop_file]
# Keep only the runs of CJK ideographs (U+4E00-U+9FA5) from the input;
# every other character (ASCII, digits, punctuation, whitespace) is dropped.
cleaned_data = re.findall("[\u4e00-\u9fa5]+", word)

# Glue the runs back together so jieba segments the text as one string.
r = ''.join(cleaned_data)

# Announce the work before it starts rather than after it has finished.
print("正在分词...")
a = jieba.lcut(r)

# A set gives constant-time membership tests while filtering tokens.
stopwords = set(stopwordslist())

# Remove stop words. Every kept token is followed by one space, so the
# result ends with a trailing space whenever at least one token survives.
# No tab check is needed: r holds only CJK characters, so no token can be
# '\t'. The loop variable is `token` so the user's input in `word` is not
# overwritten.
outstr = ''.join(token + " " for token in a if token not in stopwords)

print(outstr)
print(type(outstr))