文本分类挖掘预测
2019-01-04 本文已影响0人
jackmanzhang
首先说明:内容有些简单 (⊙o⊙),仅供参考。
文本预测数据(由于数据太多,在此只用 1 万条数据进行测试,即 test 数据集)
数据集地址下载:
https://pan.baidu.com/share/init?surl=XIZwRlG4-yynR9fSEAdRiA
密码:kxxa
首先把需要测试的数据集暂时读入保存,依次进行分词、关键词提取、求关键词并集、转换特征向量等操作,再把关键词列表和特征向量的内容保存到文本文件里:
import jieba
import jieba.analyse
import numpy as np
# Build keyword features for the news data set and save them for the training script.
#
# Input : f_train, one sample per line in the form "<label>\t<article text>".
# Output: '特征集合变换列表00.txt' - str() of the vocabulary list (one feature per keyword)
#         'all00.txt'              - one line per sample, str() of its 0/1 feature list
#
# NOTE(review): the training code shown later in this document reads 'all0.txt' and
# '特征集合变换列表0.txt'; rename the files (or the paths) so both sides agree.
f_train='C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/cnews/cnews.test.txt'
list_x=[]
list_y=[]
with open(f_train,'r',encoding='utf-8') as file_train:
    for line_no,line in enumerate(file_train,1):
        # Split on the FIRST tab only so tabs inside the article are kept, and drop
        # the trailing newline so it does not become part of the text.
        label,sep,text=line.rstrip('\n').partition('\t')
        if not sep:
            # Fail loudly: silently skipping a line would shift the rows relative to
            # the labels that the training script reads from the same file.
            raise ValueError('line %d of %s has no tab separator' % (line_no,f_train))
        list_x.append(text)
        list_y.append(label)

# Replace every article by its top-10 keywords (jieba's extract_tags keyword extractor).
for count,article in enumerate(list_x):
    list_x[count]=jieba.analyse.extract_tags(article,topK=10,withWeight=False,allowPOS=())
if list_x:
    print(list_x[0])

# Vocabulary = union of the keywords of all samples.
# A real set is used here; `{}` would create a dict, not an empty set.
set_union=set()
for count,keywords in enumerate(list_x,1):
    print(count)
    set_union.update(keywords)
print(len(set_union))

# Freeze the vocabulary order: position k in this list is feature column k.
list_set_union=list(set_union)
print(list_set_union)
# NOTE(review): no encoding= is passed, so the platform default codec is used; the
# script that reads this file back does the same. Confirm that codec can represent
# every keyword before switching either side to an explicit encoding.
with open('特征集合变换列表00.txt','w') as filelist:
    filelist.write(str(list_set_union))

print('*'*100)
# keyword -> column index, so each keyword is placed with one dict lookup instead of
# a scan over the whole vocabulary.
feature_index={word:k for k,word in enumerate(list_set_union)}
with open('all00.txt','w') as out_file:
    for count,keywords in enumerate(list_x,1):
        print('count:',count)
        list_one=[0]*len(list_set_union)
        for word in keywords:
            # Every keyword is in the vocabulary because the vocabulary is their union.
            list_one[feature_index[word]]=1
        out_file.write(str(list_one)+'\n')
然后在第二个 Python 文件里读取刚才保存的文件。如果写在一起,每次运行都会重新生成这些文件,比较慢,所以在此小编写了两个文件,便于操作。注意:上面保存的文件名是 all00.txt 和 特征集合变换列表00.txt,而下面读取的是 all0.txt 和 特征集合变换列表0.txt,运行前请把文件名改成一致。
from sklearn.linear_model import LogisticRegression
import jieba
import jieba.analyse
import ast

# Train a logistic-regression classifier on the saved 0/1 keyword features and then
# classify text typed at the prompt.
#
# Reads: f_train               - "<label>\t<article text>" per line (labels only)
#        'all0.txt'            - one str() of a 0/1 feature list per sample
#        '特征集合变换列表0.txt' - str() of the vocabulary list
list_all=[]
list_y=[]
f_train='C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/cnews/cnews.test.txt'
with open(f_train,'r',encoding='utf-8') as file_train:
    for line in file_train:
        list_y.append(line.split('\t')[0])
print(list_y)
set_y=set(list_y)
print(set_y)
# Sorted so that the label <-> index mapping is the same on every run.
list_set_y=sorted(set_y)
print(list_set_y)
# index -> label, used to turn a prediction back into a category name.
dict_set_y=dict(enumerate(list_set_y))
# label -> index, used to encode the training labels with one lookup each.
label_to_index={label:k for k,label in dict_set_y.items()}
list_y=[label_to_index[label] for label in list_y]
print(list_y)

# Parse the feature rows. ast.literal_eval only accepts Python literals, so a
# tampered file cannot run arbitrary code the way eval() would.
with open('all0.txt','r') as f:
    for row in f:
        row=row.strip()
        if row:  # tolerate blank lines, e.g. at the end of the file
            list_all.append(ast.literal_eval(row))
print(len(list_all))

lr_model = LogisticRegression()
lr_model.fit(list_all, list_y)

with open('特征集合变换列表0.txt','r') as f:
    list_set_union=ast.literal_eval(f.read())
# keyword -> feature column
feature_index={word:k for k,word in enumerate(list_set_union)}

while True:
    try:
        cheshi=input('测试:')
    except (EOFError,KeyboardInterrupt):
        # End of input or Ctrl-C at the prompt: leave the loop instead of a traceback.
        break
    if not cheshi.strip():
        # Nothing to classify; ask again.
        continue
    segment_i=jieba.analyse.extract_tags(cheshi,topK=10,withWeight=False,allowPOS=())
    list_one=[0]*len(list_set_union)
    for word in segment_i:
        k=feature_index.get(word)
        if k is not None:  # keywords outside the vocabulary have no column
            list_one[k]=1
    s=lr_model.predict([list_one])
    print(dict_set_y[s[0]])
直到这里基本可以完成简单预测,下面进行一个简单的前后端界面交互。利用django进行交互,简单说明一下建项目的流程。
image.pngimage.png
image.png
注意:如果建立了 static 目录,需要在 settings.py 里(一般加在文件最后)加上下面的配置;没有建立这个目录就不用加。其他内容暂时不需要更改。
# Static-file search path: the single directory <BASE_DIR>/static.
STATICFILES_DIRS = [os.path.join(BASE_DIR, 'static')]
image.png
image.png
from django.conf.urls import url
from . import views
# Routes of this app.
urlpatterns = [
    # Site root -> page with the input form.
    url(r'^$', views.index),
    # /serach/ -> classify the submitted text.
    url(r'^serach/$', views.serach),
]
image.png
from django.shortcuts import render
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import check_array as check_arrays
import jieba
import time
import jieba.analyse
from django.shortcuts import render,HttpResponse,HttpResponseRedirect,redirect
# Create your views here.
def index(request):
    """Render the input form page (``index.html``) without a result."""
    template_name = 'index.html'
    return render(request, template_name)
# Process-wide cache for the trained classifier; filled by _load_classifier() on the
# first request that needs it.
_CLASSIFIER_CACHE = {}


def _load_classifier():
    """Return ``(model, feature_index, n_features, labels)``, training on first use.

    * ``model`` - LogisticRegression fitted on the rows of ``all0.txt``.
    * ``feature_index`` - dict mapping keyword -> feature column.
    * ``n_features`` - length of the vocabulary list.
    * ``labels`` - dict mapping class index -> category name.

    The result is kept in ``_CLASSIFIER_CACHE`` so the files are parsed and the model
    is fitted once per process instead of on every request. No lock is taken: requests
    that arrive before the first training finishes may each train once.
    """
    if 'loaded' in _CLASSIFIER_CACHE:
        return _CLASSIFIER_CACHE['loaded']

    import ast  # local import: only this helper needs it

    f_train = 'C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/cnews/cnews.test.txt'
    with open(f_train, 'r', encoding='utf-8') as file_train:
        list_y = [line.split('\t')[0] for line in file_train]
    set_y = set(list_y)
    print(set_y)
    # Sorted so the label <-> index mapping does not depend on set iteration order.
    list_set_y = sorted(set_y)
    print(list_set_y)
    dict_set_y = dict(enumerate(list_set_y))
    label_to_index = {label: k for k, label in dict_set_y.items()}
    list_y = [label_to_index[label] for label in list_y]

    # ast.literal_eval accepts only Python literals, unlike eval().
    with open('C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/all0.txt', 'r') as f:
        list_all = [ast.literal_eval(row) for row in f if row.strip()]
    with open('C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/特征集合变换列表0.txt', 'r') as f:
        list_set_union = ast.literal_eval(f.read())
    feature_index = {word: k for k, word in enumerate(list_set_union)}

    s0 = time.time()
    lr_model = LogisticRegression()
    lr_model.fit(list_all, list_y)
    s1 = time.time()
    print('逻辑:', s1 - s0)

    loaded = (lr_model, feature_index, len(list_set_union), dict_set_y)
    _CLASSIFIER_CACHE['loaded'] = loaded
    return loaded


def serach(request):
    """Classify the text posted as ``cheshi`` and render the result in ``index.html``.

    The predicted category name is passed to the template as ``content``. When the
    request carries no text (for example a GET request, or an empty/blank field) the
    form is rendered again without a result.

    The training matrix is no longer copied into ``request.session``: the trained
    model is cached per process by ``_load_classifier`` instead, so the session is
    not touched by this view.
    """
    start = time.time()
    cheshi = request.POST.get('cheshi')
    if not cheshi or not cheshi.strip():
        return render(request, 'index.html')

    lr_model, feature_index, n_features, dict_set_y = _load_classifier()

    # Encode the text the same way the training rows were built: top-10 keywords,
    # one 0/1 column per vocabulary keyword.
    segment_i = jieba.analyse.extract_tags(cheshi, topK=10, withWeight=False, allowPOS=())
    list_one = [0] * n_features
    for word in segment_i:
        k = feature_index.get(word)
        if k is not None:  # keywords outside the vocabulary have no column
            list_one[k] = 1

    s = lr_model.predict([list_one])
    answer = dict_set_y[s[0]]
    print(answer)
    end = time.time()
    print(end - start)
    ctx = {
        'content': answer
    }
    return render(request, 'index.html', ctx)
最后我们在模板templates的文件夹中编写简单前端程序。
image.png
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <script src="/static/js/jquery-1.12.4.min.js"></script>
</head>
<body>
{#<img src="/static/img/1.jpg">#}
{# Form that posts the text to /serach/; the "over" field displays the `content` template variable. #}
<div style="text-align: center;margin-top: 100px">
    <form action="/serach/" method="post" >
        {% csrf_token %}
        <textarea cols="50%" rows="10" name="cheshi" id="tt"></textarea><br/>
        <input type="submit" id="submit"><br>
        <input type="text" value="{{ content }}" name="over">
    </form>
    <script>
        // Cancel the submit while the textarea is empty.
        $("#submit").click(function () {
            var text = $("#tt").val();
            if (text == '') {
                alert('不能发空');
                return false;
            }
        });
    </script>
</div>
</body>
</html>