Working with English Text Data
2019-01-24
dreampai
1. Bag-of-words model
- Tokenization
- Building the vocabulary
- Encoding
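A minimal sketch of these three steps with `CountVectorizer` (the two sentences are made up for illustration, not taken from the review data):

```python
from sklearn.feature_extraction.text import CountVectorizer

docs = ['The fool doth think he is wise,',
        'but the wise man knows himself to be a fool']

vect = CountVectorizer()
vect.fit(docs)                  # tokenize and build the vocabulary
print(vect.vocabulary_)         # maps each word to a column index
bag_of_words = vect.transform(docs)
print(bag_of_words.toarray())   # encoding: one count vector per document
```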
2. Removing stop words
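scikit-learn ships a built-in list of English stop words; passing `stop_words='english'` to the vectorizer, as in the code further below, simply drops them from the vocabulary:

```python
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print('Number of stop words: {}'.format(len(ENGLISH_STOP_WORDS)))
print(sorted(ENGLISH_STOP_WORDS)[:10])  # a small sample of the list
```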
3. Rescaling the data with TF-IDF
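For reference, with its default settings (`smooth_idf=True`, `norm='l2'`) `TfidfVectorizer` scores word $w$ in document $d$ as

$$\operatorname{tfidf}(w,d)=\operatorname{tf}(w,d)\cdot\left(\ln\frac{1+N}{1+N_w}+1\right)$$

where $N$ is the number of documents and $N_w$ the number of documents containing $w$; each document vector is then rescaled to unit Euclidean length.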
4. Logistic regression coefficients
Words with negative coefficients point to negative reviews; words with positive coefficients point to positive reviews.
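This follows from the linear decision function: the model predicts a positive review when $w\cdot x+b>0$, so each word's coefficient measures how strongly an occurrence of that word pushes the prediction toward one class or the other.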
![](https://img.haomeiwen.com/i13140540/781f0d562ba27d42.png)
5. N-gram models
![](https://img.haomeiwen.com/i13140540/39ed55d51fc236d7.png)
The heatmap shows that using bigrams improves performance considerably, while adding trigrams contributes only marginally to accuracy.
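A minimal sketch of what `ngram_range` changes, on one made-up sentence (note how the bigram "not good" becomes a feature of its own):

```python
from sklearn.feature_extraction.text import CountVectorizer

doc = ['it is not good, it is actually quite bad']

# Unigrams only (the default)
print(CountVectorizer(ngram_range=(1, 1)).fit(doc).get_feature_names_out())
# Unigrams plus bigrams
print(CountVectorizer(ngram_range=(1, 2)).fit(doc).get_feature_names_out())
```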
6. Topic models
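LDA represents each document as a mixture over a fixed number of topics, and each topic as a weight over every word in the vocabulary; very frequent words are removed first (`max_df=.15` below) so that they do not dominate every topic.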
![](https://img.haomeiwen.com/i13140540/32c6b57e77505700.png)
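The complete code for the steps above: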
```python
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import mglearn
import matplotlib.pyplot as plt
# Load the IMDb review data (train/ and test/ folders of text files)
reviews_train = load_files('aclImdb/train/')
text_train, y_train = reviews_train.data, reviews_train.target
reviews_test = load_files('aclImdb/test/')
text_test, y_test = reviews_test.data, reviews_test.target
print('length of text_train: {}'.format(len(text_train)))
print('Samples per class (training): {}'.format(np.bincount(y_train)))
print('Number of documents in test data: {}'.format(len(text_test)))
print('Samples per class (test): {}'.format(np.bincount(y_test)))
# Strip the HTML line-break tags left in the raw reviews
text_train = [doc.replace(b'<br />', b' ') for doc in text_train]
text_test = [doc.replace(b'<br />', b' ') for doc in text_test]
# Bag-of-words model: drop words in fewer than 5 documents and English stop words
vect = CountVectorizer(min_df=5, stop_words='english')
vect.fit(text_train)
X_train = vect.transform(text_train)
X_test = vect.transform(text_test)
feature_names = vect.get_feature_names_out()  # get_feature_names() in sklearn < 1.0
print('Number of features: {}'.format(len(feature_names)))
# Cross-validation (max_iter raised so the default lbfgs solver converges)
scores = cross_val_score(LogisticRegression(max_iter=1000), X_train, y_train, cv=5)
print('Mean cross-validation accuracy: {:.2f}'.format(np.mean(scores)))
# Grid search over the regularization strength C
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid.fit(X_train, y_train)
print('Best cross-validation score: {:.2f}'.format(grid.best_score_))
print('Best parameters:', grid.best_params_)
print('Test score: {:.2f}'.format(grid.score(X_test, y_test)))
# TF-IDF model: chain the vectorizer and the classifier in a pipeline
pipe = make_pipeline(TfidfVectorizer(min_df=5, stop_words='english'),
                     LogisticRegression(max_iter=1000))
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10],
              'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)
print('Best cross-validation score: {:.2f}'.format(grid.best_score_))
# Inspect the features with the lowest and highest tf-idf on the training set
vectorizer = grid.best_estimator_.named_steps['tfidfvectorizer']
X_train = vectorizer.transform(text_train)
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
feature_names = np.array(vectorizer.get_feature_names_out())
print('Features with lowest tfidf:\n{}'.format(feature_names[sorted_by_tfidf[:20]]))
print('Features with highest tfidf:\n{}'.format(feature_names[sorted_by_tfidf[-20:]]))
sorted_by_idf = np.argsort(vectorizer.idf_)
print('Features with lowest idf:\n{}'.format(feature_names[sorted_by_idf[:100]]))
# Heatmap of cross-validation accuracy over C and ngram_range
scores = grid.cv_results_['mean_test_score'].reshape(-1, 3).T
heatmap = mglearn.tools.heatmap(
    scores, xlabel='C', ylabel='ngram_range', cmap='viridis', fmt='%.3f',
    xticklabels=param_grid['logisticregression__C'],
    yticklabels=param_grid['tfidfvectorizer__ngram_range'])
plt.colorbar(heatmap)
# Inspect the model coefficients: the 40 most negative and most positive words
mglearn.tools.visualize_coefficients(
    grid.best_estimator_.named_steps['logisticregression'].coef_,
    feature_names, n_top_features=40)
plt.show()
# Topic model: keep 10,000 features, dropping words in more than 15% of documents
vect = CountVectorizer(max_features=10000, max_df=.15)
X = vect.fit_transform(text_train)
# n_components replaced the old n_topics parameter in sklearn >= 0.19
lda = LatentDirichletAllocation(n_components=10, learning_method='batch',
                                max_iter=25, random_state=0)
document_topics = lda.fit_transform(X)
print(lda.components_.shape)  # (n_topics, n_words)
# For each topic, word indices sorted by importance, descending
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names_out())
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)
```
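To see what a topic captures in context, you can rank the reviews by their weight on that topic. A short sketch continuing from the variables above (topic index 3 is an arbitrary choice):

```python
# Reviews sorted by their weight on topic 3, largest first
topic_idx = 3
top_docs = np.argsort(document_topics[:, topic_idx])[::-1]
for i in top_docs[:3]:
    # show the start of each of the three strongest reviews
    print(text_train[i][:200].decode('utf-8', errors='ignore'), '\n')
```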