17、朴素贝叶斯文本向量化的不同方式

2020-05-12  本文已影响0人  羽天驿

一、短信分类(多种方法)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
处理文本数据.png

二、不同的文本向量化处理的方式

(一)CountVectorizer

(二)TF-IDF

import numpy as np

# 处理文本数据
# CountVectorizer 词频统计
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer

import pandas as pd

from sklearn.naive_bayes import MultinomialNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve,auc

from sklearn.model_selection import StratifiedKFold# 分层交叉验证

import matplotlib.pyplot as plt
# Words like these (particles, basic subject/verb/object words, ...)
# contribute little to telling the classes apart, so they are treated as
# stop words.  Evaluating the name displays scikit-learn's built-in list.
ENGLISH_STOP_WORDS
frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides',
           'between',
           'beyond',
           'bill',
           'both',
           'bottom',
           'but',
           'by',
           'call',
           'can',
           'cannot',
           'cant',
           'co',
           'con',
           'could',
           'couldnt',
           'cry',
           'de',
           'describe',
           'detail',
           'do',
           'done',
           'down',
           'due',
           'during',
           'each',
           'eg',
           'eight',
           'either',
           'eleven',
           'else',
           'elsewhere',
           'empty',
           'enough',
           'etc',
           'even',
           'ever',
           'every',
           'everyone',
           'everything',
           'everywhere',
           'except',
           'few',
           'fifteen',
           'fifty',
           'fill',
           'find',
           'fire',
           'first',
           'five',
           'for',
           'former',
           'formerly',
           'forty',
           'found',
           'four',
           'from',
           'front',
           'full',
           'further',
           'get',
           'give',
           'go',
           'had',
           'has',
           'hasnt',
           'have',
           'he',
           'hence',
           'her',
           'here',
           'hereafter',
           'hereby',
           'herein',
           'hereupon',
           'hers',
           'herself',
           'him',
           'himself',
           'his',
           'how',
           'however',
           'hundred',
           'i',
           'ie',
           'if',
           'in',
           'inc',
           'indeed',
           'interest',
           'into',
           'is',
           'it',
           'its',
           'itself',
           'keep',
           'last',
           'latter',
           'latterly',
           'least',
           'less',
           'ltd',
           'made',
           'many',
           'may',
           'me',
           'meanwhile',
           'might',
           'mill',
           'mine',
           'more',
           'moreover',
           'most',
           'mostly',
           'move',
           'much',
           'must',
           'my',
           'myself',
           'name',
           'namely',
           'neither',
           'never',
           'nevertheless',
           'next',
           'nine',
           'no',
           'nobody',
           'none',
           'noone',
           'nor',
           'not',
           'nothing',
           'now',
           'nowhere',
           'of',
           'off',
           'often',
           'on',
           'once',
           'one',
           'only',
           'onto',
           'or',
           'other',
           'others',
           'otherwise',
           'our',
           'ours',
           'ourselves',
           'out',
           'over',
           'own',
           'part',
           'per',
           'perhaps',
           'please',
           'put',
           'rather',
           're',
           'same',
           'see',
           'seem',
           'seemed',
           'seeming',
           'seems',
           'serious',
           'several',
           'she',
           'should',
           'show',
           'side',
           'since',
           'sincere',
           'six',
           'sixty',
           'so',
           'some',
           'somehow',
           'someone',
           'something',
           'sometime',
           'sometimes',
           'somewhere',
           'still',
           'such',
           'system',
           'take',
           'ten',
           'than',
           'that',
           'the',
           'their',
           'them',
           'themselves',
           'then',
           'thence',
           'there',
           'thereafter',
           'thereby',
           'therefore',
           'therein',
           'thereupon',
           'these',
           'they',
           'thick',
           'thin',
           'third',
           'this',
           'those',
           'though',
           'three',
           'through',
           'throughout',
           'thru',
           'thus',
           'to',
           'together',
           'too',
           'top',
           'toward',
           'towards',
           'twelve',
           'twenty',
           'two',
           'un',
           'under',
           'until',
           'up',
           'upon',
           'us',
           'very',
           'via',
           'was',
           'we',
           'well',
           'were',
           'what',
           'whatever',
           'when',
           'whence',
           'whenever',
           'where',
           'whereafter',
           'whereas',
           'whereby',
           'wherein',
           'whereupon',
           'wherever',
           'whether',
           'which',
           'while',
           'whither',
           'who',
           'whoever',
           'whole',
           'whom',
           'whose',
           'why',
           'will',
           'with',
           'within',
           'without',
           'would',
           'yet',
           'you',
           'your',
           'yours',
           'yourself',
           'yourselves'})

加载数据

# Load the tab-separated SMS file.  It has no header row, so the columns
# arrive numbered 0 and 1 and are renamed to readable labels right away.
sms = pd.read_csv('./SMSSpamCollection.csv', sep='\t', header=None).rename(
    columns={0: 'label', 1: 'message'}
)
# Both selections use a single [] and are therefore one-dimensional Series.
# The messages are not passed to an estimator directly: they go to the
# vectorizer, which iterates over them sentence by sentence to count words.
y = sms['label']
X = sms['message']

词频统计,停用词参数的使用

# Custom stop words: tokens in this list are ignored by the vectorizer.
drop_words = ['go', 'until', 'jurong', 'point', 'crazy', 'available']
cv = CountVectorizer(stop_words=drop_words)
# Learn the vocabulary (each message is tokenized into English words) and
# build the document-term count matrix in a single step.
X_cv = cv.fit_transform(X)
X_cv
<5572x8707 sparse matrix of type '<class 'numpy.int64'>'
    with 73832 stored elements in Compressed Sparse Row format>
# stop_words='english' selects scikit-learn's built-in English stop-word list;
# a custom list of words can be passed instead.
cv = CountVectorizer(stop_words='english')
# Learn the vocabulary and produce the count matrix in one call.
X_cv = cv.fit_transform(X)
X_cv
<5572x8444 sparse matrix of type '<class 'numpy.int64'>'
    with 43578 stored elements in Compressed Sparse Row format>

Tf-idf文本向量化方式转变

# tf  = term frequency
# idf = inverse document frequency
# TF-IDF is a weighting technique commonly used in information retrieval
# and data mining.  The string below quotes the scikit-learn docstring.
'''Equivalent to :class:`CountVectorizer` followed by
:class:`TfidfTransformer`.'''
tf_idf = TfidfVectorizer()
# Fit the vocabulary/idf weights and vectorize the messages in one call.
X_tfidf = tf_idf.fit_transform(X)
display(X_tfidf)
print(X_tfidf[:2])
<5572x8713 sparse matrix of type '<class 'numpy.float64'>'
    with 74169 stored elements in Compressed Sparse Row format>


  (0, 8548) 0.22083291550052703
  (0, 8324) 0.18241264829651851
  (0, 8084) 0.23001810878216972
  (0, 7694) 0.1555161950550194
  (0, 5958) 0.25535167546045223
  (0, 5571) 0.15602976712614566
  (0, 4501) 0.27580485521143805
  (0, 4374) 0.32647198856800297
  (0, 4114) 0.1069931616636402
  (0, 3655) 0.18034330636364296
  (0, 3615) 0.15305130991688437
  (0, 3571) 0.14787418026870422
  (0, 2338) 0.25283008183768235
  (0, 2061) 0.27580485521143805
  (0, 1767) 0.27580485521143805
  (0, 1765) 0.3116528020516887
  (0, 1316) 0.24419040033995174
  (0, 1082) 0.32647198856800297
  (1, 8450) 0.43162957585464123
  (1, 5567) 0.5466243141314314
  (1, 5538) 0.2718944069420321
  (1, 4537) 0.4083258549263009
  (1, 4342) 0.5236804332035243
# Two-step route to TF-IDF features: raw word counts first, then re-weighting.
cv = CountVectorizer()
X1 = cv.fit_transform(X)  # document-term count matrix

# Convert the counts into term-frequency / inverse-document-frequency weights.
tfidfTransformer = TfidfTransformer()
X2 = tfidfTransformer.fit_transform(X1)

display(X2)
print(X2[:2])
<5572x8713 sparse matrix of type '<class 'numpy.float64'>'
    with 74169 stored elements in Compressed Sparse Row format>


  (0, 8548) 0.22083291550052703
  (0, 8324) 0.18241264829651851
  (0, 8084) 0.23001810878216972
  (0, 7694) 0.1555161950550194
  (0, 5958) 0.25535167546045223
  (0, 5571) 0.15602976712614566
  (0, 4501) 0.27580485521143805
  (0, 4374) 0.32647198856800297
  (0, 4114) 0.1069931616636402
  (0, 3655) 0.18034330636364296
  (0, 3615) 0.15305130991688437
  (0, 3571) 0.14787418026870422
  (0, 2338) 0.25283008183768235
  (0, 2061) 0.27580485521143805
  (0, 1767) 0.27580485521143805
  (0, 1765) 0.3116528020516887
  (0, 1316) 0.24419040033995174
  (0, 1082) 0.32647198856800297
  (1, 8450) 0.43162957585464123
  (1, 5567) 0.5466243141314314
  (1, 5538) 0.2718944069420321
  (1, 4537) 0.4083258549263009
  (1, 4342) 0.5236804332035243

使用不同算法,进行训练预测比较

# Hold out a test split with a fixed seed so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, random_state=0
)
# fit() returns the estimator itself, so construction and training can chain.
mNB = MultinomialNB().fit(X_train, y_train)
# Accuracy on the held-out messages.
mNB.score(X_test, y_test)
0.9612347451543432
# The class proportions are imbalanced, so ROC-AUC is the more appropriate
# metric for comparing models.
y_test.value_counts()
ham     1208
spam     185
Name: label, dtype: int64
# Predicted class label for each held-out message.
mNB.predict(X_test)
array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')
# Per-class probabilities for the first ten held-out messages; the columns
# follow the order of mNB.classes_.
mNB.predict_proba(X_test)[:10]
array([[0.98881241, 0.01118759],
       [0.81098774, 0.18901226],
       [0.99358361, 0.00641639],
       [0.981444  , 0.018556  ],
       [0.99295525, 0.00704475],
       [0.9956646 , 0.0043354 ],
       [0.55540814, 0.44459186],
       [0.99655669, 0.00344331],
       [0.99617511, 0.00382489],
       [0.99636481, 0.00363519]])
# Stratified k-fold ROC evaluation of MultinomialNB on the TF-IDF features.
# One ROC curve is drawn per fold, plus the mean curve in red.
n_splits = 6  # single source of truth for the fold count and the averaging
sKFold = StratifiedKFold(n_splits=n_splits)
mNB = MultinomialNB()
# roc_curve needs numeric labels; map once instead of once per fold.
y_binary = y.map({'ham': 0, 'spam': 1})
fpr_mean = np.linspace(0, 1.0, num=50)  # common x grid (false positive rate)
tpr_mean = np.zeros_like(fpr_mean)  # accumulated y values (true positive rate)
for i, (train, test) in enumerate(sKFold.split(X_tfidf, y), start=1):
    # split() yields positional indices, so index the Series with .iloc
    # rather than relying on the index labels matching positions.
    mNB.fit(X_tfidf[train], y.iloc[train])
    # classes_ is sorted, so column 1 is the probability of 'spam'.
    y_pred = mNB.predict_proba(X_tfidf[test])[:, 1]
    fpr, tpr, thresholds = roc_curve(y_binary.iloc[test], y_pred)
    # Resample this fold's curve onto the common grid and add its share.
    tpr_mean += np.interp(fpr_mean, fpr, tpr) / n_splits
    auc_ = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%d-Fold auc :%0.2f' % (i, auc_), alpha=0.4)
# Pin the end points so the mean curve runs from (0, 0) to (1, 1).
tpr_mean[0] = 0
tpr_mean[-1] = 1
auc_mean = auc(fpr_mean, tpr_mean)
plt.plot(fpr_mean, tpr_mean, label='Mean auc: %0.4f' % (auc_mean), color='red', lw=2)
plt.legend()
<matplotlib.legend.Legend at 0x24852db9f48>
output_16_1.png
# Stratified k-fold ROC evaluation of KNeighborsClassifier on the TF-IDF
# features.  One ROC curve is drawn per fold, plus the mean curve in red.
n_splits = 6  # single source of truth for the fold count and the averaging
sKFold = StratifiedKFold(n_splits=n_splits)
clf = KNeighborsClassifier()
# roc_curve needs numeric labels; map once instead of once per fold.
y_binary = y.map({'ham': 0, 'spam': 1})
fpr_mean = np.linspace(0, 1.0, num=50)  # common x grid (false positive rate)
tpr_mean = np.zeros_like(fpr_mean)  # accumulated y values (true positive rate)
for i, (train, test) in enumerate(sKFold.split(X_tfidf, y), start=1):
    # split() yields positional indices, so index the Series with .iloc
    # rather than relying on the index labels matching positions.
    clf.fit(X_tfidf[train], y.iloc[train])
    # classes_ is sorted, so column 1 is the probability of 'spam'.
    y_pred = clf.predict_proba(X_tfidf[test])[:, 1]
    fpr, tpr, thresholds = roc_curve(y_binary.iloc[test], y_pred)
    # Resample this fold's curve onto the common grid and add its share.
    tpr_mean += np.interp(fpr_mean, fpr, tpr) / n_splits
    auc_ = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%d-Fold auc :%0.2f' % (i, auc_), alpha=0.4)
# Pin the end points so the mean curve runs from (0, 0) to (1, 1).
tpr_mean[0] = 0
tpr_mean[-1] = 1
auc_mean = auc(fpr_mean, tpr_mean)
plt.plot(fpr_mean, tpr_mean, label='Mean auc: %0.4f' % (auc_mean), color='red', lw=2)
plt.legend()
<matplotlib.legend.Legend at 0x24852ee1288>
output_17_1.png
# Stratified k-fold ROC evaluation of LogisticRegression on the TF-IDF
# features.  One ROC curve is drawn per fold, plus the mean curve in red.
n_splits = 6  # single source of truth for the fold count and the averaging
sKFold = StratifiedKFold(n_splits=n_splits)
clf = LogisticRegression()
# roc_curve needs numeric labels; map once instead of once per fold.
y_binary = y.map({'ham': 0, 'spam': 1})
fpr_mean = np.linspace(0, 1.0, num=50)  # common x grid (false positive rate)
tpr_mean = np.zeros_like(fpr_mean)  # accumulated y values (true positive rate)
for i, (train, test) in enumerate(sKFold.split(X_tfidf, y), start=1):
    # split() yields positional indices, so index the Series with .iloc
    # rather than relying on the index labels matching positions.
    clf.fit(X_tfidf[train], y.iloc[train])
    # classes_ is sorted, so column 1 is the probability of 'spam'.
    y_pred = clf.predict_proba(X_tfidf[test])[:, 1]
    fpr, tpr, thresholds = roc_curve(y_binary.iloc[test], y_pred)
    # Resample this fold's curve onto the common grid and add its share.
    tpr_mean += np.interp(fpr_mean, fpr, tpr) / n_splits
    auc_ = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%d-Fold auc :%0.2f' % (i, auc_), alpha=0.4)
# Pin the end points so the mean curve runs from (0, 0) to (1, 1).
tpr_mean[0] = 0
tpr_mean[-1] = 1
auc_mean = auc(fpr_mean, tpr_mean)
plt.plot(fpr_mean, tpr_mean, label='Mean auc: %0.4f' % (auc_mean), color='red', lw=2)
plt.legend()
<matplotlib.legend.Legend at 0x24852dd6a08>
output_18_1.png
%%time
# Stratified k-fold ROC evaluation of ExtraTreesClassifier on the TF-IDF
# features.  One ROC curve is drawn per fold, plus the mean curve in red.
n_splits = 6  # single source of truth for the fold count and the averaging
sKFold = StratifiedKFold(n_splits=n_splits)
clf = ExtraTreesClassifier(n_jobs=-1)  # n_jobs=-1: use all available cores
# roc_curve needs numeric labels; map once instead of once per fold.
y_binary = y.map({'ham': 0, 'spam': 1})
fpr_mean = np.linspace(0, 1.0, num=50)  # common x grid (false positive rate)
tpr_mean = np.zeros_like(fpr_mean)  # accumulated y values (true positive rate)
for i, (train, test) in enumerate(sKFold.split(X_tfidf, y), start=1):
    # split() yields positional indices, so index the Series with .iloc
    # rather than relying on the index labels matching positions.
    clf.fit(X_tfidf[train], y.iloc[train])
    # classes_ is sorted, so column 1 is the probability of 'spam'.
    y_pred = clf.predict_proba(X_tfidf[test])[:, 1]
    fpr, tpr, thresholds = roc_curve(y_binary.iloc[test], y_pred)
    # Resample this fold's curve onto the common grid and add its share.
    tpr_mean += np.interp(fpr_mean, fpr, tpr) / n_splits
    auc_ = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%d-Fold auc :%0.2f' % (i, auc_), alpha=0.4)
# Pin the end points so the mean curve runs from (0, 0) to (1, 1).
tpr_mean[0] = 0
tpr_mean[-1] = 1
auc_mean = auc(fpr_mean, tpr_mean)
plt.plot(fpr_mean, tpr_mean, label='Mean auc: %0.4f' % (auc_mean), color='red', lw=2)
plt.legend()
Wall time: 14.9 s





<matplotlib.legend.Legend at 0x2484f21a108>
output_19_2.png
%%time
# Stratified k-fold ROC evaluation of GradientBoostingClassifier on the
# TF-IDF features.  One ROC curve is drawn per fold, plus the mean in red.
n_splits = 6  # single source of truth for the fold count and the averaging
sKFold = StratifiedKFold(n_splits=n_splits)
clf = GradientBoostingClassifier()  # gradient-boosted trees; cannot be trained in parallel
# roc_curve needs numeric labels; map once instead of once per fold.
y_binary = y.map({'ham': 0, 'spam': 1})
fpr_mean = np.linspace(0, 1.0, num=50)  # common x grid (false positive rate)
tpr_mean = np.zeros_like(fpr_mean)  # accumulated y values (true positive rate)
for i, (train, test) in enumerate(sKFold.split(X_tfidf, y), start=1):
    # split() yields positional indices, so index the Series with .iloc
    # rather than relying on the index labels matching positions.
    clf.fit(X_tfidf[train], y.iloc[train])
    # classes_ is sorted, so column 1 is the probability of 'spam'.
    y_pred = clf.predict_proba(X_tfidf[test])[:, 1]
    fpr, tpr, thresholds = roc_curve(y_binary.iloc[test], y_pred)
    # Resample this fold's curve onto the common grid and add its share.
    tpr_mean += np.interp(fpr_mean, fpr, tpr) / n_splits
    auc_ = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%d-Fold auc :%0.2f' % (i, auc_), alpha=0.4)
# Pin the end points so the mean curve runs from (0, 0) to (1, 1).
tpr_mean[0] = 0
tpr_mean[-1] = 1
auc_mean = auc(fpr_mean, tpr_mean)
plt.plot(fpr_mean, tpr_mean, label='Mean auc: %0.4f' % (auc_mean), color='red', lw=2)
plt.legend()
Wall time: 33.1 s





<matplotlib.legend.Legend at 0x2484e2d2588>
output_20_2.png
上一篇 下一篇

猜你喜欢

热点阅读