39. 日月光华 Python Data Analysis - Machine Learning - Natural Language Processing
2023-08-17
薛东弗斯
import numpy as np
import pandas as pd

# Load the airline tweets dataset and keep only the sentiment label and the raw text
data = pd.read_csv('./Tweets.csv')
data = data[['airline_sentiment', 'text']]
data.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 14640 entries, 0 to 14639
# Data columns (total 2 columns):
#  #   Column             Non-Null Count  Dtype
# ---  ------             --------------  -----
#  0   airline_sentiment  14640 non-null  object
#  1   text               14640 non-null  object
# dtypes: object(2)
# memory usage: 228.9+ KB
data.airline_sentiment.unique()
# array(['neutral', 'positive', 'negative'], dtype=object)
data.airline_sentiment.value_counts()
# negative    9178
# neutral     3099
# positive    2363
# Name: airline_sentiment, dtype: int64
import re
token = re.compile(r'[A-Za-z]+|[!?.:,()]')  # keep only runs of English letters and common punctuation marks

def extract_text(text):
    new_text = token.findall(text)
    new_text = ' '.join([x.lower() for x in new_text])
    return new_text

x = data.text.apply(extract_text)
y = data.airline_sentiment
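# Quick sanity check of the tokenizer on a made-up tweet (the sample text below
# is hypothetical, not taken from the dataset):
extract_text('@united Thanks!! Flight was great :) #happy')
# -> 'united thanks ! ! flight was great : ) happy'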
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)
x_train.shape, x_test.shape
# ((10980,), (3660,))
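# Note: the split above is different on every run. For a reproducible, class-balanced
# split one could pass extra arguments (an optional tweak, not part of the original notes):
# x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)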
from sklearn.feature_extraction.text import TfidfVectorizer
# unigrams to trigrams, drop English stop words, ignore terms appearing in fewer than 3 tweets
vect = TfidfVectorizer(ngram_range=(1, 3), stop_words='english', min_df=3)
x_train_vect = vect.fit_transform(x_train)
x_train_vect
# <10980x7749 sparse matrix of type '<class 'numpy.float64'>'
# with 116661 stored elements in Compressed Sparse Row format>
x_test_vect = vect.transform(x_test)
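# Optional peek at the learned vocabulary (a rough sketch; the exact terms and their
# count depend on the random split):
len(vect.vocabulary_)       # 7749 for the run above, matching the matrix shape
list(vect.vocabulary_)[:5]  # a few of the extracted unigram/bigram/trigram features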
from sklearn.ensemble import RandomForestClassifier
# Random forest baseline with default hyperparameters
model = RandomForestClassifier()
model.fit(x_train_vect, y_train)
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
# max_depth=None, max_features='auto', max_leaf_nodes=None,
# min_impurity_decrease=0.0, min_impurity_split=None,
# min_samples_leaf=1, min_samples_split=2,
# min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
# oob_score=False, random_state=None, verbose=0,
# warm_start=False)
model.score(x_train_vect, y_train)
# 0.970856102003643
model.score(x_test_vect, y_test)
# 0.7344262295081967
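# The gap between the train score (~0.97) and the test score (~0.73) points to
# overfitting, and accuracy alone hides how the minority classes (neutral, positive)
# fare on this imbalanced dataset. A per-class breakdown is a quick check
# (a sketch; exact numbers depend on the split):
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(x_test_vect)))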
# Naive Bayes approach
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=0.0001)
model.fit(x_train_vect, y_train)
# MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)
model.score(x_train_vect, y_train)
# 0.8913479052823315  # training score is lower than the random forest's
model.score(x_test_vect, y_test)
# 0.7516393442622951  # test score beats the random forest, so Naive Bayes does better overall here
# Search for the best alpha value
test_score = []
alpha_ = np.linspace(0.00001, 0.01, 100)
for a in alpha_:
    model = MultinomialNB(alpha=a)
    model.fit(x_train_vect, y_train)
    test_score.append(model.score(x_test_vect, y_test))
max_score = max(test_score)
max_score
# 0.7576502732240438
index = test_score.index(max_score)
index
# 98
alpha_[index]
# 0.00989909090909091
best_alpha = alpha_[index]
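# Refit on the training set with the alpha found above and evaluate once more
# (a natural last step, sketched here; it is not in the original notes):
model = MultinomialNB(alpha=best_alpha)
model.fit(x_train_vect, y_train)
model.score(x_test_vect, y_test)
# should reproduce max_score (~0.7577), since alpha was chosen on this same test set;
# a cross-validated search (e.g. GridSearchCV) would give a less optimistic estimate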