39. 日月光华 Python Data Analysis - Machine Learning - Natural Language Processing
2023-08-17
薛东弗斯
import numpy as np
import pandas as pd

# Load the airline tweets dataset and keep only the sentiment label and the raw text
data = pd.read_csv('./Tweets.csv')
data = data[['airline_sentiment', 'text']]
data.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 14640 entries, 0 to 14639
# Data columns (total 2 columns):
#  #   Column             Non-Null Count  Dtype
# ---  ------             --------------  -----
#  0   airline_sentiment  14640 non-null  object
#  1   text               14640 non-null  object
# dtypes: object(2)
# memory usage: 228.9+ KB
data.airline_sentiment.unique()
# array(['neutral', 'positive', 'negative'], dtype=object)
data.airline_sentiment.value_counts()
# negative    9178
# neutral     3099
# positive    2363
# Name: airline_sentiment, dtype: int64
import re
token = re.compile(r'[A-Za-z]+|[!?.:,()]')  # keep only runs of English letters and common punctuation marks

def extract_text(text):
    new_text = token.findall(text)
    new_text = ' '.join([x.lower() for x in new_text])
    return new_text

x = data.text.apply(extract_text)
y = data.airline_sentiment
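# Quick sanity check of the tokenizer on a made-up tweet (the sample text below
# is hypothetical, not taken from the dataset):
extract_text('@united Thanks!! Flight was great :) #happy')
# -> 'united thanks ! ! flight was great : ) happy'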
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)
x_train.shape, x_test.shape
# ((10980,), (3660,))
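# Note: the split above is different on every run. For a reproducible, class-balanced
# split one could pass extra arguments (an optional tweak, not part of the original notes):
# x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)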
from sklearn.feature_extraction.text import TfidfVectorizer
# unigrams to trigrams, drop English stop words, ignore terms appearing in fewer than 3 tweets
vect = TfidfVectorizer(ngram_range=(1, 3), stop_words='english', min_df=3)
x_train_vect = vect.fit_transform(x_train)
x_train_vect
# <10980x7749 sparse matrix of type '<class 'numpy.float64'>'
# with 116661 stored elements in Compressed Sparse Row format>
x_test_vect = vect.transform(x_test)
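# Optional peek at the learned vocabulary (a rough sketch; the exact terms and their
# count depend on the random split):
len(vect.vocabulary_)       # 7749 for the run above, matching the matrix shape
list(vect.vocabulary_)[:5]  # a few of the extracted unigram/bigram/trigram features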
from sklearn.ensemble import RandomForestClassifier
# Random forest baseline with default hyperparameters
model = RandomForestClassifier()
model.fit(x_train_vect, y_train)
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
# max_depth=None, max_features='auto', max_leaf_nodes=None,
# min_impurity_decrease=0.0, min_impurity_split=None,
# min_samples_leaf=1, min_samples_split=2,
# min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
# oob_score=False, random_state=None, verbose=0,
# warm_start=False)
model.score(x_train_vect, y_train)
# 0.970856102003643
model.score(x_test_vect, y_test)
# 0.7344262295081967
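# The gap between the train score (~0.97) and the test score (~0.73) points to
# overfitting, and accuracy alone hides how the minority classes (neutral, positive)
# fare on this imbalanced dataset. A per-class breakdown is a quick check
# (a sketch; exact numbers depend on the split):
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(x_test_vect)))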
# Naive Bayes approach
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=0.0001)
model.fit(x_train_vect, y_train)
# MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)
model.score(x_train_vect, y_train)
# 0.8913479052823315  # training score is lower than the random forest's
model.score(x_test_vect, y_test)
# 0.7516393442622951  # test score beats the random forest, so Naive Bayes does better overall here
# Search for the best alpha value
test_score = []
alpha_ = np.linspace(0.00001, 0.01, 100)
for a in alpha_:
    model = MultinomialNB(alpha=a)
    model.fit(x_train_vect, y_train)
    test_score.append(model.score(x_test_vect, y_test))
max_score = max(test_score)
max_score
# 0.7576502732240438
index = test_score.index(max_score)
index
# 98
alpha_[index]
# 0.00989909090909091
best_alpha = alpha_[index]
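# Refit on the training set with the alpha found above and evaluate once more
# (a natural last step, sketched here; it is not in the original notes):
model = MultinomialNB(alpha=best_alpha)
model.fit(x_train_vect, y_train)
model.score(x_test_vect, y_test)
# should reproduce max_score (~0.7577), since alpha was chosen on this same test set;
# a cross-validated search (e.g. GridSearchCV) would give a less optimistic estimate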