使用sklearn库:KNN算法

2020-05-29  本文已影响0人  还闹不闹

模型参数

def KNeighborsClassifier(n_neighbors = 5, weights='uniform', algorithm = 'auto', leaf_size = 30, p = 2, metric = 'minkowski', metric_params = None, n_jobs = None)

案例


数据集

hua_se  huaban_yeshu    huaban_type
101 1   3
102 1   3
103 2   3
104 1   3
105 3   3
106 1   3
107 3   3
109 4   3
110 2   3
101 27  3
102 28  3
103 28  3
104 29  3
105 2   3
106 27  3
107 29  3
109 30  3
110 30  3
101 4   3
102 4   3
103 3   3
104 2   3
105 1   3
106 1   3
107 2   3
109 2   3
110 4   3
101 29  3
102 30  3
103 30  3
104 29  3
105 27  3
106 28  3
107 29  3
109 29  3
110 30  3
1   1   1
2   2   1
3   3   1
4   1   1
5   4   1
6   3   1
7   4   1
9   2   1
8   3   1
10  1   1
6   2   1
7   3   1
3   1   1
5   3   1
5   4   1
2   3   1
3   2   1
2   3   1
2   2   1
10  1   1
9   3   1
7   1   1
9   4   1
4   3   1
6   3   1
3   1   1
7   1   1
1   2   1
8   4   1
10  4   1
10  27  2
9   29  2
8   29  2
7   30  2
5   29  2
6   27  2
4   27  2
3   28  2
1   29  2
2   29  2
10  29  2
9   29  2
8   30  2
7   30  2
5   27  2
6   28  2
4   28  2
3   29  2
1   27  2
2   30  2
10  30  2
9   29  2
8   30  2
7   28  2
5   29  2
6   30  2
4   30  2
3   30  2
1   27  2
2   29  2
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone `joblib` package and only fall back to the
# vendored copy on old installations, so the name `joblib` is always bound.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib


# Show every column when printing a DataFrame.
pd.set_option('display.max_columns', None)
# Show every row when printing a DataFrame.
pd.set_option('display.max_rows', None)
# Allow printed output to be up to 10000 characters wide so that wide frames
# are not wrapped onto several lines.
pd.set_option('display.width', 10000)
# Account for double-width (East Asian / ambiguous) characters when aligning
# printed columns.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Let NumPy print long arrays on a single line as well.
np.set_printoptions(linewidth=1000)

# Load the training data: tab-separated, GBK-encoded, first row is the header.
# A raw string keeps the Windows backslashes literal; the earlier mixed
# escaping relied on the invalid escape sequences "\s" and "\d".
df = pd.read_csv(r'G:\rasa_demo\stack\data\train.csv.txt', sep='\t', encoding='GBK', header=0)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Optional: impute missing values with the most frequent value.
# imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent', copy=True)
# df = imp.fit_transform(df)
# df = pd.DataFrame(df)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # Check whether the data contains missing values; any of the checks below works.
# # False: the feature has no missing values
# # True: the feature has missing values
# print(df.isnull().any())
# print(np.isnan(df).any())
# print(np.isfinite(df).all())
# # # Inspect the records that contain missing values
# # df_null = pd.isnull(df)
# # df_null = df[df_null == True]
# # print(df_null)
# # Handle missing values; either approach below works.
# # Drop the rows that contain missing values
# df.dropna(inplace=True)
# # # Fill the missing values
# # df.fillna('10.0')
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# print(df.columns)
# print(df[['hua_se','huaban_yeshu']])
# print(df.iloc[:,[0,1]])
print(df.iloc[0:3, [2]])

# Features are the first two columns; the label is the third column.
X = df.iloc[:, [0, 1]]
Y = df.iloc[:, [2]]
print(X.shape, Y.shape)

# Flatten the single-column label frame to a 1-D array, which is the shape
# passed to train_test_split and then to fit() below.
# Y = Y.values.reshape(-1,1)
labels = Y.values.ravel()
print(labels)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.2, random_state=35)
# print(type(X_test))

# Fit a 3-nearest-neighbours classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)

# Accuracy on the held-out split: the fraction of predictions equal to the
# true labels, counted with one vectorised comparison.
result = knn.predict(X_test)
count = int(np.sum(result == Y_test))
print('准确率:', float(count) / len(Y_test))

# # Persist the trained model
# joblib.dump(knn, r'G:\rasa_demo\stack\model\model.pkl') # .pkl is the customary extension for a saved sklearn model
# print('模型保存成功!')
# # Load the previously trained model
# knn_model = joblib.load(r'G:\rasa_demo\stack\model\model.pkl')
# pred_y = knn_model.predict(X_test)

# Classify one new sample. It is wrapped in a one-row DataFrame keyed by the
# feature names 'hua_se' and 'huaban_yeshu'.
pred_x = {'hua_se': 12, 'huaban_yeshu': 27}
tmp = pd.DataFrame(pred_x, index=[0])
print(tmp)
print(knn.predict(tmp))

拓展阅读:
http://www.statr.cn/?p=351

上一篇 下一篇

猜你喜欢

热点阅读