Data Preprocessing with sklearn
2022-06-18
万州客
The book 《机器学习:从入门到入职》 is quite good, and its code runs through smoothly. This post is a record of working through it.
1. Code
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction import image
# Data preprocessing
# # Missing-value handling
# ## Mean, median, constant, most frequent
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imp_constant = SimpleImputer(missing_values=np.nan, strategy='constant')  # fill_value defaults to 0 for numeric data
imp_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X = [[13, 22], [5, 3], [7, np.nan], [np.nan, 5], [3, 7]]
imp_mean.fit(X)
imp_median.fit(X)
imp_constant.fit(X)
imp_most_frequent.fit(X)
print('Imputation with the mean:')
print(imp_mean.transform(X))
print('Imputation with the median:')
print(imp_median.transform(X))
print('Imputation with a constant:')
print(imp_constant.transform(X))
print('Imputation with the most frequent value:')
print(imp_most_frequent.transform(X))
# # Data normalization
# ## Scaling
# ### Min-max scaling
min_max_scaler = preprocessing.MinMaxScaler()
X_min_max = min_max_scaler.fit_transform(X)
print('Min-max scaling result:')
print(X_min_max)
print('Per-feature scaling factor (scale_):')
print(min_max_scaler.scale_)
print('Per-feature offset (min_):')
print(min_max_scaler.min_)
print('Per-feature data minimum (data_min_):')
print(min_max_scaler.data_min_)
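# For MinMaxScaler, scale_ = (range_max - range_min) / (data_max_ - data_min_)
# and min_ = range_min - data_min_ * scale_, so that X_scaled = X * scale_ + min_.
# Here the first column gives scale_ = 1 / (13 - 3) = 0.1 and min_ = 0 - 3 * 0.1 = -0.3.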
min_max_scaler_c = preprocessing.MinMaxScaler(feature_range=(-2, 6))
X_min_max_c = min_max_scaler_c.fit_transform(X)
print('Custom feature range (-2, 6):')
print(X_min_max_c)
# ### Max-abs scaling (MaxAbsScaler)
max_abs_scaler = preprocessing.MaxAbsScaler()
X_max_abs = max_abs_scaler.fit_transform(X)
print('Max-abs scaling:')
print(X_max_abs)
# ## Standardization: z-score normalization (StandardScaler)
standard_scaler = preprocessing.StandardScaler()
X_standard = standard_scaler.fit_transform(X)
print('Values after standardization:')
print(X_standard)
print('Column means after standardization:')
print(X_standard.mean(axis=0))
print('Column standard deviations after standardization:')
print(X_standard.std(axis=0))
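# Note: the NaN entries still present in X pass through StandardScaler untouched,
# which is why the column means and standard deviations above print as nan.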
X = np.array([[3, -2., 2.], [2., 0., 0.], [-1, 1., 3.]])
# ## Norm normalization
# ### Function API
X_norm = preprocessing.normalize(X, norm='l2')
print('Output after L2-norm normalization:')
print(X_norm)
# ### Class API
normalizer = preprocessing.Normalizer(norm='l1').fit(X)
print('Output after L1-norm normalization:')
print(normalizer.transform(X))
# # Nonlinear transformations
# ## Binarization
binarizer = preprocessing.Binarizer(threshold=1)
X_binarizer = binarizer.fit_transform(X)
print('Output after binarization (threshold=1):')
print(X_binarizer)
# ## Quantile transformation (QuantileTransformer)
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)  # transform only: never refit the scaler on the test set
print('Five-number summary of the training set (first feature):')
print(np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]))
print('Training-set maximum, the fifth of the five numbers:')
print(max(X_train[:, 0]))
print('Training-set median, the third of the five numbers:')
print(np.median(X_train[:, 0]))
print('Training-set minimum, the first of the five numbers:')
print(min(X_train[:, 0]))
# ## Power transformation (Box-Cox, Yeo-Johnson)
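# The book names power transforms here without code; a minimal added sketch
# (it prints nothing, so the recorded output below is unchanged). Yeo-Johnson
# accepts zero and negative values, while Box-Cox requires strictly positive input.
power_transformer = preprocessing.PowerTransformer(method='yeo-johnson')
X_train_power = power_transformer.fit_transform(X_train)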
# ## Polynomial transformation (PolynomialFeatures)
X = np.arange(6).reshape(3, 2)
print(X)
poly = PolynomialFeatures()
print('Data after a degree-2 polynomial expansion:')
print(poly.fit_transform(X))
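# The degree-2 columns are [1, x1, x2, x1^2, x1*x2, x2^2]; e.g. the row for
# the sample (2, 3) comes out as [1, 2, 3, 4, 6, 9].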
poly = PolynomialFeatures(degree=3, interaction_only=True)
print('Data after a degree-3 expansion keeping only interaction terms:')
print(poly.fit_transform(X))
# # Custom preprocessing
def custom_function(x):
    return x * x - 2 * x + 1
transformer = FunctionTransformer(custom_function, validate=True)
print('Data after the custom transformation:')
print(transformer.transform(X))
# # Preprocessing unstructured data
# # Text data
# ## Word segmentation
seg_list = jieba.cut('机器学习从入门到入职', cut_all=True)
print('Full-mode segmentation:')
print(','.join(seg_list))
seg_list = jieba.cut('机器学习从入门到入职', cut_all=False)
print('Default (precise-mode) segmentation:')
print(','.join(seg_list))
seg_words = jieba.lcut('机器学习从入门到入职')  # avoid shadowing the builtin list
print('lcut returns a list directly:')
print(seg_words)
# ## Processing already-extracted data
# ### One-hot encoding (DictVectorizer)
city_dict = [{'city': 'beijing'}, {'city': 'wuhan'}, {'city': 'shenzhen'}]
vec = DictVectorizer()
print('Result of fitting and vectorizing the dicts:')
print(vec.fit_transform(city_dict).toarray())
city_dict_new = [{'city': 'beijing'}, {'city': 'wuhan'}, {'city': 'shenzhen'}]
print('Result of transforming new dicts with the fitted vectorizer:')
print(vec.transform(city_dict_new).toarray())
# ### The hashing trick (FeatureHasher)
text_all = ['A man came home form work late, tired and found his 5 years old son waiting for him at the door. ']
text_1 = ['A man came home form work late']
text_2 = ['tired and found his 5 years old son waiting for him at the door']
text_fake = ['A good book may be among the best of friends']
vectorizer = CountVectorizer()
hv_positive = FeatureHasher(n_features=6, input_type='string')
hv = FeatureHasher(n_features=6, input_type='string')
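# Note: hv_positive and hv are configured identically, so their outputs below
# match. For genuinely non-negative hashed features, FeatureHasher accepts
# alternate_sign=False (an added sketch, not used in the recorded run):
hv_unsigned = FeatureHasher(n_features=6, input_type='string', alternate_sign=False)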
text_name = ['text_1', 'text_2', 'text_fake']
def show_transform_code(transformer, text_list=[text_1, text_2, text_fake]):
    transformer = transformer.fit(text_all)
    dist = [transformer.transform(x).toarray()[0] for x in text_list]
    for j, i in enumerate(dist):
        if j == 1:
            # text_2 serves as the reference point for the distances below
            print('text_2 is the reference text')
            continue
        ed = euclidean_distances([dist[1]], [i])
        print('Encoding of {}:'.format(text_name[j]))
        print(i)
        print('Euclidean distance between {} and text_2:'.format(text_name[j]))
        print(ed[0][0])
        print('------------------------')
    print('=======================================')
print('Count-vectorizer encoding:')
show_transform_code(vectorizer)
print('Unsigned feature hashing (as configured, identical to the signed case):')
show_transform_code(hv_positive)
print('Signed feature hashing:')
show_transform_code(hv)
# ## Text feature extraction
# ### Count (word-frequency) vectorization
text_all = [
'A man came home form work late, tired and found his 5 years old son waiting for him at the door. ',
'A good book may be among the best of friends'
]
vectorizer_origin = CountVectorizer()
vectorizer_analyzer = CountVectorizer(analyzer='char')
vectorizer_stop = CountVectorizer(stop_words=['is', 'I', 'it'])
vectorizer_counter_filter = CountVectorizer(max_df=0.5, min_df=0.1)
vectorizer_ngram_range = CountVectorizer(ngram_range=(1, 3), stop_words=['is', 'I', 'it', 'good'])
strategy = {
    'the plain count vectorizer': vectorizer_origin,
    'the character-level count vectorizer': vectorizer_analyzer,
    'the count vectorizer with stop words': vectorizer_stop,
    'the count vectorizer filtering high/low document frequencies': vectorizer_counter_filter,
    'the n-gram count vectorizer': vectorizer_ngram_range
}
def show_transformed_data(strategy=strategy):
    for k, v in strategy.items():
        v.fit(text_all)
        print('Features of {}:'.format(k))
        print(v.get_feature_names())  # get_feature_names_out() from sklearn 1.0 on
show_transformed_data()
# ### TF-IDF (term frequency-inverse document frequency)
transformer = TfidfTransformer(smooth_idf=False)
tfidVector = TfidfVectorizer(smooth_idf=False)
text = ['this is the dog']
print('Feature extraction with the TF-IDF transformer')
# TfidfTransformer expects a term-count matrix (e.g. from CountVectorizer),
# not raw text, so the call below would raise an error:
# print(transformer.fit_transform(text))
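# An added sketch of the intended transformer usage (it runs silently, so the
# recorded output below is unchanged): feed it counts from a CountVectorizer.
# With smooth_idf=False and a single document, every idf is 1, so the resulting
# weights equal the vectorizer's 0.5 values shown below.
counts = CountVectorizer().fit_transform(text)
tfidf_counts = transformer.fit_transform(counts)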
print('Feature extraction with the TF-IDF vectorizer')
print(tfidVector.fit_transform(text))
# ### Custom tokenizer and preprocessor
def my_tokenizer(s):
    return jieba.lcut(s)
def my_preprocessor(s):
    return s.replace('机器', '深度')
vectorizer_origin = CountVectorizer()
vectorizer_custom = CountVectorizer(tokenizer=my_tokenizer)
vectorizer_custom_preprocess = CountVectorizer(preprocessor=my_preprocessor, tokenizer=my_tokenizer)
vectorizer_origin.fit(['机器学习从入门到入职'])
vectorizer_custom.fit(['机器学习从入门到入职'])
vectorizer_custom_preprocess.fit(['机器学习从入门到入职'])
print('Features from the default count vectorizer:')
print(vectorizer_origin.get_feature_names())
print('Features with the custom tokenizer:')
print(vectorizer_custom.get_feature_names())
print('Features with the custom tokenizer and preprocessor:')
print(vectorizer_custom_preprocess.get_feature_names())
# # Image feature extraction
one_image = np.arange(3 * 3 * 3).reshape((3, 3, 3))
print('Green channel of a 3*3 RGB image:')
print(one_image[:, :, 1])
patcher = image.extract_patches_2d(one_image, (2, 2), max_patches=4, random_state=0)
print('Green channel of the four 2*2 patches:')
print(patcher[:, :, :, 1])  # patches have shape (n_patches, 2, 2, 3), so the channel is the last axis
2. Output
Imputation with the mean:
[[13. 22. ]
[ 5. 3. ]
[ 7. 9.25]
[ 7. 5. ]
[ 3. 7. ]]
Imputation with the median:
[[13. 22.]
[ 5. 3.]
[ 7. 6.]
[ 6. 5.]
[ 3. 7.]]
Imputation with a constant:
[[13. 22.]
[ 5. 3.]
[ 7. 0.]
[ 0. 5.]
[ 3. 7.]]
Imputation with the most frequent value:
[[13. 22.]
[ 5. 3.]
[ 7. 3.]
[ 3. 5.]
[ 3. 7.]]
Min-max scaling result:
[[1. 1. ]
[0.2 0. ]
[0.4 nan]
[ nan 0.10526316]
[0. 0.21052632]]
Per-feature scaling factor (scale_):
[0.1 0.05263158]
Per-feature offset (min_):
[-0.3 -0.15789474]
Per-feature data minimum (data_min_):
[3. 3.]
Custom feature range (-2, 6):
[[ 6. 6. ]
[-0.4 -2. ]
[ 1.2 nan]
[ nan -1.15789474]
[-2. -0.31578947]]
Max-abs scaling:
[[1. 1. ]
[0.38461538 0.13636364]
[0.53846154 nan]
[ nan 0.22727273]
[0.23076923 0.31818182]]
Values after standardization:
[[ 1.60356745 1.70094523]
[-0.53452248 -0.83379668]
[ 0. nan]
[ nan -0.56698174]
[-1.06904497 -0.30016681]]
Column means after standardization:
[nan nan]
Column standard deviations after standardization:
[nan nan]
Output after L2-norm normalization:
[[ 0.72760688 -0.48507125 0.48507125]
[ 1. 0. 0. ]
[-0.30151134 0.30151134 0.90453403]]
Output after L1-norm normalization:
[[ 0.42857143 -0.28571429 0.28571429]
[ 1. 0. 0. ]
[-0.2 0.2 0.6 ]]
Output after binarization (threshold=1):
[[1. 0. 1.]
[1. 0. 0.]
[0. 0. 1.]]
Five-number summary of the training set (first feature):
[4.3 5.1 5.8 6.5 7.9]
Training-set maximum, the fifth of the five numbers:
7.9
Training-set median, the third of the five numbers:
5.8
Training-set minimum, the first of the five numbers:
4.3
[[0 1]
[2 3]
[4 5]]
Data after a degree-2 polynomial expansion:
[[ 1. 0. 1. 0. 0. 1.]
[ 1. 2. 3. 4. 6. 9.]
[ 1. 4. 5. 16. 20. 25.]]
Data after a degree-3 expansion keeping only interaction terms:
[[ 1. 0. 1. 0.]
[ 1. 2. 3. 6.]
[ 1. 4. 5. 20.]]
Data after the custom transformation:
[[ 1 0]
[ 1 4]
[ 9 16]]
Full-mode segmentation:
机器,学习,从,入门,到,入,职
Default (precise-mode) segmentation:
机器,学习,从,入门,到,入职
lcut returns a list directly:
['机器', '学习', '从', '入门', '到', '入职']
Result of fitting and vectorizing the dicts:
[[1. 0. 0.]
[0. 0. 1.]
[0. 1. 0.]]
Result of transforming new dicts with the fitted vectorizer:
[[1. 0. 0.]
[0. 0. 1.]
[0. 1. 0.]]
Count-vectorizer encoding:
Encoding of text_1:
[0 0 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0]
Euclidean distance between text_1 and text_2:
4.358898943540674
------------------------
text_2 is the reference text
Encoding of text_fake:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
Euclidean distance between text_fake and text_2:
3.4641016151377544
------------------------
=======================================
Unsigned feature hashing (as configured, identical to the signed case):
Encoding of text_1:
[-1. 7. 10. -1. 5. -2.]
Euclidean distance between text_1 and text_2:
16.703293088490067
------------------------
text_2 is the reference text
Encoding of text_fake:
[-2. 13. 9. -5. 4. -3.]
Euclidean distance between text_fake and text_2:
11.61895003862225
------------------------
=======================================
Signed feature hashing:
Encoding of text_1:
[-1. 7. 10. -1. 5. -2.]
Euclidean distance between text_1 and text_2:
16.703293088490067
------------------------
text_2 is the reference text
Encoding of text_fake:
[-2. 13. 9. -5. 4. -3.]
Euclidean distance between text_fake and text_2:
11.61895003862225
------------------------
=======================================
Features of the plain count vectorizer:
['among', 'and', 'at', 'be', 'best', 'book', 'came', 'door', 'for', 'form', 'found', 'friends', 'good', 'him', 'his', 'home', 'late', 'man', 'may', 'of', 'old', 'son', 'the', 'tired', 'waiting', 'work', 'years']
Features of the character-level count vectorizer:
[' ', ',', '.', '5', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'w', 'y']
Features of the count vectorizer with stop words:
['among', 'and', 'at', 'be', 'best', 'book', 'came', 'door', 'for', 'form', 'found', 'friends', 'good', 'him', 'his', 'home', 'late', 'man', 'may', 'of', 'old', 'son', 'the', 'tired', 'waiting', 'work', 'years']
Features of the count vectorizer filtering high/low document frequencies:
['among', 'and', 'at', 'be', 'best', 'book', 'came', 'door', 'for', 'form', 'found', 'friends', 'good', 'him', 'his', 'home', 'late', 'man', 'may', 'of', 'old', 'son', 'tired', 'waiting', 'work', 'years']
Features of the n-gram count vectorizer:
['among', 'among the', 'among the best', 'and', 'and found', 'and found his', 'at', 'at the', 'at the door', 'be', 'be among', 'be among the', 'best', 'best of', 'best of friends', 'book', 'book may', 'book may be', 'came', 'came home', 'came home form', 'door', 'for', 'for him', 'for him at', 'form', 'form work', 'form work late', 'found', 'found his', 'found his years', 'friends', 'him', 'him at', 'him at the', 'his', 'his years', 'his years old', 'home', 'home form', 'home form work', 'late', 'late tired', 'late tired and', 'man', 'man came', 'man came home', 'may', 'may be', 'may be among', 'of', 'of friends', 'old', 'old son', 'old son waiting', 'son', 'son waiting', 'son waiting for', 'the', 'the best', 'the best of', 'the door', 'tired', 'tired and', 'tired and found', 'waiting', 'waiting for', 'waiting for him', 'work', 'work late', 'work late tired', 'years', 'years old', 'years old son']
Feature extraction with the TF-IDF transformer
Feature extraction with the TF-IDF vectorizer
(0, 0) 0.5
(0, 2) 0.5
(0, 1) 0.5
(0, 3) 0.5
Features from the default count vectorizer:
['机器学习从入门到入职']
Features with the custom tokenizer:
['从', '入职', '入门', '到', '学习', '机器']
Features with the custom tokenizer and preprocessor:
['从', '入职', '入门', '到', '学习', '深度']
Green channel of a 3*3 RGB image:
[[ 1 4 7]
[10 13 16]
[19 22 25]]
Green channel of the four 2*2 patches:
[[[ 4  7]
  [13 16]]

 [[13 16]
  [22 25]]

 [[13 16]
  [22 25]]

 [[ 4  7]
  [13 16]]]