[Python与数据分析]-11LOF
2020-08-06 本文已影响0人
六千宛
@官方
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from scipy import stats
# Build the training set: two tight Gaussian clusters (inliers) plus
# uniformly scattered points (outliers) appended at the end.
n_samples = 200  # total number of samples
outliers_fraction = 0.25  # share of the samples that are outliers
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
rng = np.random.RandomState(42)
X = 0.3 * rng.randn(n_inliers // 2, 2)
X_train = np.r_[X + 2, X - 2]  # inliers, shifted to (2, 2) and (-2, -2)
# Draw the outliers from the seeded generator too, so every run produces the
# same data set (the global np.random state is not seeded here).
X_train = np.r_[X_train, rng.uniform(low=-6, high=6, size=(n_outliers, 2))]

# Fit the model and label the training samples.
clf = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
y_pred = clf.fit_predict(X_train)
scores_pred = clf.negative_outlier_factor_
# Score at the `outliers_fraction` percentile of the training scores; it is
# used below as the inlier/outlier boundary when drawing the plot.
threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)

# Score a regular grid so the level sets of the LOF score can be drawn.
# Scoring points that were not part of the fit requires an estimator created
# with novelty=True (scikit-learn >= 0.20); its score_samples() returns the
# negated LOF, i.e. the same scale as negative_outlier_factor_: the lower the
# value, the more likely the point is an outlier.
grid_clf = LocalOutlierFactor(n_neighbors=35,
                              contamination=outliers_fraction,
                              novelty=True)
grid_clf.fit(X_train)
xx, yy = np.meshgrid(np.linspace(-7, 7, 50), np.linspace(-7, 7, 50))
Z = grid_clf.score_samples(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Local Outlier Factor (LOF)")
# Outlier region: scores from the minimum up to the threshold.
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
             cmap=plt.cm.Blues_r)
# Boundary between the outlier region and the inlier region.
boundary = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2,
                       colors='red')
# Inlier region: scores from the threshold up to the maximum.
plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='palevioletred')
# The outliers are the last n_outliers rows of X_train.
inlier_points = plt.scatter(X_train[:-n_outliers, 0],
                            X_train[:-n_outliers, 1],
                            c='white', s=20, edgecolor='k')
outlier_points = plt.scatter(X_train[-n_outliers:, 0],
                             X_train[-n_outliers:, 1],
                             c='black', s=20, edgecolor='k')
plt.axis('tight')
plt.xlim((-7, 7))
plt.ylim((-7, 7))
# legend_elements() yields proxy artists for the contour lines; it replaces
# the ContourSet.collections attribute that recent Matplotlib dropped.
boundary_handle = boundary.legend_elements()[0][0]
plt.legend([boundary_handle, inlier_points, outlier_points],
           ['learned decision function', 'true inliers', 'true outliers'],
           loc="upper left")
plt.show()
图片.png
author: usst2019zp_l@163.com
#########################LOF_NJ#########################
# Pipeline: load two Excel sheets, give every id a per-row time index,
# extract tsfresh features per id, run LOF on the feature matrix and write
# the scores and labels to a new Excel file.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.neighbors import LocalOutlierFactor
from tsfresh import extract_features

######################## Data preparation ########################
df = pd.read_excel('4_NJ.xlsx', sheet_name='Sheet1')
data = pd.read_excel('OP011_NJ.xlsx', sheet_name='Sheet1')
# DataFrame.append was removed in pandas 2.0; concat keeps the same rows and
# the same (still duplicated) index, which is reset further down.
df = pd.concat([df, data])

# Distinct ids in order of first appearance; this gives the categories a
# deterministic order (a plain set() would order them arbitrarily).
class_l = list(pd.unique(df['id']))
# Turn the id column into an ordered-by-class_l categorical so that sorting
# groups all rows of one id together.
df['id'] = df['id'].astype('category')
# reorder_categories no longer accepts inplace=True; assign the result back.
df['id'] = df['id'].cat.reorder_categories(class_l)
# A stable sort is required here: the row order inside each id is what the
# time index below is derived from, so it must not be shuffled.
df.sort_values('id', kind='mergesort', inplace=True)
# Renumber the index 0..n-1 after the sort.
df.reset_index(drop=True, inplace=True)
# Running counter 1, 2, 3, ... restarting for every id.
df['time'] = df.groupby('id', observed=True).cumcount() + 1

####################### Feature extraction #########################
extracted_features = extract_features(df, column_id="id", column_sort="time")
# Work on a float copy so the clean-up below does not touch the DataFrame.
b = extracted_features.to_numpy(dtype=float, copy=True)

################### LOF anomaly detection ####################
# 1 - replace inf, -inf and NaN feature values with 0.
b[~np.isfinite(b)] = 0
# 2 - fit the model.
# NOTE(review): n_neighbors is hard-coded; presumably it was chosen for the
# size of this particular data set -- confirm before reusing on other data.
model = LocalOutlierFactor(n_neighbors=2518, contamination=0.1)
y_pred = model.fit_predict(b)
scores_pred = model.negative_outlier_factor_
# 3 - store the detection result in a new table, one row per id.
g = pd.DataFrame(
    {
        'id': extracted_features.index.values,
        'scores': scores_pred,
        'anomaly': y_pred,
    },
    columns=['id', 'scores', 'anomaly'],
)
g.to_excel('NJ_LOF_tsfresh.xlsx')