# 2019-08-12
import pandas as pd
import pandas_profiling as ppf  # exploratory data analysis (EDA)

# Load the training set from the CSV file at `train_path`.
train_data = pd.read_csv(train_path)
train_data.columns
# NOTE(review): `test_data` is never assigned in the visible code -- it
# presumably has to be loaded before this line; confirm.
test_data.head()
# Build the profiling report for the training data.
ppf.ProfileReport(train_data)
# Summary statistics of the SalePrice column.
train_data['SalePrice'].describe()
print('train_data_skew:%f' % train_data['SalePrice'].skew())  # skewness
print('train_data_kurt:%f' % train_data['SalePrice'].kurt())  # kurtosis
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution plot of the SalePrice column.
sns.distplot(train_data['SalePrice'])

# Relationship plot: scatter SalePrice against GrLivArea to look for a
# linear relationship.
var = 'GrLivArea'
data = train_data[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

# Box plot of SalePrice for each OverallQual value, used to spot outliers.
var = 'OverallQual'
data = train_data[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
# `fig` is the object returned by sns.boxplot; clamp its y-range.
fig.axis(ymin=0, ymax=800000);
import numpy as np

# Remove outliers: rows with GrLivArea above 4000 but SalePrice below 300000.
train_data.drop(
    train_data[(train_data['GrLivArea'] > 4000) & (train_data['SalePrice'] < 300000)].index,
    inplace=True,
)

# Correlation matrix of the training data (the frame is called `train_data`
# everywhere else in this file; `df_train` was never defined).
# Restrict to numeric columns explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas versions that no longer drop them silently.
corrmat = train_data.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)

k = 10  # number of variables for heatmap
# The k columns whose correlation with SalePrice is largest.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# np.corrcoef treats each row as one variable, hence the transpose.
cm = np.corrcoef(train_data[cols].values.T)
print('cm>>', cm)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()
# Data cleaning: missing values can be filled, dropped, or left untouched.
# Count the missing values of `full` (per column, assuming `full` is a DataFrame).
miss = full.isna().sum()
# Keep only the entries with at least one missing value, smallest count first.
miss.loc[miss > 0].sort_values()
# Fill object, int, and float columns separately:
# NOTE(review): `col` is not defined in the visible code -- presumably this
# line belongs inside a loop over the object-dtype columns; confirm.
# Assign the filled column back instead of calling fillna(inplace=True) on
# the selection: the in-place form operates on an intermediate object and
# does not reliably update `full` (chained assignment).
full[col] = full[col].fillna('None')
# Fill missing LotFrontage values with the column mean (Series.mean skips NaN).
full['LotFrontage'] = full['LotFrontage'].fillna(full['LotFrontage'].mean())