2019-08-12

2019-08-13  本文已影响0人  发现一个喜悦的地方

import pandas as pd

# Load the training set. `train_path` is not defined in this excerpt and must
# point at the training CSV before this line runs.
train_data = pd.read_csv(train_path)

# Notebook-style inspection: the column labels of the training frame.
train_data.columns

# NOTE(review): `test_data` is never assigned in this excerpt - presumably it
# is loaded from the test CSV the same way as `train_data`; confirm.
test_data.head()


import pandas_profiling as ppf ## Exploratory data analysis (EDA)

# Build a profiling report for the training frame. This is a bare expression,
# so the report is only rendered where the environment displays expression
# values (e.g. the last statement of a notebook cell).
ppf.ProfileReport(train_data)


# Descriptive statistics of the target column. As a bare expression the
# result is only shown where the environment echoes expression values.
sale_price = train_data['SalePrice']
sale_price.describe()

# Skewness of the sale-price distribution.
print(f'train_data_skew:{sale_price.skew():f}')

# Kurtosis of the sale-price distribution.
print(f'train_data_kurt:{sale_price.kurt():f}')


import seaborn as sns

# Plot the distribution of the sale prices.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot`/`displot` once the minimum supported seaborn version is confirmed.
price_column = train_data['SalePrice']
sns.distplot(price_column)


import matplotlib.pyplot as plt  # relationship plots (linear relationship)

# Scatter plot of above-ground living area against sale price.
var = 'GrLivArea'
data = train_data[['SalePrice', var]]
data.plot(kind='scatter', x=var, y='SalePrice', ylim=(0, 800000))


# Box plot of sale price per overall-quality grade (for spotting outliers).
var = 'OverallQual'
data = train_data[['SalePrice', var]]

f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(data=data, x=var, y='SalePrice')
# `fig` is the object returned by sns.boxplot; clamp its y-range.
fig.axis(ymin=0, ymax=800000);

# Remove outliers: rows with a very large living area but a low sale price.
is_large_area = train_data['GrLivArea'] > 4000
is_low_price = train_data['SalePrice'] < 300000
outlier_index = train_data.loc[is_large_area & is_low_price].index
train_data.drop(outlier_index, inplace=True)


import numpy as np

# Correlation matrix of the training frame. The frame used everywhere else in
# this file is `train_data` (`df_train` is never defined). Only numeric
# columns are correlated: non-numeric columns make DataFrame.corr() raise on
# pandas >= 2.0.
corrmat = train_data.select_dtypes(include='number').corr()

f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);

# Zoom in on the k variables most correlated with SalePrice.
k = 10  # number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index

# np.corrcoef treats each row as one variable, hence the transpose.
cm = np.corrcoef(train_data[cols].values.T)
print('cm>>', cm)

sns.set(font_scale=1.25)

# Start a fresh figure so the annotated heatmap is not drawn on top of the
# full correlation heatmap above.
plt.figure()
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)

plt.show()


数据清洗:空值的处理方式有三种——填充、删除、不处理

# Count the missing values in each column of `full`.
# NOTE(review): `full` is not defined in this excerpt - presumably the
# combined train/test frame; confirm.
miss = full.isna().sum()

# Columns that have at least one missing value, fewest first.
miss.loc[miss.gt(0)].sort_values()

对 object、int、float 三种类型的列分别填充空值:

        full[col].fillna('None',inplace=True)

        full['LotFrontage'].fillna(np.mean(full['LotFrontage']),inplace=True)

        

上一篇下一篇

猜你喜欢

热点阅读