02房价统计

2022-12-13  本文已影响0人  Jachin111

数据EDA

# 导入库
import pandas as pd
import numpy as np

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")

from scipy.stats import norm
from scipy import stats
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Load the training and test splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Preview the first rows of the training set.
train.head()
image.png
# Dimensions of the training set (the DataFrame's shape attribute).
train.shape
image.png
# Number of missing values per column, sorted by that count.
train.isnull().sum().sort_values()
image.png
# Descriptive statistics of the training set.
train.describe()
image.png

销售价格SalePrice分析

# Summary statistics of the SalePrice column.
train["SalePrice"].describe()
image.png
# Distribution plot of SalePrice.
# NOTE(review): seaborn marks distplot as deprecated (histplot/displot are its
# successors) - confirm against the installed seaborn version.
sns.distplot(train['SalePrice'])
plt.show()
image.png
# Skewness and kurtosis of SalePrice, computed first and then reported.
skewness = train['SalePrice'].skew()
kurtosis = train['SalePrice'].kurt()
print("Skewness(偏度):%f" % skewness)
print("Kurtosis(峰度):%f" % kurtosis)
# image.png
# Relationship between SalePrice and numeric fields: start with GrLivArea.
data = train[["SalePrice","GrLivArea"]]
data.head()
image.png
# Scatter plot of SalePrice against GrLivArea on figure 1.
plt.figure(num=1, figsize=(12, 6))
sns.scatterplot(data=data, x="GrLivArea", y="SalePrice")
plt.show()
image.png
# Scatter plot of SalePrice against TotalBsmtSF on figure 1.
data = train[["SalePrice", "TotalBsmtSF"]]
plt.figure(num=1, figsize=(12, 6))
sns.scatterplot(data=data, x="TotalBsmtSF", y="SalePrice")
plt.show()
image.png
# Relationship between price and categorical fields: frequency of each
# OverallQual value.
train["OverallQual"].value_counts()
image.png
# Box plot of SalePrice grouped by OverallQual.
data = train[["SalePrice", "OverallQual"]]
f, ax = plt.subplots(nrows=1, figsize=(12, 6))
fig = sns.boxplot(data=data, x="OverallQual", y="SalePrice")
# `fig` is the object returned by sns.boxplot; pin its y-range to 0..800000.
fig.axis(ymin=0, ymax=800000)
plt.show()
image.png
# Box plot of SalePrice grouped by YearBuilt (wider canvas than the previous one).
data = train[["SalePrice", "YearBuilt"]]
f, ax = plt.subplots(nrows=1, figsize=(16, 8))
fig = sns.boxplot(data=data, x="YearBuilt", y="SalePrice")
# `fig` is the object returned by sns.boxplot; pin its y-range to 0..800000.
fig.axis(ymin=0, ymax=800000)
plt.show()
image.png

相关性分析

# Overall correlation matrix of the numeric columns.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently inside DataFrame.corr and raises on string data instead.
corrmat = train.select_dtypes(include="number").corr()
corrmat.head()
image.png
# Heatmap of the full correlation matrix, colour scale capped at 0.8.
f, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(data=corrmat, vmax=0.8, square=True, ax=ax)
plt.show()
image.png
# Zoomed correlation matrix: labels of the k rows of corrmat that have the
# largest values in its SalePrice column.
k = 10
corrmat.nlargest(k,"SalePrice")["SalePrice"].index
image.png
# Same selection, this time showing the full rows of corrmat.
k = 10
corrmat.nlargest(k,"SalePrice")
image.png
# Keep the labels of those k rows; they are used as column names below.
cols = corrmat.nlargest(k,"SalePrice").index
cols
image.png
# Correlation coefficients of the selected columns, computed with NumPy on the
# transposed value array (one row per selected column).
cm = np.corrcoef(train[cols].values.T)
# Show the first three rows of the result.
cm[:3]
image.png
# Annotated heatmap of the top-k correlation matrix, labelled with the
# selected column names on both axes.
sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)

plt.show()
image.png
# Pairwise scatter plots of SalePrice and the selected features.
cols = ['SalePrice','OverallQual','GrLivArea','GarageCars','TotalBsmtSF','FullBath','YearBuilt']
# `height` is the current name of the per-facet size argument; the former
# `size` keyword was renamed in seaborn 0.9 and is rejected by newer releases.
sns.pairplot(train[cols],height=2.5)
plt.show()
image.png

缺失值处理

# Missing-value count per column, largest first.
total = train.isnull().sum().sort_values(ascending=False)
total.head()
image.png
# Share of missing values per column (count of nulls divided by the number of
# entries), largest first. The result is a fraction, not scaled by 100.
null_mask = train.isnull()
missing_share = null_mask.sum() / null_mask.count()
percent = missing_share.sort_values(ascending=False)
percent.head()
image.png
# Combine both series side by side into one overview table with the columns
# "Total" and "Percent".
missing_data = pd.concat([total,percent],axis=1,keys=["Total","Percent"])
missing_data.head()
image.png
# Removing missing values: labels of the columns with more than one missing entry.
missing_data[missing_data["Total"]>1].index
image.png
# Drop every column that has more than one missing value. The axis is passed
# by keyword: pandas >= 2.0 no longer accepts it as a positional argument.
train = train.drop(missing_data[missing_data["Total"]>1].index,axis=1)
# Electrical survives the column drop above (at most one gap), so remove the
# affected row(s) instead of the column.
train = train.drop(train.loc[train["Electrical"].isnull()].index)
# Largest remaining per-column null count.
train.isnull().sum().max()
image.png

离群点(outliers)

# Find outliers: standardise SalePrice with StandardScaler.
# The column is converted to a NumPy array before adding the second axis;
# indexing a pandas Series with [:, np.newaxis] is no longer supported by
# current pandas releases.
saleprice_scaled = StandardScaler().fit_transform(train["SalePrice"].values[:,np.newaxis])
saleprice_scaled[:5]
image.png
# Order the standardised prices once, then take the ten smallest and the ten
# largest values.
ordered_scaled = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = ordered_scaled[:10]
high_range = ordered_scaled[-10:]

print(low_range)
print('---------------')
print(high_range)
image.png
# Analysis 1: SalePrice against GrLivArea, y-axis limited to 0..800000.
data = train[["SalePrice", "GrLivArea"]]
data.plot(kind="scatter", x="GrLivArea", y="SalePrice", ylim=(0, 800000))
plt.show()
image.png
# Removing outliers: inspect the two rows with the largest GrLivArea.
train.sort_values(by='GrLivArea',ascending=False)[:2]
image.png
# Remove the rows with Id 1299 and 524 in a single drop (presumably the two
# rows listed by the GrLivArea sort just before this cell).
outlier_ids = [1299, 524]
train = train.drop(train[train['Id'].isin(outlier_ids)].index)
train.head()
image.png
# Analysis 2: SalePrice against TotalBsmtSF, y-axis limited to 0..800000.
data = train[["SalePrice", "TotalBsmtSF"]]
data.plot(kind="scatter", x="TotalBsmtSF", y="SalePrice", ylim=(0, 800000))
plt.show()
image.png

深入理解SalePrice

# Normality check for SalePrice: distribution plot with scipy's `norm` passed
# as the `fit` argument.
sns.distplot(train["SalePrice"],fit=norm)
# Open a new figure so the probability plot does not draw onto the
# distribution plot.
fig = plt.figure()
res = stats.probplot(train["SalePrice"],plot=plt)
image.png
# Log transform: replace SalePrice with its natural logarithm (in place, so
# every later cell sees the transformed values).
train["SalePrice"] = np.log(train["SalePrice"])

# Repeat the normality check on the transformed column.
sns.distplot(train["SalePrice"],fit=norm)
# New figure for the probability plot.
fig = plt.figure()
res = stats.probplot(train["SalePrice"],plot=plt)
image.png
# Normality check for GrLivArea: distribution plot with scipy's `norm` passed
# as the `fit` argument.
sns.distplot(train["GrLivArea"],fit=norm)
# New figure for the probability plot.
fig = plt.figure()
res = stats.probplot(train["GrLivArea"],plot=plt)
image.png
# Apply the log transform: replace GrLivArea with its natural logarithm in place.
train["GrLivArea"] = np.log(train["GrLivArea"])

# Repeat the normality check on the transformed column.
sns.distplot(train["GrLivArea"],fit=norm)
# New figure for the probability plot.
fig = plt.figure()
res = stats.probplot(train["GrLivArea"],plot=plt)
image.png
# Normality check for TotalBsmtSF: distribution plot with scipy's `norm` passed
# as the `fit` argument.
sns.distplot(train["TotalBsmtSF"],fit=norm)
# New figure for the probability plot.
fig = plt.figure()
res = stats.probplot(train["TotalBsmtSF"],plot=plt)
image.png
# Flag column: 1 when TotalBsmtSF > 0, otherwise 0.
train['HasBsmt'] = 0
train.loc[train['TotalBsmtSF']>0,'HasBsmt'] = 1

# Log-transform TotalBsmtSF only on the flagged rows.
# - The logarithm is taken on the flagged subset, so zero entries are never
#   passed to np.log (that would yield -inf and a RuntimeWarning).
# - The column is cast to float first, so the float results are not written
#   into an integer column (incompatible-dtype assignment is deprecated in
#   pandas >= 2.1).
has_bsmt = train['HasBsmt']==1
train['TotalBsmtSF'] = train['TotalBsmtSF'].astype(float)
train.loc[has_bsmt,'TotalBsmtSF'] = np.log(train.loc[has_bsmt,'TotalBsmtSF'])

# Normality check on the transformed, strictly positive values.
data = train[train['TotalBsmtSF']>0]['TotalBsmtSF']
sns.distplot(data,fit=norm)
# New figure for the probability plot.
fig = plt.figure()
res = stats.probplot(data,plot=plt)
image.png

同方差性

# Relationship between 'SalePrice' and 'GrLivArea' as a scatter plot.
plt.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.show()
image.png
# Relationship between 'SalePrice' and 'TotalBsmtSF', restricted to rows whose
# TotalBsmtSF is greater than 0.
data = train[train['TotalBsmtSF'] > 0]

plt.scatter(x=data['TotalBsmtSF'], y=data['SalePrice'])
plt.show()
image.png

生成哑变量

# Replace train with the output of pd.get_dummies (dummy/indicator variables)
# and display the result.
train = pd.get_dummies(train)
train
image.png
上一篇下一篇

猜你喜欢

热点阅读