房价数据转换和清洗2
2018-06-27 本文已影响31人
潇洒坤
1.下载厦门房价信息源文件
下载链接:https://pan.baidu.com/s/16D5hw-XBEQnwtsf4fDJ8xw 密码:e1fg
2.编写代码
1.原来的数据总共有15列:分别为:标题title、价格price、首付downPayment、户型sizeType、面积size、单价unitPrice、朝向orientation、楼层floor、装修decoration、社区community、区域region、学校school、房屋详情houseDetail、核心卖点keySellingPoint、配套设施equipment
2.进行简单的房价预测不需要用到文本识别和语义分析,因此不需要用到title、
keySellingPoint、equipment,根据现实的情况来说因为先有单价才有总房价,
而进行预测的正是单价,所以用不到price、downPayment。观察房屋详情,发现
其中的数据有错误,有的20多层的楼房却显示没有电梯,这不符合高层住房电梯
规定,7层及以上住房必须安装电梯,不符合实际,所有房产有无电梯根据总楼层数判断
3.根据房屋的层数区分低层住宅、多层住宅、小高层住宅、高层住宅等类别
4.根据房屋的建造年代区分5年以内建造、6-10年前建造、11-15年前建造等类别
5.根据房屋所在行政区划分思明、集美、湖里、海沧、翔安、同安6个类别
6.根据房屋所在区域划分思北、莲花、厦大、中山路等类别
import pandas as pd
import re
import time
import json
def getSizeType(df):
def findNumber(reStr,sourceStr):
result_list = re.findall(reStr,sourceStr)
if len(result_list):
return result_list[0]
else:
return 0
sizeType_list = []
for i in range(len(df)):
sizeType = df['sizeType'].iloc[i]
sizeType_dict = dict(
room = findNumber('([0-9]*)室',sizeType),
hall = findNumber('([0-9]*)厅',sizeType),
restroom =findNumber('([0-9]*)卫',sizeType)
)
sizeType_list.append(sizeType_dict)
return pd.DataFrame(sizeType_list,columns=sizeType_list[0].keys())
def getPrice(df):
df1 = df['price'].copy()
for i in range(len(df1)):
df1.iloc[i] = df1.iloc[i].strip("万")
return df1
def getSize(df):
df1 = df['size'].copy()
for i in range(len(df1)):
df1.iloc[i] = df1.iloc[i].strip("平米")
return df1
def getElevator(df):
ele_list = []
for i in range(len(df)):
str1 = df['floor'].iloc[i].split(' ')[1]
allFloor = int(re.findall("共(.*)层",str1)[0])
elevator = 1 if allFloor >= 8 else 0
ele_dict = {'elevator':elevator}
ele_list.append(ele_dict)
df1 = pd.DataFrame(ele_list)
return df1
def getSchool(df):
df1 = df['school'].copy()
for i in range(len(df1)):
df1.iloc[i] = 1 if df['school'].iloc[i] == \
df['school'].iloc[i] else 0
return df1
def getHeightType(df):
df1 = df['floor'].copy()
heightType = ["低层住宅(1-3层)","多层住宅(4-7层)","小高层住宅(8-16层)",\
"中高层住宅(17-25层)","高层住宅(26-40层)","超高层住宅(40层以上)"]
for i in range(len(df1)):
str1 = df1.iloc[i].split(' ')[1]
allFloor = int(re.findall("共(.*)层",str1)[0])
if allFloor < 4:
df1.iloc[i] = heightType[0]
elif allFloor < 8:
df1.iloc[i] = heightType[1]
elif allFloor < 17:
df1.iloc[i] = heightType[2]
elif allFloor < 26:
df1.iloc[i] = heightType[3]
elif allFloor < 41:
df1.iloc[i] = heightType[4]
else:
df1.iloc[i] = heightType[5]
return pd.get_dummies(df1)
def getBuildTime(df):
df1 = df['houseDetail'].copy()
year_now = 2018
for i in range(len(df1)):
details = json.loads(df1.iloc[i])
if '建筑年代' in details:
year_build = int(details['建筑年代'].strip('年'))
else:
year_build = 2010
year_diff = year_now - year_build
if year_diff < 5:
df1.iloc[i] = '5年以内建造'
elif year_diff < 10:
df1.iloc[i] = '6-10年前建造'
elif year_diff < 15:
df1.iloc[i] = '11-15年前建造'
elif year_diff < 20:
df1.iloc[i] = '16-20年前建造'
else:
df1.iloc[i] = '超过20年前建造'
return pd.get_dummies(df1)
def getOrientation(df):
return pd.get_dummies(df['orientation'])
def getHeight(df):
df1 = df['floor'].copy()
for i in range(len(df)):
df1.iloc[i] = df['floor'].iloc[i].split(' ')[0][0]
return pd.get_dummies(df1)
def getDecoration(df):
df1 = df['decoration'].copy()
for i in range(len(df)):
df1.iloc[i] = df['decoration'].iloc[i].strip('修')
return pd.get_dummies(df1)
def getDistrict(df):
df1 = df['region'].copy()
for i in range(len(df)):
df1.iloc[i] = df['region'].iloc[i].split('-')[0]
return pd.get_dummies(df1)
def getRegion(df):
df1 = df['region'].copy()
for i in range(len(df)):
region = df['region'].iloc[i].split('-')[1]
df1.iloc[i] = region.strip('(').strip(')')
return pd.get_dummies(df1)
def cleanFloor(df):
for i in range(len(df)):
if '共' not in df['floor'].loc[i]:
df = df.drop([i])
df = df.reset_index(drop=True)
return df
def cleanSizeType(df):
for i in range(len(df)):
if '室' not in df['sizeType'].loc[i]:
df = df.drop([i])
df = df.reset_index(drop=True)
return df
def cleanCommunity(df):
df = df[df['community'] == df['community']]
df = df.reset_index(drop=True)
return df
def cleanDecoration(df):
for i in range(len(df)):
if df['decoration'].loc[i].strip() == '暂无':
df = df.drop([i])
df = df.reset_index(drop=True)
return df
def cleanOrientation(df):
for i in range(len(df)):
if df['orientation'].loc[i].strip() == '暂无':
df = df.drop([i])
df = df.reset_index(drop=True)
return df
if __name__ == "__main__":
startTime = time.time()
df = pd.read_excel("厦门房价数据(房天下版).xlsx")
df = cleanCommunity(df)
df = cleanFloor(df)
df = cleanSizeType(df)
df = cleanDecoration(df)
df = cleanOrientation(df)
df = df.drop_duplicates().reset_index(drop=True)
#下面几个字段是列数较少的字段
price = getPrice(df)
size = getSize(df)
sizeType = getSizeType(df)
elevator = getElevator(df)
school = getSchool(df)
#下面的字段是通过get_dummies方法产生的9-1矩阵,列数较多
heightType = getHeightType(df)
buildTime = getBuildTime(df)
orientaion = getOrientation(df)
height = getHeight(df)
decoration = getDecoration(df)
district = getDistrict(df)
region = getRegion(df)
df_new = pd.concat([price,size,sizeType,elevator,school,heightType,\
buildTime,orientaion,height,decoration,\
district,region],axis=1)
df_new.to_excel("厦门房价数据处理结果2.xlsx",columns = df_new.iloc[0].keys())
print("数据处理共花费%.2f秒" %(time.time()-startTime))
3.数据处理结果截图
处理结果截图.png
从上图中可以看出房屋分类可以分为81个区域,10个房屋朝向,5个装修程度。产生的新DataFrame为df_new变量,共有21502行,123列。123列中有1列为房价,为需要预测的数据,有122列为输入变量。