天池短租数据简单分析

2020-09-09  本文已影响0人  butters001

导入下面代码所需依赖包

import math
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

加载数据

# Read the listings CSV (path is relative to the working directory) into a DataFrame.
source_data = pd.read_csv(filepath_or_buffer='./source_data/listings.csv')
# Preview the first five rows.
source_data.head(n=5)
image.png

每一列的数据类型

# Display the dtype of every column of the loaded DataFrame.
source_data.dtypes
image.png

修改列名 删除两列

# Relabel the columns positionally (the list must match the CSV's column
# count and order), keeping 'id', 'name' and
# 'calculated_host_listings_count' in English.
source_data.columns = [
    'id',
    'name',
    '房主id',
    '房主名',
    '行政区组',
    '行政区名',
    '纬度',
    '经度',
    '房屋类型',
    '价格',
    '最低入住天数',
    '评论数量',
    '最后评论时间',
    '每月评论数',
    'calculated_host_listings_count',
    '可提供天数365',
]

# Remove the two columns that the rest of the analysis does not use.
source_data.drop(columns=['行政区组', 'calculated_host_listings_count'], inplace=True)

按行政区分组 查看每个区的房屋出租数量

# Number of listings per district, indexed by district name.
source_data.groupby('行政区名').size()
image.png

将行政区名统一命名为“某某区”，并对房屋类型进行汉化

# District names that contain exactly one '/' keep only the stripped part
# before it (presumably a "Chinese / English" pair -- confirm against the data).
# The renames are collected first and applied to the district column alone, so
# an identical string in another column is left untouched. Missing values are
# skipped because they have no .split().
district_renames = {}
for neighbourhood in source_data['行政区名'].dropna().unique():
    split_res = neighbourhood.split('/')
    if len(split_res) == 2:
        district_renames[neighbourhood] = split_res[0].strip()

# Assign the result back to the column instead of calling an in-place method
# on a column selection: under pandas Copy-on-Write that pattern modifies a
# temporary object and leaves the DataFrame unchanged.
source_data['行政区名'] = (
    source_data['行政区名']
    .replace(district_renames)
    # Rename every "县" (county) to "区" (district) so all names share one suffix.
    .replace('县', '区', regex=True)
)

# Translate the room-type labels, again only inside their own column.
source_data['房屋类型'] = source_data['房屋类型'].replace({
    'Entire home/apt': '整套房子/公寓',
    'Private room': '私人房间',
    'Shared room': '共享房间',
})

source_data
image.png

再次查看每个区的房屋数量

# Listings per district after the names were normalised; reused by the bar chart.
neighbourhood_group_count = source_data.groupby('行政区名').size()
neighbourhood_group_count
image.png

每个区房源数量绘图

# One bar per district; the bar height and the label above it are both the
# listing count.
district_names = neighbourhood_group_count.index.values
listing_counts = neighbourhood_group_count.values
data = go.Bar(
    name='北京市房源数量分布图',
    x=district_names,
    y=listing_counts,
    text=listing_counts,
    texttemplate='%{text:.3s}',
    textposition='outside',
)
fig = go.Figure(data=data)

# Order the categories by bar total, largest first.
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()
image.png

每个区的房源价格箱形图

# Box plot of listing prices, one box per district, using every row.
fig = go.Figure()
for neighbourhood in source_data['行政区名'].unique():
    in_district = source_data['行政区名'] == neighbourhood
    district_prices = source_data.loc[in_district, '价格'].values
    fig.add_trace(go.Box(y=district_prices, name=neighbourhood))
fig.show()
image.png

剔除异常值,重新绘制箱形图 异常值设置为一万以上

# Keep only the listings priced at or below 10,000.
price_in_10k = source_data.loc[source_data['价格'] <= 10000]

# Redraw the per-district price boxes on the filtered data.
fig = go.Figure()
for neighbourhood in price_in_10k['行政区名'].unique():
    in_district = price_in_10k['行政区名'] == neighbourhood
    district_prices = price_in_10k.loc[in_district, '价格'].values
    fig.add_trace(go.Box(y=district_prices, name=neighbourhood))
fig.show()
image.png

感觉异常值还是多 设置4000以下看看

# Keep only the listings priced at or below 4,000.
price_in_4k = source_data.loc[source_data['价格'] <= 4000]

# Report how many listings fall on each side of the 4,000 threshold.
print('4000以内价格的房屋数量: %s ' % len(price_in_4k))
print('4000以外价格的房屋数量: %s ' % len(source_data.loc[source_data['价格'] > 4000]))

# Redraw the per-district price boxes on the filtered data.
fig = go.Figure()
for neighbourhood in price_in_4k['行政区名'].unique():
    in_district = price_in_4k['行政区名'] == neighbourhood
    district_prices = price_in_4k.loc[in_district, '价格'].values
    fig.add_trace(go.Box(y=district_prices, name=neighbourhood))
fig.show()
image.png

密云、怀柔、延庆的价格怎么这么高……把密云、怀柔单拿出来，与朝阳区对比观察一下

# Summary statistics of the price column for three districts, printed one
# after another.
district_column = source_data['行政区名']
price_column = source_data['价格']
print('密云', price_column[district_column == '密云区'].describe(), '\r\n')
print('朝阳', price_column[district_column == '朝阳区'].describe(), '\r\n')
print('怀柔', price_column[district_column == '怀柔区'].describe())
image.png

查看一下房屋类型占比

# Group by district first, then by room type, and count the non-null listing
# ids in each (district, room type) pair.
groupby_neighbourhood_price = (
    source_data
    .groupby(['行政区名', '房屋类型'])[['id']]
    .count()
)
groupby_neighbourhood_price
image.png

重置索引，并绘制各行政区房屋类型占比饼图

# Move the (district, room type) group keys out of the index so they can be
# filtered as ordinary columns.
groupby_neighbourhood_price.reset_index(inplace=True)
neighbourhoods = groupby_neighbourhood_price['行政区名'].unique()

# Lay the pies out on a grid that is cols_num wide; every size below is
# derived from cols_num so changing it in one place is enough.
cols_num = 3
rows_num = math.ceil(len(neighbourhoods) / cols_num)

# Pie traces require 'domain' subplots. Each cell gets its own spec dict
# rather than one dict aliased across the whole grid.
fig = make_subplots(
    rows_num,
    cols_num,
    specs=[[{'type': 'domain'} for _ in range(cols_num)] for _ in range(rows_num)],
    subplot_titles=list(neighbourhoods),
)

labels = ['整套房子/公寓', '私人房间', '共享房间']

for index, neighbourhood in enumerate(neighbourhoods):
    district_rows = groupby_neighbourhood_price[
        groupby_neighbourhood_price['行政区名'] == neighbourhood
    ]
    values = []
    for label in labels:
        value = district_rows.loc[district_rows['房屋类型'] == label, 'id'].values
        # A district may have no listing of this room type. Check .size:
        # truth-testing an empty NumPy array is deprecated and raises on
        # recent NumPy releases.
        values.append(value[0] if value.size else 0)
    data = go.Pie(labels=labels, values=values, scalegroup=index, name=neighbourhood)
    # NOTE(review): the original carried a leftover note listing "radial" and
    # "tangential" -- presumably candidate text-orientation values; unused here.
    # Fill the grid row by row; plotly's row/col arguments are 1-based.
    row, col = divmod(index, cols_num)
    fig.add_trace(data, row + 1, col + 1)
fig.show()
image.png
上一篇 下一篇

猜你喜欢

热点阅读