我爱编程

26 比较咖啡店各类饮品的数量与热量

2018-07-24  本文已影响35人  夏威夷的芒果
image.png image.png series数据的结构 dataframe 分组操作

数据源和格式

pandas 数据源下载地址:https://video.mugglecode.com/data_pd

image.png

本次分析只关心 A 列(Beverage_category,饮品类别)和 D 列(Calories,热量)。

代码

import matplotlib.pyplot as plt
import pandas as pd
import os

# Compare the calories of the beverage categories on a coffee-shop menu.

file_path = '/Users/miraco/PycharmProjects/DataMining/data_pd/coffee_menu.csv'
outpath = './coffee_stat/ouptput'

# os.makedirs (unlike os.mkdir) also creates any missing intermediate
# directories. exist_ok=True turns the call into a no-op when the directory is
# already present, which avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(outpath, exist_ok=True)

def collect_data():
    """Load the CSV file located at ``file_path``.

    Returns:
        The parsed file as a two-dimensional pandas DataFrame.
    """
    return pd.read_csv(file_path)
def inspect_data(data_df):
    """Print a quick overview of *data_df* to stdout.

    The overview consists of the row/column counts, a preview of the first
    rows, the ``info()`` report and the ``describe()`` statistics.

    Args:
        data_df: The pandas DataFrame to inspect.
    """
    separator = '-----------------------------------------------------'

    print(f'数据一共有{data_df.shape[0]}行, {data_df.shape[1]}列')
    print(separator)
    print('数据预览:')
    # head() keeps the preview short by showing only the first few rows.
    print(data_df.head())
    print(separator)
    print('数据的基本信息:')
    # info() reports the dtype of every column; string columns (and, per the
    # original author's note, columns with noisy values) show up as ``object``.
    # It writes the report to stdout itself and returns None, so it must not be
    # wrapped in print() -- that would emit a stray "None" line.
    data_df.info()
    print(separator)
    print('数据统计信息')
    # describe() summarises mean, min, max, quartiles, etc.
    print(data_df.describe())
    print(separator)

def analyze_data(data_df):
    """Summarise the calories of each beverage category.

    Prints the distinct values of the ``Beverage_category`` column, then
    groups the rows by that column and aggregates ``Calories``.

    Args:
        data_df: DataFrame with ``Beverage_category`` and ``Calories`` columns.

    Returns:
        A ``(cate_count, cate_mean_cal)`` tuple of Series indexed by category:
        the number of ``Calories`` entries and their mean for each category.
    """
    # Selecting a single column yields a Series; unique() de-duplicates it.
    unique_categories = data_df['Beverage_category'].unique()
    print('饮品类别')
    print(unique_categories)
    print('-----------------------------------------------------')

    # Pick the calories column of the group-by object once and reuse it for
    # both aggregations.
    calories_by_category = data_df.groupby('Beverage_category')['Calories']
    return calories_by_category.count(), calories_by_category.mean()


def _save_bar_chart(series, title, png_name):
    """Draw *series* as a bar chart on its own figure, save it, then show it."""
    # A dedicated figure per chart: relying on the implicit current figure
    # makes the next chart draw on top of this one whenever plt.show() does
    # not discard the figure (e.g. with a non-interactive backend).
    fig, ax = plt.subplots()
    series.plot(kind='bar', ax=ax)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(os.path.join(outpath, png_name))
    plt.show()
    # Release the figure so it does not linger as the current figure.
    plt.close(fig)


def save_and_show_results(cate_count, cate_mean_cal):
    """Write both statistics to CSV and render each one as a bar chart.

    All files are written into the ``outpath`` directory:
    ``cate_count.csv``, ``cate_mean_cal.csv``, ``category_count.png`` and
    ``category_ave_cals.png``. Each chart is also displayed with
    ``plt.show()``.

    Args:
        cate_count: Series with the number of entries per category.
        cate_mean_cal: Series with the mean calories per category.
    """
    cate_count.to_csv(os.path.join(outpath, 'cate_count.csv'))
    cate_mean_cal.to_csv(os.path.join(outpath, 'cate_mean_cal.csv'))

    _save_bar_chart(cate_count, 'Category Count', 'category_count.png')
    _save_bar_chart(cate_mean_cal, 'Category Average Cals',
                    'category_ave_cals.png')

def main():
    """Run the pipeline: load the data, inspect it, analyse it, report it."""
    menu_df = collect_data()

    # Print an overview of the loaded data before analysing it.
    inspect_data(menu_df)

    # analyze_data returns (cate_count, cate_mean_cal), which is exactly the
    # positional argument order save_and_show_results expects.
    save_and_show_results(*analyze_data(menu_df))


if __name__ == '__main__':
    main()


运行结果

数据一共有242行, 18列
-----------------------------------------------------
数据预览:
         Beverage_category      ...      Caffeine (mg)
0                   Coffee      ...                175
1                   Coffee      ...                260
2                   Coffee      ...                330
3                   Coffee      ...                410
4  Classic Espresso Drinks      ...                 75

[5 rows x 18 columns]
-----------------------------------------------------
数据的基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 18 columns):
Beverage_category            242 non-null object
Beverage                     242 non-null object
Beverage_prep                242 non-null object
Calories                     242 non-null int64
 Total Fat (g)               242 non-null object
Trans Fat (g)                242 non-null float64
Saturated Fat (g)            242 non-null float64
 Sodium (mg)                 242 non-null int64
 Total Carbohydrates (g)     242 non-null int64
Cholesterol (mg)             242 non-null int64
 Dietary Fibre (g)           242 non-null int64
 Sugars (g)                  242 non-null int64
 Protein (g)                 242 non-null float64
Vitamin A (% DV)             242 non-null object
Vitamin C (% DV)             242 non-null object
 Calcium (% DV)              242 non-null object
Iron (% DV)                  242 non-null object
Caffeine (mg)                241 non-null object
dtypes: float64(3), int64(6), object(9)
memory usage: 34.1+ KB
None
-----------------------------------------------------
数据统计信息
         Calories  Trans Fat (g)       ...         Sugars (g)   Protein (g) 
count  242.000000      242.000000      ...         242.000000     242.000000
mean   193.871901        1.307025      ...          32.962810       6.978512
std    102.863303        1.640259      ...          19.730199       4.871659
min      0.000000        0.000000      ...           0.000000       0.000000
25%    120.000000        0.100000      ...          18.000000       3.000000
50%    185.000000        0.500000      ...          32.000000       6.000000
75%    260.000000        2.000000      ...          43.750000      10.000000
max    510.000000        9.000000      ...          84.000000      20.000000

[8 rows x 9 columns]
-----------------------------------------------------
饮品类别
['Coffee' 'Classic Espresso Drinks' 'Signature Espresso Drinks'
 'Tazo® Tea Drinks' 'Shaken Iced Beverages' 'Smoothies'
 'Frappuccino® Blended Coffee' 'Frappuccino® Light Blended Coffee'
 'Frappuccino® Blended Crème']
-----------------------------------------------------
可以看到,pandas 画出的柱状图竟然会自动为各个柱子配色。

练习

使用柱状图可视化 PM2.5数值

  1. year: 年,2013-2015
  2. month: 月,1-12
  3. day: 日,1-31
  4. hour: 小时,0-23
  5. season:季度,1-4
  6. PM_China: 中国环保部检测的PM2.5值
  7. PM_US: 美国使馆检测的PM2.5值
# -*- coding: utf-8 -*-

import pandas as pd
import matplotlib.pyplot as plt
import os

# Input CSV with the Beijing PM2.5 readings.
file_path = '/Users/miraco/PycharmProjects/DataMining/data_pd/Beijing_PM.csv'
# NOTE(review): the output directory name is carried over unchanged from the
# coffee example -- confirm whether the PM results belong somewhere else.
outpath = './coffee_stat/ouptput'

# os.makedirs (unlike os.mkdir) also creates any missing intermediate
# directories; exist_ok=True makes it a no-op when the directory already
# exists, avoiding a separate check-then-create step.
os.makedirs(outpath, exist_ok=True)

# 1. Read the CSV data file.
data_df = pd.read_csv(file_path)

# 2. Show basic information about the data.
print('数据预览:')
print(data_df.head())

print('数据文件的基本信息:')
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
data_df.info()

print('数据内容的统计信息:')
print(data_df.describe())

# 3. Group by the year column and average the PM_China readings.
year_average_pm = data_df.groupby('year')['PM_China'].mean()

# 4. Save the result.
year_average_pm.to_csv(os.path.join(outpath, 'year_average_pm.csv'))

# 5. Draw the bar chart.
year_average_pm.plot(kind='bar')
plt.tight_layout()
plt.show()

需要注意的是,数据中有些列(例如 PM_China)含有缺失值 NaN,`mean()` 等统计函数在计算时会自动跳过这些缺失值。

运行结果

数据预览:
   year  month  day  hour  season  PM_China  PM_US
0  2013      1    1     0       4       NaN   31.0
1  2013      1    1     1       4       NaN   32.0
2  2013      1    1     2       4       NaN   21.0
3  2013      1    1     3       4       NaN   16.0
4  2013      1    1     4       4       NaN   15.0
数据文件的基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26280 entries, 0 to 26279
Data columns (total 7 columns):
year        26280 non-null int64
month       26280 non-null int64
day         26280 non-null int64
hour        26280 non-null int64
season      26280 non-null int64
PM_China    20508 non-null float64
PM_US       25970 non-null float64
dtypes: float64(2), int64(5)
memory usage: 1.4 MB
None
数据内容的统计信息:
               year         month      ...           PM_China         PM_US
count  26280.000000  26280.000000      ...       20508.000000  25970.000000
mean    2014.000000      6.526027      ...          92.560806     94.094686
std        0.816512      3.447917      ...          88.027434     93.806554
min     2013.000000      1.000000      ...           3.000000      1.000000
25%     2013.000000      4.000000      ...          28.000000     27.000000
50%     2014.000000      7.000000      ...          68.000000     66.000000
75%     2015.000000     10.000000      ...         127.000000    126.000000
max     2015.000000     12.000000      ...         672.000000    886.000000

[8 rows x 7 columns]

这张柱状图的各个柱子同样是自动配色的。

关键语句

# Read the CSV data file
data_df = pd.read_csv('/Users/miraco/PycharmProjects/DataMining/data_pd/Beijing_PM.csv')
# Group by the year column and take the mean of PM_China for each year
year_average_pm = data_df.groupby('year')['PM_China'].mean()
# Save the result as a CSV file inside outpath
year_average_pm.to_csv(os.path.join(outpath,'year_average_pm.csv'))
# Draw the result as a bar chart
year_average_pm.plot(kind='bar')
上一篇 下一篇

猜你喜欢

热点阅读