编程

Python基础学习13

2019-01-28  本文已影响0人  ericblue

pandas库安装:

pip3 install pandas
Collecting pandas
  Downloading https://files.pythonhosted.org/packages/78/78/50ef81a903eccc4e90e278a143c9a0530f05199f6221d2e1b21025852982/pandas-0.23.4-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (14.6MB)
    100% |████████████████████████████████| 14.7MB 56kB/s
Requirement already satisfied: numpy>=1.9.0 in /Users/.virtualenvs/py3env/lib/python3.6/site-packages (from pandas) (1.15.4)
Collecting pytz>=2011k (from pandas)
  Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='pypi.org', port=443): Read timed out. (read timeout=15)",)': /simple/pytz/
  Downloading https://files.pythonhosted.org/packages/61/28/1d3920e4d1d50b19bc5d24398a7cd85cc7b9a75a490570d5a30c57622d34/pytz-2018.9-py2.py3-none-any.whl (510kB)
    100% |████████████████████████████████| 512kB 43kB/s
Collecting python-dateutil>=2.5.0 (from pandas)
  Downloading https://files.pythonhosted.org/packages/74/68/d87d9b36af36f44254a8d512cbfc48369103a3b9e474be9bdfe536abfc45/python_dateutil-2.7.5-py2.py3-none-any.whl (225kB)
    100% |████████████████████████████████| 235kB 26kB/s
Requirement already satisfied: six>=1.5 in /Users/.virtualenvs/py3env/lib/python3.6/site-packages (from python-dateutil>=2.5.0->pandas) (1.11.0)
Installing collected packages: pytz, python-dateutil, pandas
Successfully installed pandas-0.23.4 python-dateutil-2.7.5 pytz-2018.9

pandas的Series一维数组应用方法

from pandas import Series, DataFrame
import pandas as pd

obj = Series([4, 5, 6, -7])#pandas一维数组定义

print(obj)
#输出结果如下是带索引一组数据
0    4
1    5
2    6
3   -7
dtype: int64

print( obj.index)
#输出结果RangeIndex(start=0, stop=4, step=1)

print ( obj.values)
#输出结果[ 4  5  6 -7]

字典的key必须是可哈希(不可变)的对象,并且key是唯一的:对同一个key重复赋值会覆盖原来的value;列表(['a'])和集合({'b'})是可变对象、不可哈希,因此不能作为字典的key。

obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'c', 'a'])# 定义带自定义索引的Series
print(obj2)
#输出结果如下
d    4
b    7
c   -5
a    3
dtype: int64

obj2['c'] = 6# 可以直接给对应索引给值
print(obj2)
# 输出结果如下
d    4
b    7
c    6
a    3
dtype: int64

print ('f' in obj2)#可查找是否存在此索引
#输出结果False

sdata = {
    'beijing': 35000,
    'shanghai': 71000,
    'guangzhou': 16000,
    'shenzhen': 5000}
obj3 = Series(sdata)#把字典转换为一维数组
print( obj3)
#输出结果如下
beijing      35000
shanghai     71000
guangzhou    16000
shenzhen      5000
dtype: int64

obj3.index = ['bj', 'sh', 'gz', 'sz']# 修改索引(新索引按位置与原数据一一对应,顺序需与原数据一致)
print( obj3)
# 输出结果如下
bj    35000
sh    71000
gz    16000
sz     5000
dtype: int64

pandas的DataFrame多维数组应用方法

from pandas import Series, DataFrame

#字典中添加列表方式定义多维数据表格
data = {'city': ['shanghai', 'shanghai', 'shanghai', 'beijing', 'beijing'],
        'year': [2016, 2017, 2018, 2017, 2018],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}

frame = DataFrame(data)
print(frame)
#输出结果如下
       city  year  pop
0  shanghai  2016  1.5
1  shanghai  2017  1.7
2  shanghai  2018  3.6
3   beijing  2017  2.4
4   beijing  2018  2.9

frame2 = DataFrame(data, columns=['year', 'city', 'pop'])#自定义key值排列顺序
print(frame2)
#输出结果如下
   year      city  pop
0  2016  shanghai  1.5
1  2017  shanghai  1.7
2  2018  shanghai  3.6
3  2017   beijing  2.4
4  2018   beijing  2.9

print(frame2['city'])#提取列值
#输出结果如下
0    shanghai
1    shanghai
2    shanghai
3     beijing
4     beijing
Name: city, dtype: object

print(frame2.year)#提取列值另一种方法
#输出结果如下
0    2016
1    2017
2    2018
3    2017
4    2018
Name: year, dtype: int64

frame2['new'] = 100#新增列
print(frame2)
#输出结果如下
   year      city  pop  new
0  2016  shanghai  1.5  100
1  2017  shanghai  1.7  100
2  2018  shanghai  3.6  100
3  2017   beijing  2.4  100
4  2018   beijing  2.9  100

frame2['cap'] = frame2.city == 'beijing'#带判断新增列
print( frame2)
#输出结果如下
   year      city  pop  new    cap
0  2016  shanghai  1.5  100  False
1  2017  shanghai  1.7  100  False
2  2018  shanghai  3.6  100  False
3  2017   beijing  2.4  100   True
4  2018   beijing  2.9  100   True

#另一种字典中嵌套方式定义多维数据表格
pop = {'beijing': {2008: 1.5, 2009: 2.0},
       'shanghai': {2008: 2.0, 2009: 3.6}
       }

frame3 = DataFrame(pop)
print(frame3)
#输出结果如下
      beijing  shanghai
2008      1.5       2.0
2009      2.0       3.6

print(frame3.T)#列行互换
#输出结果如下
          2008  2009
beijing    1.5   2.0
shanghai   2.0   3.6


obj4 = Series([4.5, 7.2, -5.3, 3.6], index=['b', 'd', 'c', 'a'])
obj5 = obj4.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)#调整索引顺序,并给新增的索引'e'填充默认值0
print(obj5)
#输出结果如下
a    3.6
b    4.5
c   -5.3
d    7.2
e    0.0
dtype: float64

obj6 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print( obj6.reindex(range(6),method='bfill'))#重新索引时填充缺失值:'ffill'用前一个有效值向后填充,'bfill'用后一个有效值向前填充(索引5之后没有值,所以仍为NaN)
#输出结果如下
0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object


from numpy import nan as NA   #引入空值NaN,并命名为NA

data = Series([1, NA, 2])#给空值
print(data.dropna())#删除空值
#输出结果如下
0    1.0
2    2.0
dtype: float64

data2 = DataFrame([[1., 6.5, 3], [1., NA, NA], [NA, NA, NA]
                  ])
data2[4] = NA#给第4列给空值
print(data2)
#输出结果如下
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN

print(data2.dropna(how='all'))#删除整行为空的行
#输出结果如下
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN

print(data2.dropna(axis=1, how='all'))#删除整列为空的列
#输出结果如下
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN

data2.fillna(0)#不带inplace时返回填充后的新对象,data2本身不变
print(data2.fillna(0, inplace=True))#填充缺失值为0;inplace=True表示直接修改data2本身,此时返回值为None
#输出结果None
print(data2)#更新结果后输出被修改
#输出结果如下
     0    1    2    4
0  1.0  6.5  3.0  0.0
1  1.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0

层次化索引

import numpy as np

#建立两层索引
data3 = Series(np.random.randn(10),
               index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                      [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
print (data3)
#输出结果如下
a  1   -0.606962
   2   -0.793390
   3    0.515835
b  1   -0.269941
   2   -0.613685
   3   -0.078791
c  1    1.622026
   2   -0.342152
d  2   -0.331359
   3    0.719142
dtype: float64

print ( data3['b':'c'])#取索引对应值
#输出结果如下(注:np.random.randn每次运行生成的随机数不同,此处输出来自另一次运行,所以数值与上面打印的data3不一致)
b  1    0.024265
   2    0.140279
   3    1.465150
c  1   -1.049863
   2    1.673730
dtype: float64

print(data3.unstack())#一维层次化索引转换为二维dataframe数组
#输出结果如下(随机数据来自另一次运行,数值与上面打印的data3不一致;原数据中不存在的组合(c,3)、(d,1)以NaN填充)
          1         2         3
a  0.052463 -0.868392  0.387425
b  0.041187  0.116177 -0.395136
c  0.585591 -0.465362       NaN
d       NaN  0.586438 -0.140192

print(data3.unstack().stack())#还原一维层次化索引
#输出结果如下
a  1    0.052463
   2   -0.868392
   3    0.387425
b  1    0.041187
   2    0.116177
   3   -0.395136
c  1    0.585591
   2   -0.465362
d  2    0.586438
   3   -0.140192
dtype: float64



上一篇下一篇

猜你喜欢

热点阅读