程序员Python 运维Python语言与信息数据获取和机器学习

Pandas数据分析包

2017-08-13  本文已影响140人  听城

Pandas是面板数据(Panel Data)的简写。它是Python最强大的数据分析和探索工具,因金融数据分析工具而开发,支持类似SQL的数据增删改查,支持时间序列分析,灵活处理缺失数据。

pandas的数据结构

# -*- coding: utf-8 -*- 

from pandas import Series

print('用数组生成Series')
obj = Series([4, 7, -5, 3])
print(obj)
print(obj.values)
print(obj.index)

print('指定Series的index')
obj2 = Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])
print(obj2)
print(obj2.index)
print(obj2['a'])
obj2['d'] = 6
print(obj2[['c', 'a', 'd']])
print(obj2[obj2 > 0])  # 找出大于0的元素
print('b' in obj2) # 判断索引是否存在
print('e' in obj2)

print('使用字典生成Series')
sdata = {'Ohio':45000, 'Texas':71000, 'Oregon':16000, 'Utah':5000}
obj3 = Series(sdata)
print(obj3)

print('使用字典生成Series,并额外指定index,不匹配部分为NaN。')
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index = states)
print(obj4)

print('Series相加,相同索引部分相加。')
print(obj3 + obj4)

print('指定Series及其索引的名字')
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)

print('替换index')
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)

初始化
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

print('用字典生成DataFrame,key为列的名字。')
data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year':[2000, 2001, 2002, 2001, 2002],
        'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
print(DataFrame(data))
print(DataFrame(data, columns = ['year', 'state', 'pop'])) # 指定列顺序

print('指定索引,在列中指定不存在的列,默认数据用NaN。')
frame2 = DataFrame(data,
                    columns = ['year', 'state', 'pop', 'debt'],
                    index = ['one', 'two', 'three', 'four', 'five'])
print(frame2)
print(frame2['state'])
print(frame2.year)#访问的不同方式
print(frame2.ix['three'])
frame2['debt'] = 16.5 # 修改一整列
print(frame2)
frame2.debt = np.arange(5)  # 用numpy数组修改元素
print(frame2)

print('用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN。')
val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)

print('赋值给新列')
frame2['eastern'] = (frame2.state == 'Ohio')  # 如果state等于Ohio为True
print(frame2)
print(frame2.columns)
print

print('DataFrame转置')
pop = {'Nevada':{2001:2.4, 2002:2.9},
        'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = DataFrame(pop)
print(frame3)
print(frame3.T)

print('指定索引顺序,以及使用切片初始化数据。')
print(DataFrame(pop, index = [2001, 2002, 2003]))
pdata = {'Ohio':frame3['Ohio'][:-1], 'Nevada':frame3['Nevada'][:2]}
print(DataFrame(pdata))

print('指定索引和列的名称')
frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)
print(frame3.values)
print(frame2.values)

import numpy as np
import pandas as pd
import sys
from pandas import Series, DataFrame, Index

print('获取index')
obj = Series(range(3), index = ['a', 'b', 'c'])
index = obj.index
print(index[1:])
try:
    index[1] = 'd'  # index对象read only
except:
    print(sys.exc_info()[0])

print('使用Index对象')
index = Index(np.arange(3))
obj2 = Series([1.5, -2.5, 0], index = index)
print(obj2)
print(obj2.index is index)

print('判断列和索引是否存在')
pop = {'Nevada':{20001:2.4, 2002:2.9},
        'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = DataFrame(pop)
print(frame3)
print('Ohio' in frame3.columns)
print('2003' in frame3.index)

pandas中主要的index对象

index
Index的方法和属性 method1 method2

基本功能 重新索引

• 创建一个适应新索引的新对象,该Series的reindex将会根据新索引进行重排。如果某个索引值当前不存在,就引入缺失值
• 对于时间序列这样的有序数据,重新索引时可能需要做一些插值处理。method选项即可达到此目的。

reindex参数
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import DataFrame, Series

print('重新指定索引及顺序')
obj = Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
print(obj)
obj2 = obj.reindex(['a', 'b', 'd', 'c', 'e'])
print(obj2)
print(obj.reindex(['a', 'b', 'd', 'c', 'e'], fill_value = 0))  # 指定不存在元素的默认值

print('重新指定索引并指定填元素充方法')
obj3 = Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
print(obj3)
#ffill用前一行相同列的数值填充
print(obj3.reindex(range(6), method = 'ffill'))

print('对DataFrame重新指定索引')
frame = DataFrame(np.arange(9).reshape(3, 3),
                  index = ['a', 'c', 'd'],
                  columns = ['Ohio', 'Texas', 'California'])
print(frame)
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)
print

print('重新指定column')
states = ['Texas', 'Utah', 'California']
print(frame.reindex(columns = states))

print('对DataFrame重新指定索引并指定填元素充方法')
print(frame.reindex(index = ['a', 'b', 'c', 'd'],
                    method = 'ffill',
                    columns = states))
print(frame.ix[['a', 'b', 'd', 'c'], states])

丢弃某些项
丢弃某条轴上的一个或多个项很简单,只要有一个索引数组或列表即可。由于需要执行一些数据整理和集合逻辑,所以drop方法返回的是一个在指定轴上删除了指定值的新对象

import numpy as np
from pandas import Series, DataFrame

print('Series根据索引删除元素')
obj = Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
print(new_obj)
print(obj.drop(['d', 'c']))

print('DataFrame删除元素,可指定索引或列。')
data = DataFrame(np.arange(16).reshape((4, 4)),
                  index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                  columns = ['one', 'two', 'three', 'four'])
print(data)
print(data.drop(['Colorado', 'Ohio']))
print(data.drop('two', axis = 1))
print(data.drop(['two', 'four'], axis = 1))

索引、选取、过滤

DataFrame索引
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

print('Series的索引,默认数字索引可以工作。')
obj = Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
print(obj['b'])
print(obj[3])
print(obj[[1, 3]])
print(obj[obj < 2])

print('Series的数组切片')
print(obj['b':'c'] ) # 闭区间
obj['b':'c'] = 5
print(obj)

print('DataFrame的索引')
data = DataFrame(np.arange(16).reshape((4, 4)),
                  index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                  columns = ['one', 'two', 'three', 'four'])
print(data)
print(data['two']) # 打印列
print(data[['three', 'one']])
print(data[:2])
print(data.ix['Colorado', ['two', 'three']]) # 指定索引和列
print(data.ix[['Colorado', 'Utah'], [3, 0, 1]])
print(data.ix[2]) # 打印第2行(从0开始)
print(data.ix[:'Utah', 'two']) # 从开始到Utah,第2列。

print('根据条件选择')
print(data[data.three > 5])
print(data < 5)  # 打印True或者False
data[data < 5] = 0
print(data)

算术运算和数据对齐
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

print('加法')
s1 = Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])
print(s1)
print(s2)
print(s1 + s2)

print('DataFrame加法,索引和列都必须匹配。')
df1 = DataFrame(np.arange(9.).reshape((3, 3)),
                columns = list('bcd'),
                index = ['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12).reshape((4, 3)),
                columns = list('bde'),
                index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
print(df2)
print(df1 + df2)

print('数据填充')
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns = list('abcde'))
print(df1)
print(df2)
print(df1.add(df2, fill_value = 0))
print(df1.reindex(columns = df2.columns, fill_value = 0))

print('DataFrame与Series之间的操作')
arr = np.arange(12.).reshape((3, 4))
print(arr)
print(arr[0])
print(arr - arr[0])
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print(frame)
print(series)
print(frame - series)
series2 = Series(range(3), index = list('bef'))
print(frame + series2)
series3 = frame['d']
print(series3)
print(frame.sub(series3, axis = 0))  # 按列减

函数应用和映射

# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

print('函数')
frame = DataFrame(np.random.randn(4, 3),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print(np.abs(frame))

print('lambda以及应用')
f = lambda x: x.max() - x.min()
#列的最大值减去最小值
print(frame.apply(f))
#行的最大值减去最小值
print(frame.apply(f, axis = 1))
def f(x):
    return Series([x.min(), x.max()], index = ['min', 'max'])
print(frame.apply(f))

print('applymap和map')
_format = lambda x: '%.2f' % x
print(frame.applymap(_format))
print(frame['e'].map(_format))

排序和排名

# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

print('根据索引排序,对于DataFrame可以指定轴。')
obj = Series(range(4), index = ['d', 'a', 'b', 'c'])
print(obj.sort_index())
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index = ['three', 'one'],
                  columns = list('dabc'))
print(frame.sort_index())
print(frame.sort_index(axis = 1))
print(frame.sort_index(axis = 1, ascending = False)) # 降序

print('根据值排序')
obj = Series([4, 7, -3, 2])
print(obj.sort_values()) # order已淘汰

print('DataFrame指定列排序')
frame = DataFrame({'b':[4, 7, -3, 2], 'a':[0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by = 'b')) # sort_index(by = ...)已淘汰
print(frame.sort_values(by = ['a', 'b']))

print('rank,求排名的平均位置(从1开始)')
obj = Series([7, -5, 7, 4, 2, 0, 4])
# 对应排名:-5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
#rank 7为(6+7)/2
print(obj.rank())
print(obj.rank(method = 'first'))  # 去第一次出现,不求平均值。
print(obj.rank(ascending = False, method = 'max')) # 逆序,并取最大值。所以-5的rank是7.
frame = DataFrame({'b':[4.3, 7, -3, 2],
                  'a':[0, 1, 0, 1],
                  'c':[-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis = 1))
print('重复的索引')
obj = Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])
print(obj.index.is_unique) # 判断是非有重复索引
print(obj['a'].ix[0])
print(obj.b.ix[1])
df = DataFrame(np.random.randn(4, 3), index = ['a', 'a', 'b', 'b'])
print(df)
print(df.ix['b'].ix[0])
print(df.ix['b'].ix[1])

统计方法

pandas 对象有一些统计方法。它们大部分都属于约简和汇总统计,用于从 Series 中提取单个值,或从 DataFrame 的行或列中提取一个 Series。
比如 DataFrame.mean(axis=0,skipna=True) 方法,当数据集中存在 NA 值时,这些值会被简单跳过,除非整个切片(行或列)全是 NA,如果不想这样,则可以通过 skipna=False 来禁用此功能:

常用方法选项 常用描述和汇总统计函数1 常用描述和汇总统计函数2
import numpy as np
from pandas import Series, DataFrame

print('求和')
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
              index = ['a', 'b', 'c', 'd'],
              columns = ['one', 'two'])
print(df)
print(df.sum())  # 按列求和
print(df.sum(axis = 1))  # 按行求和

print('平均数')
print(df.mean(axis = 1, skipna = False))
print(df.mean(axis = 1))

print('其它')
print(df.idxmax())
print(df.cumsum())
print(df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())

相关系数与协方差

pandas的数据处理常用方法总结

Series和DataFrame排序

Series排序

重命名DataFrame的Index

DataFrame的Merge操作

pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False)
Docstring:
Merge DataFrame objects by performing a database-style join operation by
columns or indexes.

If joining columns on columns, the DataFrame indexes *will be
ignored*. Otherwise if joining indexes on indexes or indexes on a column or
columns, the index will be passed on.

Parameters
----------
left : DataFrame
right : DataFrame
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
    * left: use only keys from left frame (SQL: left outer join)
    * right: use only keys from right frame (SQL: right outer join)
    * outer: use union of keys from both frames (SQL: full outer join)
    * inner: use intersection of keys from both frames (SQL: inner join)
on : label or list
    Field names to join on. Must be found in both DataFrames. If on is
    None and not merging on indexes, then it merges on the intersection of
    the columns by default.
left_on : label or list, or array-like
    Field names to join on in left DataFrame. Can be a vector or list of
    vectors of the length of the DataFrame to use a particular vector as
    the join key instead of columns
right_on : label or list, or array-like
    Field names to join on in right DataFrame or vector/list of vectors per
    left_on docs
left_index : boolean, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels
right_index : boolean, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index
sort : boolean, default False
    Sort the join keys lexicographically in the result DataFrame
suffixes : 2-length sequence (tuple, list, ...)
    Suffix to apply to overlapping column names in the left and right
    side, respectively
copy : boolean, default True
    If False, do not copy data unnecessarily
indicator : boolean or string, default False
    If True, adds a column to output DataFrame called "_merge" with
    information on the source of each row.
    If string, column with information on source of each row will be added to
    output DataFrame, and column will be named value of string.
    Information column is Categorical-type and takes on a value of "left_only"
    for observations whose merge key only appears in 'left' DataFrame,
    "right_only" for observations whose merge key only appears in 'right'
    DataFrame, and "both" if the observation's merge key is found in both.

    .. versionadded:: 0.17.0

Examples
--------

>>> A              >>> B
    lkey value         rkey value
0   foo  1         0   foo  5
1   bar  2         1   bar  6
2   baz  3         2   qux  7
3   foo  4         3   bar  8

>>> A.merge(B, left_on='lkey', right_on='rkey', how='outer')
   lkey  value_x  rkey  value_y
0  foo   1        foo   5
1  foo   4        foo   5
2  bar   2        bar   6
3  bar   2        bar   8
4  baz   3        NaN   NaN
5  NaN   NaN      qux   7

Returns
-------
merged : DataFrame
    The output type will the be same as 'left', if it is a subclass
    of DataFrame.

Concatenate和Combine

np.concatenate(arr1,arr2)#默认是竖着增加,axis=1时横着增加,即增加列
combine_first,它实现既不是行之间的连接,也不是列之间的连接,它在修正数据,用一个DataFrame来填补前面的DataFrame中NAN的数据
Merge, join, and concatenate官方文档说明:http://pandas.pydata.org/pandas-docs/stable/merging.html

通过apply进行数据预处理

df['A'] = df['A'].apply(str.upper)

通过去重进行数据清洗

查看一列唯一值:df['A'].unique()
查看是否有重复:df['A'].duplicated()
删除重复数据:df.drop_duplicated(['A'])

时间序列

pd.date_range(start=None, end=None, periods=None, freq='D', tz=None, normalize=False, name=None, closed=None, **kwargs)
Docstring:
Return a fixed frequency datetime index, with day (calendar) as the default
frequency

Parameters
----------
start : string or datetime-like, default None
    Left bound for generating dates
end : string or datetime-like, default None
    Right bound for generating dates
periods : integer or None, default None
    If None, must specify start and end
freq : string or DateOffset, default 'D' (calendar daily)
    Frequency strings can have multiples, e.g. '5H'
tz : string or None
    Time zone name for returning localized DatetimeIndex, for example
Asia/Hong_Kong
normalize : bool, default False
    Normalize start/end dates to midnight before generating date range
name : str, default None
    Name of the resulting index
closed : string or None, default None
    Make the interval closed with respect to the given frequency to
    the 'left', 'right', or both sides (None)

Pandas中的resample,重新采样,是对原样本重新处理的一个方法,是一个对常规时间序列数据重新采样和频率转换的便捷的方法。
参照:http://blog.csdn.net/wangshuang1631/article/details/52314944

上一篇 下一篇

猜你喜欢

热点阅读