Pandas之DataFrame练习笔记

2016-11-15 本文已影响813人 treelake

在 Matplotlib 和 NumPy 之外最流行的数据科学库就是 Pandas了。
在线尝试 Pandas Tutorial: DataFrames in Python。
最好的入门是10 Minutes to pandas，Pandas让数据操作更为便捷，从读取到保存都十分的简洁。比如看

读取文件

读取CSV

import pandas as pd
pd.read_csv('myDataFrame.csv')

读取Excel

pd.read_excel('myDataFrame.xlsx', 'Sheet1', index_col=None, na_values=['NA'])

存储到文件

保存为CSV

df.to_csv('myDataFrame.csv', sep='\t', encoding='utf-8')

保存为Excel

df.to_excel('myDataFrame.xlsx', sheet_name='Sheet1')

Pandas logo

引入库

import pandas as pd
import numpy as np

numpy重温

np.ndarray是实际的数据类型，而np.array()是一个从其他数据结构创建numpy数组的函数。
结构化数组允许用户通过命名字段操作数据：在下面的示例中，创建了一个包含两个元组的结构化数组。每个元组的第一个元素称为“x”，为浮点类型，而第二个元素被命名为“y”，并且设置为整型。
此外，利用record array我们还可以对数组中的字段进行属性访问。

x = np.array([(1.0, 2), (3.0, 4)], dtype=[('x', float), ('y', int)])
# array([(1.0, 2), (3.0, 4)], 
#       dtype=[('x', '<f8'), ('y', '<i4')])
type(x)
# numpy.ndarray
x['x']
# array([ 1.,  3.])

# View the array as a record array:
x = x.view(np.recarray)
x.x
# array([ 1.,  3.])
x.y
# array([2, 4])

生成Pandas DataFrame

对于DataFrame我们可以自由地指定行索引(index )名称和列(column)名。行索引名指示行的差异，列名指示列的差异。

# 从np.array 转换为 pd.DataFrame
data = np.array([['','Col1','Col2'],
                ['Row1',1,2],
                ['Row2',3,4]])

# array([['',  'Col1', 'Col2'],
#        ['Row1', '1',  '2'],
#        ['Row2', '3',  '4']], 
#       dtype='<U4')

pd.DataFrame(data=data[1:,1:],
             index=data[1:,0],
             columns=data[0,1:])

#      Col1 Col2
# Row1    1    2
# Row2    3    4

### 此外还有多种方式

# 直接采用字典做参数
my_dict = {1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}
print(pd.DataFrame(my_dict))
#    1  2  3
# 0  1  1  2
# 1  3  2  4

# 用一个DataFrame做输入参数
my_df = pd.DataFrame(data=[4,5,6,7], index=range(0,4), columns=['A'])
print(pd.DataFrame(my_df))
#    A
# 0  4
# 1  5
# 2  6
# 3  7

# 输入一个Series
my_series = pd.Series({"United Kingdom":"London", "India":"New Delhi", "United States":"Washington", "Belgium":"Brussels"})
print(pd.DataFrame(my_series))
#                          0
# Belgium           Brussels
# India            New Delhi
# United Kingdom      London
# United States   Washington

如何选取元素

a = np.array([[''  ,'A','B','C'],
              ['x' , 1 , 2 , 3],
              ['y' , 4 , 5 , 6],
              ['z' , 7 , 8 , 9]])

df = pd.DataFrame(data=a[1:,1:], index=a[1:,0], columns=a[0,1:])

#    A  B  C
# x  1  2  3
# y  4  5  6
# z  7  8  9

# 选取index='x', column='A'的值为1的元素，
df.iloc[0][0]
df.loc['x']['A']
df.at['x','A']
df.iat[0,0]
df.get_value('x', 'A')

# 选取第一行
df.iloc[0]
# 选取A列
df.loc[:,'A']
df['A']

# 如果不指定行名，则有所区别
df2 = pd.DataFrame(data=a[1:,1:], columns=a[0,1:])
#    A  B  C
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

# 以字符串索引的改为以数字索引
df2.loc[0]['A']
df2.at[0,'A']
df2.get_value(0, 'A')

# 此外 df.ix() 的选取方式更为灵活

元素替换

df = pd.DataFrame({'Student1':['OK', 'Awful', 'Acceptable'],
              'Student2':['Perfect', 'Awful', 'OK'],
              'Student3':['Acceptable', 'Perfect', 'Poor']})
#      Student1 Student2    Student3
# 0          OK  Perfect  Acceptable
# 1       Awful    Awful     Perfect
# 2  Acceptable       OK        Poor

df.replace(['Awful', 'Poor', 'OK', 'Acceptable', 'Perfect'], [0, 30, 60, 80, 100])
#    Student1  Student2  Student3
# 0        60       100        80
# 1         0         0       100
# 2        80        60        30

行列名重命名

a = np.array([[''  ,'A','B','C'],
              ['x' , 1 , 2 , 3],
              ['y' , 4 , 5 , 6],
              ['z' , 7 , 8 , 9]])

df = pd.DataFrame(data=a[1:,1:], index=a[1:,0], columns=a[0,1:])

# 定义旧名称到新名称的映射字典
newcols = {
    'A': 'new_column_1', 
    'B': 'new_column_2', 
    'C': 'new_column_3'
}

# 使用rename()函数重命名列
df.rename(columns=newcols, inplace=True)

# 重命名行标
df.rename(index={'x': 'a'})

#   new_column_1 new_column_2 new_column_3
# a            1            2            3
# y            4            5            6
# z            7            8            9

删除行

df
#    A  B  C
# x  1  2  3
# y  4  5  6
# z  7  8  9

df.drop(df.index[1])

#    A  B  C
# x  1  2  3
# z  7  8  9

删除列

df
#    A  B  C
# x  1  2  3
# y  4  5  6
# z  7  8  9

# Drop the column with label 'A'                  
df.drop('A', axis=1, inplace=True)

#    B  C
# x  2  3
# y  5  6
# z  8  9

# Drop the column at position 1
df.drop(df.columns[[1]], axis=1)

#    B
# x  2
# y  5
# z  8

去重行

#          48  49  50  50
#    2.5    1   2   3   4
#    12.6   4   5   6   5
#    4.8    7   8   9   6
#    4.8   23  50  60   7
#    2.5   23  35  37  23

df.drop_duplicates([48], keep='last')

#          48  49  50  50
#    2.5    1   2   3   4
#    12.6   4   5   6   5
#    4.8    7   8   9   6
#    2.5   23  35  37  23

去重行二

df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [40, 50, 60], [23, 35, 37]]), 
                  index= [2.5, 12.6, 4.8, 4.8, 2.5], 
                  columns=[48, 49, 50])
#       48  49  50
# 2.5    1   2   3
# 12.6   4   5   6
# 4.8    7   8   9
# 4.8   40  50  60
# 2.5   23  35  37

df.reset_index().drop_duplicates(subset='index', keep='last').set_index('index')
#        48  49  50
# index            
# 12.6    4   5   6
# 4.8    40  50  60
# 2.5    23  35  37

按行迭代处理

a = np.array([[''  ,'A','B','C'],
              ['x' , 1 , 2 , 3],
              ['y' , 4 , 5 , 6],
              ['z' , 7 , 8 , 9]])

df = pd.DataFrame(data=a[1:,1:], index=a[1:,0], columns=a[0,1:])

for index, row in df.iterrows() :
    print(row['A'], row['B'])
# 1 2
# 4 5
# 7 8

将函数应用于行或列

doubler = lambda x: x*2
df['A'].apply(doubler)
# x    11
# y    44
# z    77
# Name: A, dtype: object

df['A'].map({'1':'xx', '4':8, '7':(0,1)})
# x        xx
# y         8
# z    (0, 1)
# Name: A, dtype: object

df.applymap(doubler)
#     A   B   C
# x  11  22  33
# y  44  55  66
# z  77  88  99

数据的重新组织

products = pd.DataFrame({'category': ['Cleaning', 'Cleaning', 'Entertainment', 'Entertainment', 'Tech', 'Tech'],
                         'store': ['Walmart', 'Dia', 'Walmart', 'Fnac', 'Dia','Walmart'],
                         'price':[11.42, 23.50, 19.99, 15.95, 19.99, 111.55],
                         'testscore': [4, 3, 5, 7, 5, 8]})

#         category   price    store  testscore
# 0       Cleaning   11.42  Walmart          4
# 1       Cleaning   23.50      Dia          3
# 2  Entertainment   19.99  Walmart          5
# 3  Entertainment   15.95     Fnac          7
# 4           Tech   19.99      Dia          5
# 5           Tech  111.55  Walmart          8

# 选取数据中的category为新纵轴，store为新横轴，显示值price
products.pivot(index='category', columns='store', values='price')
# store            Dia   Fnac  Walmart
# category                            
# Cleaning       23.50    NaN    11.42
# Entertainment    NaN  15.95    19.99
# Tech           19.99    NaN   111.55

# 不指定显示值时将剩余数据都显示出来
products.pivot(index='category', columns='store')
#                price                testscore             
# store            Dia   Fnac Walmart       Dia Fnac Walmart
# category                                                  
# Cleaning       23.50    NaN   11.42       3.0  NaN     4.0
# Entertainment    NaN  15.95   19.99       NaN  7.0     5.0
# Tech           19.99    NaN  111.55       5.0  NaN     8.0

从文件中导入数据时识别日期

import pandas as pd
dateparser = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

# 'columnName'为日期所在列名
pd.read_csv(infile, parse_dates=['columnName'], date_parser=dateparse)

# 或者将两列合并成单个日期时间列
pd.read_csv(infile, parse_dates={'datetime': ['date', 'time']}, date_parser=dateparse)

Pandas之DataFrame练习笔记

读取文件

存储到文件

引入库

numpy重温

生成Pandas DataFrame

如何选取元素

元素替换

行列名重命名

删除行

删除列

去重行

去重行二

按行迭代处理

将函数应用于行或列

数据的重新组织

从文件中导入数据时识别日期

猜你喜欢

热点阅读