Pandas数据操作归纳总结
2019-08-14 本文已影响0人
Byte猫
Pandas建立在NumPy之上,旨在与许多其他第三方库完美地集成在科学计算环境中。它的两个主要数据结构是Series(1维)和DataFrame(2维)。
一、Pandas数据结构:一维数组(Series)
1、创建Series数组
Series是一个一维的数据类型,其中每一个元素都有一个标签。类似于Numpy中元素带标签的数组。其中,标签可以是数字或者字符串。
import numpy as np
import pandas as pd
# 直接创建
a = pd.Series([1, 2, 5, 6, 8])
print(a)
# 用标量创建
b = pd.Series(5, index = [0,1,2,3])
print(b)
# 从numpy数组创建
data1 = np.array(['a','b', 3, 4])
c = pd.Series(data1, index = ['a','b','c','d'])
print(c)
# 从字典创建
data2 = {'a':1, 'b':2, 'c':3}
d = pd.Series(data2, index = ['a','b','c','d'])
print(d)
2、Series数组的索引
Series对象提供了几种不同的索引方法。
(1)枚举索引
枚举索引按照index值进行索引。
index值默认使用下标,下标总是从0开始的,而且索引值总是数字。也可以用关键字为index赋值,关键字既可以是数字,也可以是字符串等。
import numpy as np
import pandas as pd
# 使用下标作为index(默认)
a = pd.Series([1, 2, 5, 6, 8])
print(a.index)
print(a.values) # [1 2 5 6 8]
print(a[4]) # 8
print(a[[1,3]].values) # [2, 6]
# 使用关键字作为index
b = pd.Series([1, 2, 5, 6, 8], index = [1, 2, 'k', 'j', 'k'])
print(b.index)
print(b.values) # [1 2 5 6 8]
print(b[4]) # 优先查找关键字如果没有匹配的关键字则按照下标查
print(b[[2,'j']].values) # [2, 6]
print(b[[2,'k']].values) # [2 5 8]
(2)布尔索引
import numpy as np
import pandas as pd
# 使用布尔索引
a = pd.Series([1, 2, 5, 6, 8], index=['a', 'b', 'c', 'd', 'd'])
print(a.index)
print(a.values) # [1 2 5 6 8]
print(a>3) # False False True True True
print(a[a>3].values) # [5 6 8]
(3)切片索引
切片索引使用的是下标,不会受到index的影响
import numpy as np
import pandas as pd
a = pd.Series([1, 2, 5, 6, 8])
print(a[1:3].values) # [2 5]
b = pd.Series([1, 2, 5, 6, 8], index = [1, 2, 3, 4, 5])
print(b[1:3].values) # [2 5]
二、Pandas数据结构:数据表(Dataframe)
Pandas提供了一种名为Dataframe的二维表结构。可以把DataFrame想象成一个电子表格,它由行名(index)、列名(columns)和数据(values)组成。
1、创建Dataframe数据表
(1)从字典创建
import numpy as np
import pandas as pd
data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}
# 使用默认的下标作为行(index)
df1 = pd.DataFrame(data)
print(df1)
# 使用提供的关键字作为行(index)
df2 = pd.DataFrame(data, index=['A','B','C'])
print(df2)
(2)从Series创建
import numpy as np
import pandas as pd
data = {"a":pd.Series([1,2,3],['A','B','C']),
"b":pd.Series([4,5,6],['A','B','C']),
"c":pd.Series([7,8,9],['A','B','C'])}
df = pd.DataFrame(data)
print(df)
(3)从二维数组直接创建
由二维数组直接创建DataFrame,得到一样形状的结果数据,如果不指定index和columns,两者均返回默认数字格式
import numpy as np
import pandas as pd
data = np.random.rand(9).reshape(3,3)
df = pd.DataFrame(data, index = ['a', 'b', 'c'], columns = ['one','two','three'])
print(df)
2、Dataframe数据表的索引
(1)枚举索引
import numpy as np
import pandas as pd
data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}
df = pd.DataFrame(data, index=['x','y','z'])
print(df)
# 访问某列的某个元素
print(df['a'][0])
print(df['a']['x'])
# 访问某行的某个元素
print(df.T['x'][0])
print(df.T['x']['b'])
# 访问某列的某几个元素
print(df['a'][['x','y','z']].values)
print(df['a'][[0,1,2]].values)
(2)布尔索引
import numpy as np
import pandas as pd
data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}
df = pd.DataFrame(data, index=['x','y','z'])
print(df)
# 单个列的布尔索引
print(df['b']>3)
print(df['b'][df['b']>3])
print(df['b'][df['b']>3].values)
# 整个表的布尔索引
print(df>3)
print(df[df>3])
print(df[df>3].values)
(3)切片索引
import numpy as np
import pandas as pd
data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}
df = pd.DataFrame(data, index=['x','y','z'])
print(df)
# 单个列的切片索引
print(df['b'][0:2])
# 整个表的切片索引
print(df[0:1][0:2])
(4)loc和iloc索引
import numpy as np
import pandas as pd
data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, index=['x','y','z'])
# loc索引
print(df1.loc[1, 'a'])
print(df1.loc[[0,1,2], 'a'])
print(df1.loc[0:2, 'a'])
print(df2.loc['y', 'a'])
print(df2.loc[['x','y','z'], 'a'])
# iloc索引
print(df1.iloc[1, 0])
print(df1.iloc[[0,1,2], 0])
print(df1.iloc[0:3, 0])
print(df2.iloc[1, 0])
print(df2.iloc[[0,1,2], 0])
3、多索引Dataframe数据表
# 创建多索引
midx = pd.MultiIndex(levels=[['Tom', 'Bob', 'Jam'], ['income', 'weight', 'length']],
codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],[0, 1, 2, 0, 1, 2, 0, 1, 2]])
df = pd.DataFrame(index=midx,
columns=['max', 'min'],
data=[[200, 100],[55, 50], [1.5, 1.4], # Tom收入,体重,身高
[400, 300],[65, 60], [1.6, 1.5], # Bob收入,体重,身高
[600, 500],[75, 70], [1.8, 1.7]]) # Jam收入,体重,身高
print(df)
三、Pandas数据 I/O
import numpy as np
import pandas as pd
data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}
df = pd.DataFrame(data)
# 保存和读取csv格式
df.to_csv('01.csv', index=False, encoding='utf-8')
df1 = pd.read_csv('01.csv')
print(df1)
# 保存和读取execl格式,需安装xlrd和openpyxl
df.to_excel('02.xlsx', index=False, encoding='utf-8')
df2 = pd.read_excel('02.xlsx')
print(df2)
# 保存和读取txt格式
df.to_csv('03.txt', sep='@', index=False, encoding='utf-8')
df3 = pd.read_csv('03.txt', sep='@')
print(df3)
# 保存和读取pickling格式
df.to_pickle('04.pkl')
df4 = pd.read_pickle('04.pkl')
print(df4)
四、Series和DataFrame的基本操作
1、Series和DataFrame添加删除元素
(1)Series添加删除元素
import numpy as np
import pandas as pd
s1 = pd.Series([-1, -2, -3], index=['a', 'b', 'c'])
s2 = pd.Series([1, 2, 3, 4], index=['e', 'f', 'c', 'd'])
# 插入元素
s1['d'] = -4
s3 = s1.append(pd.Series([-4], index=['a'])) # verify_integrity=True 标签重复报错
print(s3)
s4 = s1.append(s2)
print(s4)
# 删除元素
s1.drop('d', inplace=False) # 设置为True时修改原Series
print(s1)
s1.pop('d')
print(s1)
(1)DataFrame添加删除元素
import numpy as np
import pandas as pd
df = pd.DataFrame({'one' : pd.Series([-1, -2, -3]),
'two' : pd.Series([1, 2, 3, 4])})
# 添加新列
df['three'] = pd.Series([10,20,30])
df['four'] = df['one']+df['three']
df.loc[:, "five"] = [1, 4, 5, 9]
print(df)
# 添加新行
temp_df = pd.DataFrame([[5, 6], [7, 8]], columns=['one','two'])
df = df.append(temp_df, sort=False, ignore_index=True) # ignore_index=True用于重新生成索引
print(df)
df.loc[df.shape[0]] = [0, 0, 0, 0, 0]
print(df)
# 删除行
df.drop(3, axis=0, inplace=True)
print(df)
# 删除列
df.drop(['four','five'], axis=1, inplace=True)
print(df)
2、Series和DataFrame数据排重
import numpy as np
import pandas as pd
s = pd.Series([-1, -1, -2, -2, -3])
df = pd.DataFrame({'k1':['one','two']*3+['two'],
'k2':[1,1,1,2,1,3,3]})
# 查看是否重复
print(s.duplicated())
print(df.duplicated())
# 删除所有重复
print(s.drop_duplicates())
print(df.drop_duplicates())
# 删除某列的重复值
print(df.drop_duplicates(['k1']))
3、Series和DataFrame数据替换和映射
(1)数据替换
import numpy as np
import pandas as pd
s = pd.Series([-1, -1, -2, -2, -3])
df = pd.DataFrame({'k1':['one','two']*3+['two'],
'k2':[1,1,1,2,1,3,3]})
# Series数据替换
print(s.replace(-3, 999))
print(s.replace([-1,-2], 3))
print(s.replace({-1:'a',-2:'b'}))
# DataFrame数据替换
print(df.replace(1, 999))
print(df.replace([1,2], 3))
print(df.replace({1:'a',2:'b'}))
(2)数据映射
map()函数就是一个数据到一个数据的映射关系,中间没有对数据的增删改
map(function,list)就是将list里面的数据取出来然后用于function
import numpy as np
import pandas as pd
df = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
'Pastrami', 'corned beef', 'Bacon',
'pastrami', 'honey ham', 'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
###################### 方法一 ########################
# 字符串数据才有lower()
lowercased = df['food'].str.lower()
print(lowercased)
# 数据映射
print(lowercased.map(meat_to_animal))
df['animal1']=lowercased.map(meat_to_animal)
print(df)
###################### 方法二 ########################
# df['animal2']=df['food'].map(lambda x : meat_to_animal[x.lower()])
transform = lambda x : meat_to_animal[x.lower()]
df['animal2']=df['food'].map(transform)
print(df)
4、DataFrame行列变换
(1)使用 numpy 实现转置
import numpy as np
import pandas as pd
df = pd.DataFrame([[0, 1, 2],
[3, 4, 5]], columns=['c1', 'c2', 'c3'])
print(df)
df2 = pd.DataFrame(df.values.T, index=df.columns, columns=df.index)
print(df2)
(2)利用 pandas 多级索引实现转置
stack() 会将数据的列"旋转"为行,新的行索引比原来的行索引低了一个级别(0的级别最高)。
unstack() 将数据的行"旋转"为列,默认旋转的是级别最低的索引。
import numpy as np
import pandas as pd
df = pd.DataFrame([[0, 1, 2],
[3, 4, 5]], columns=['c1', 'c2', 'c3'])
print(df)
# 列索引转为行索引
df = df.stack()
print(df)
# 最高级的行索引转为列索引
df = df.unstack(0)
print(df)
# 一步完成转换
print(df.stack().unstack(0))
5、DataFrame合并与拆分
(1)堆叠
concat()能够实现轴向数据集的堆叠
import numpy as np
import pandas as pd
s1 = pd.Series([0, 1])
s2 = pd.Series([2, 3, 4])
s3 = pd.Series([5, 6])
# 纵向堆叠
df1 = pd.concat([s1, s2, s3], ignore_index=True) # 默认axis=0
print(df1)
df2 = pd.concat([s1, s2, s3])
df2.set_axis(range(len(df2)), inplace=True)
print(df2)
# 横向堆叠
df3 = pd.concat([s1, s2, s3], axis=1)
print(df3)
(2)连接
merge()可根据一个或者多个键将不同的DataFrame连接在一起,类似于SQL数据库中的join操作
import numpy as np
import pandas as pd
df1 = pd.DataFrame({'userid': [1,2,3,4], 'level': list('ssab')})
df2 = pd.DataFrame({'userid': [1,3,5], 'age': [16, 22, 34]})
print(pd.merge(df1, df2)) # 默认使用交集(how='inner')作为合并方式
print(pd.merge(df1, df2, how='outer')) # 使用并集作为合并方式
print(pd.merge(df1, df2, how='left')) # 左合并
print(pd.merge(df1, df2, how='right')) # 右合并
print(pd.merge(df1, df2, on=['userid'])) # 指定连接列
(3)拆分
import numpy as np
import pandas as pd
df = pd.DataFrame([['Computer', 'Mac-Dell'],
['Computer', 'Mac-XiaoMi'],
['Computer', 'Mac-HuaWei-9X']], columns=['Type', 'Brands_rank'])
print(df)
# 一列拆多列
df['Brands-Second'] = df.Brands_rank.apply(lambda x: x.split('-')[1])
df['Brands-Third'] = df.Brands_rank.apply(lambda x: x.split('-')[2] if x.count('-') >= 2 else np.nan)
print(df)
# 一行拆多行
df2 = df
df2 = df.Brands_rank.str.split('-', expand=True).stack().to_frame()
df2 = df2.reset_index(level=1, drop=True).rename(columns={0:'Brands'})
print(df[['Type']].join(df2))
6、DataFrame排序
import numpy as np
import pandas as pd
df = pd.DataFrame([[0, 1, 2],
[3, 4, 5]], columns=['c1', 'c2', 'c3'])
############ 列索引排序 #############
# 按行索引升序排序
print(df.sort_index())
# 按行索引降序排序
print(df.sort_index(ascending=False))
############ 行索引排序 #############
# 按列索引升序排序
print(df.sort_index(axis=1))
# 按列索引降序排序
print(df.sort_index(axis=1, ascending=False))
############ 值排序 #################
# 按某列的值升序排序
print(df.sort_values(by='c2'))
# 按某列的值降序排序
print(df.sort_values(by='c2', ascending=False))
# 多列值降序排序
print(df.sort_values(by=['c2', 'c3'], ascending=False))