Python数据分析笔记-09
1.DataFrame的合并
1)未指定基于哪一列合并
>>> import numpy as np
>>> import pandas as pd
>>> frame1=pd.DataFrame({'id':['ball','pencil','pen','mug','ashtray'],'price':[12.33,11.44,33.21,13.23,33.62]})
>>> frame1
id price
0 ball 12.33
1 pencil 11.44
2 pen 33.21
3 mug 13.23
4 ashtray 33.62
>>> frame2=pd.DataFrame({'id':['pencil','pencil','ball','pen'],'color':['white','red','red','black']})
>>> frame2
color id
0 white pencil
1 red pencil
2 red ball
3 black pen
>>> pd.merge(frame1,frame2) #frame1放到前,则依据frame1去合并,字段顺序也是先frame1的 后frame2的,且只保留二者都有的
id price color
0 ball 12.33 red
1 pencil 11.44 white
2 pencil 11.44 red
3 pen 33.21 black
>>> pd.merge(frame2,frame1)
color id price
0 white pencil 11.44
1 red pencil 11.44
2 red ball 12.33
3 black pen 33.21
2)指定基于哪一列进行合并:当两个DataFrame对象有不止一个列名相同的时候,合并的时候需用指定依据哪个列进行合并
>>> import numpy as np
>>> import pandas as pd
>>> frame1=pd.DataFrame({'id':['ball','pencil','pen','mug','ashtray'],'color':['white','red','red','black','green'],'brand':['OMG','ABC','ABC','POD','POD']})
>>> frame1
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
>>> frame2=pd.DataFrame({'id':['pencil','pencil','ball','pen'],'brand':['OMG','POD','ABD','POD']})
>>> frame2
brand id
0 OMG pencil
1 POD pencil
2 ABD ball
3 POD pen
>>> pd.merge(frame1,frame2)
Empty DataFrame
Columns: [brand, color, id]
Index: []
>>> pd.merge(frame1,frame2,on='id')
brand_x color id brand_y
0 OMG white ball ABD
1 ABC red pencil OMG
2 ABC red pencil POD
3 ABC red pen POD
>>> pd.merge(frame1,frame2,on='brand')
brand color id_x id_y
0 OMG white ball pencil
1 POD black mug pencil
2 POD black mug pen
3 POD green ashtray pencil
4 POD green ashtray pen
>>> pd.merge(frame2,frame1,on='id')
brand_x id brand_y color
0 OMG pencil ABC red
1 POD pencil ABC red
2 ABD ball OMG white
3 POD pen ABC red
>>> pd.merge(frame2,frame1,on='brand')
brand id_x color id_y
0 OMG pencil white ball
1 POD pencil black mug
2 POD pencil green ashtray
3 POD pen black mug
4 POD pen green ashtray
3)基于左对标和基于右对标:当两个DataFrame的列名不同时候,用left_on和right_on指定第一个dataframe用来对标的列和第二个dataframe用来对标的列,两列进行对标
>>> frame1
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
>>> frame2
brand id
0 OMG pencil
1 POD pencil
2 ABD ball
3 POD pen
>>> frame2.columns=['brand','sid']
>>> frame2
brand sid
0 OMG pencil
1 POD pencil
2 ABD ball
3 POD pen
>>> pd.merge(frame1,frame2,left_on='id',right_on='sid')
brand_x color id brand_y sid
0 OMG white ball ABD ball
1 ABC red pencil OMG pencil
2 ABC red pencil POD pencil
3 ABC red pen POD pen
4)外链接:相当于把所有列聚合倒一起,还有左链接和右链接
>>> frame2
brand sid
0 OMG pencil
1 POD pencil
2 ABD ball
3 POD pen
>>> frame2.columns=['brand','id']
>>> frame2
brand id
0 OMG pencil
1 POD pencil
2 ABD ball
3 POD pen
>>> pd.merge(frame1,frame2,on='id')
brand_x color id brand_y
0 OMG white ball ABD
1 ABC red pencil OMG
2 ABC red pencil POD
3 ABC red pen POD
>>> pd.merge(frame1,frame2,on='id',how='outer')
brand_x color id brand_y
0 OMG white ball ABD
1 ABC red pencil OMG
2 ABC red pencil POD
3 ABC red pen POD
4 POD black mug NaN
5 POD green ashtray NaN
>>> pd.merge(frame1,frame2,on='id',how='left')
brand_x color id brand_y
0 OMG white ball ABD
1 ABC red pencil OMG
2 ABC red pencil POD
3 ABC red pen POD
4 POD black mug NaN
5 POD green ashtray NaN
>>> pd.merge(frame1,frame2,on='id',how='right')
brand_x color id brand_y
0 OMG white ball ABD
1 ABC red pencil OMG
2 ABC red pencil POD
3 ABC red pen POD
>>> pd.merge(frame1,frame2,on=['id','brand'],how='outer')
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
5 OMG NaN pencil
6 POD NaN pencil
7 ABD NaN ball
8 POD NaN pen
5)根据索引合并
>>> frame1
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
>>> frame2
brand id
0 OMG pencil
1 POD pencil
2 ABD ball
3 POD pen
>>> pd.merge(frame1,frame2,left_index=True,right_index=True)
brand_x color id_x brand_y id_y
0 OMG white ball OMG pencil
1 ABC red pencil POD pencil
2 ABC red pen ABD ball
3 POD black mug POD pen
6)join函数更适合利用所以进行合并索引相同列不一致的dataframe对象
列名相同的报错
>>> frame1
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
>>> frame2
brand id
0 OMG pencil
1 POD pencil
2 ABD ball
3 POD pen
>>> frame1.join(frame2)#由于列名有重复的报错
Traceback (most recent call last):
File "", line 1, in
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py", line 5316, in join
rsuffix=rsuffix, sort=sort)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py", line 5331, in _join_compat
suffixes=(lsuffix, rsuffix), sort=sort)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/reshape/merge.py", line 58, in merge
return op.get_result()
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/reshape/merge.py", line 588, in get_result
rdata.items, rsuf)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/internals.py", line 5026, in items_overlap_with_suffix
to_rename)
ValueError: columns overlap but no suffix specified: Index(['brand', 'id'], dtype='object')
修改frame2的列名后利用join函数进行相同索引不同列名的
>>> frame1
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
>>> frame2
brand id
0 OMG pencil
1 POD pencil
2 ABD ball
3 POD pen
>>> frame2.columns=['brand2','id2']
>>> frame1.join(frame2)
brand color id brand2 id2
0 OMG white ball OMG pencil
1 ABC red pencil POD pencil
2 ABC red pen ABD ball
3 POD black mug POD pen
4 POD green ashtray NaN NaN
>>> frame2.join(frame1)
brand2 id2 brand color id
0 OMG pencil OMG white ball
1 POD pencil ABC red pencil
2 ABD ball ABC red pen
3 POD pen POD black mug
2.拼接
1)numpy的nparray对象进行拼接,利用np.concatenate函数
>>> import numpy as np
>>> array1=([0,1,2],[3,4,5],[6,7,8])
>>> array1
([0, 1, 2], [3, 4, 5], [6, 7, 8])
>>> array2=np.arange(9).reshape(3,3)
>>> array2
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
>>> np.concatenate([array1,array2],axis=1)
array([[0, 1, 2, 0, 1, 2],
[3, 4, 5, 3, 4, 5],
[6, 7, 8, 6, 7, 8]])
>>> np.concatenate([array1,array2],axis=0)
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8],
[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
2)pandas及他的Series和DataFrame等数据结构实现了带编号的轴,pandas的concat函数实现了按照轴进行拼接
Series对象拼接
>>> ser1=pd.Series(np.random.rand(4),index=[1,2,3,4])
>>> ser1
1 0.835800
2 0.421464
3 0.859449
4 0.974251
dtype: float64
>>> ser2=pd.Series(np.random.rand(4),index=[5,6,7,8])
>>> ser2
5 0.296453
6 0.692925
7 0.098937
8 0.324647
dtype: float64
>>> pd.concat([ser1,ser2])#默认按照0轴(行)进行拼接
1 0.835800
2 0.421464
3 0.859449
4 0.974251
5 0.296453
6 0.692925
7 0.098937
8 0.324647
dtype: float64
>>> pd.concat([ser1,ser2],axis=1)
0 1
1 0.835800 NaN
2 0.421464 NaN
3 0.859449 NaN
4 0.974251 NaN
5 NaN 0.296453
6 NaN 0.692925
7 NaN 0.098937
8 NaN 0.324647
>>> ser3=pd.concat([ser1,ser2],axis=1)
>>> ser3
0 1
1 0.835800 NaN
2 0.421464 NaN
3 0.859449 NaN
4 0.974251 NaN
5 NaN 0.296453
6 NaN 0.692925
7 NaN 0.098937
8 NaN 0.324647
>>> ser1
1 0.835800
2 0.421464
3 0.859449
4 0.974251
dtype: float64
>>> pd.concat([ser1,ser3],axis=1,join='inner')
0 0 1
1 0.835800 0.835800 NaN
2 0.421464 0.421464 NaN
3 0.859449 0.859449 NaN
4 0.974251 0.974251 NaN
>>> pd.concat([ser1,ser2],keys=[1,2])#在结果上创建等级
1 1 0.835800
2 0.421464
3 0.859449
4 0.974251
2 5 0.296453
6 0.692925
7 0.098937
8 0.324647
dtype: float64
>>> pd.concat([ser1,ser2],axis=1,keys=[1,2])#在1轴上拼接,生成DDAtaFrame对象
1 2
1 0.835800 NaN
2 0.421464 NaN
3 0.859449 NaN
4 0.974251 NaN
5 NaN 0.296453
6 NaN 0.692925
7 NaN 0.098937
8 NaN 0.324647
DataFrame对象拼接
>>> frame1=pd.DataFrame(np.random.rand(9).reshape(3,3),index=[1,2,3])
>>> frame2=pd.DataFrame(np.random.rand(9).reshape(3,3),index=[4,5,6])
>>> frame1
0 1 2
1 0.687920 0.176828 0.050595
2 0.174212 0.315637 0.780156
3 0.352718 0.613909 0.447205
>>> frame2
0 1 2
4 0.216871 0.389312 0.363118
5 0.253215 0.312054 0.760887
6 0.437853 0.847273 0.300629
>>> pd.concat([frame1,frame2],axis=1)
0 1 2 0 1 2
1 0.687920 0.176828 0.050595 NaN NaN NaN
2 0.174212 0.315637 0.780156 NaN NaN NaN
3 0.352718 0.613909 0.447205 NaN NaN NaN
4 NaN NaN NaN 0.216871 0.389312 0.363118
5 NaN NaN NaN 0.253215 0.312054 0.760887
6 NaN NaN NaN 0.437853 0.847273 0.300629
>>> pd.concat([frame1,frame2])
0 1 2
1 0.687920 0.176828 0.050595
2 0.174212 0.315637 0.780156
3 0.352718 0.613909 0.447205
4 0.216871 0.389312 0.363118
5 0.253215 0.312054 0.760887
6 0.437853 0.847273 0.300629
3.组合
无法通过合并和拼接的办法进行组合数据,两个数据集索引完全或者部分重合
cmbine_first函数可以用来组合Series对象,同时对齐数据
>>> ser1=pd.Series(np.random.rand(5),index=[1,2,3,4,5])
>>> ser1
1 0.428280
2 0.350504
3 0.832672
4 0.746660
5 0.698320
dtype: float64
>>> ser2=pd.Series(np.random.rand(4),index=[2,4,5,6])
>>> ser2
2 0.173355
4 0.668513
5 0.766825
6 0.683407
dtype: float64
>>> ser1.combine_first(ser2)
1 0.428280
2 0.350504
3 0.832672
4 0.746660
5 0.698320
6 0.683407
dtype: float64
>>> ser2.combine_first(ser1)
1 0.428280
2 0.173355
3 0.832672
4 0.668513
5 0.766825
6 0.683407
dtype: float64
>>> ser1
1 0.428280
2 0.350504
3 0.832672
4 0.746660
5 0.698320
dtype: float64
>>> ser2
2 0.173355
4 0.668513
5 0.766825
6 0.683407
dtype: float64
>>> ser1[:3].combine_first(ser2[:3])
1 0.428280
2 0.350504
3 0.832672
4 0.668513
5 0.766825
dtype: float64
4.按等级索引旋转
1)入栈:旋转数据结构把列转换为行
2)出栈:把行转换为列
>>> frame1=pd.DataFrame(np.arange(9).reshape(3,3),index=['white','black','red'],columns=['ball','pen','pencil'])
>>> frame1
ball pen pencil
white 0 1 2
black 3 4 5
red 6 7 8
>>> frame1.stack()
white ball 0
pen 1
pencil 2
black ball 3
pen 4
pencil 5
red ball 6
pen 7
pencil 8
dtype: int64
>>> ser5=frame1.stack()
>>> ser5
white ball 0
pen 1
pencil 2
black ball 3
pen 4
pencil 5
red ball 6
pen 7
pencil 8
dtype: int64
>>> ser5.unstack()
ball pen pencil
white 0 1 2
black 3 4 5
red 6 7 8
>>> ser5.unstack(0)
white black red
ball 0 3 6
pen 1 4 7
pencil 2 5 8
5.从长格式向宽格式转化
1)长格式数据
>>> longframe=pd.DataFrame({'color':['white','white','white','red','red','red','black','black','black'],'item':['ball','pen','mug','ball','pen','mug','ball','pen','mug'],'value':np.random.rand(9)})
>>> longframe
color item value
0 white ball 0.825575
1 white pen 0.595834
2 white mug 0.228163
3 red ball 0.543954
4 red pen 0.150930
5 red mug 0.285771
6 black ball 0.092283
7 black pen 0.961326
8 black mug 0.027581
2)转换成宽格式
>>> wideframe=longframe.pivot('color','item')
>>> wideframe
value
item ball mug pen
color
black 0.092283 0.027581 0.961326
red 0.543954 0.285771 0.150930
white 0.825575 0.228163 0.595834
6.删除
>>> frame1=pd.DataFrame(np.arange(9).reshape(3,3),index=['white','black','red'],columns=['ball','pen','pencil'])
>>> frame1
ball pen pencil
white 0 1 2
black 3 4 5
red 6 7 8
>>> del frame1['ball'] #删除一列
>>> frame1
pen pencil
white 1 2
black 4 5
red 7 8
>>> frame1.drop('white') #删除行
pen pencil
black 4 5
red 7 8
7.删除重复元素
>>> import numpy as np
>>> import pandas as pd
>>> dframe=pd.DataFrame({'color':['white','white','red','red','white'],'value':[2,1,3,3,2]})
>>> dframe
color value
0 white 2
1 white 1
2 red 3
3 red 3
4 white 2
>>> dframe.duplicated()
0 False
1 False
2 False
3 True
4 True
dtype: bool
>>> dframe[dframe.duplicated()]
color value
3 red 3
4 white 2
>>> dframe.drop_duplicates()
color value
0 white 2
1 white 1
2 red 3
8.映射
以下函数都以dict对象作为参数,也就是用能够承载映射的ditct字典对象作为参数
1)replace函数:替换元素
2)map函数:新建一列
3)rename函数:替换索引
用映射替换元素
>>> import numpy as np
>>> import pandas as pd
>>> frame=pd.DataFrame()
>>> frame=pd.DataFrame({'item':['ball','mug','pen','pencil','ashtray'],'color':['white','rosso','verde','black','yellow'],'price':[5.56,4.20,1.30,0.56,2.75]})
>>> frame
color item price
0 white ball 5.56
1 rosso mug 4.20
2 verde pen 1.30
3 black pencil 0.56
4 yellow ashtray 2.75
>>> newcolors={'rosso':'red','verde':'green'}
>>> frame.replace(newcolors)
color item price
0 white ball 5.56
1 red mug 4.20
2 green pen 1.30
3 black pencil 0.56
4 yellow ashtray 2.75
>>> ser=pd.Series([1,3,np.nan,4,6,np.nan,3])
>>> ser
0 1.0
1 3.0
2 NaN
3 4.0
4 6.0
5 NaN
6 3.0
dtype: float64
>>> ser.replace(np.nan,0)
0 1.0
1 3.0
2 0.0
3 4.0
4 6.0
5 0.0
6 3.0
dtype: float64
用映射添加元素
>>> frame=pd.DataFrame({'item':['ball','mug','pen','pencil','ashtray'],'color':['white','red','freen','black','yellow']})
>>> frame
color item
0 white ball
1 red mug
2 freen pen
3 black pencil
4 yellow ashtray
>>> price={'ball':5.56,'mug':4.20,'bottle':1.30,'scissors':3.41,'pen':1.30,'pencil':0.56,'ashtray':2.75}
>>> frame['price']=frame['item'].map(price)
>>> frame
color item price
0 white ball 5.56
1 red mug 4.20
2 freen pen 1.30
3 black pencil 0.56
4 yellow ashtray 2.75
重命名轴索引
>>> frame
color item price
0 white ball 5.56
1 red mug 4.20
2 freen pen 1.30
3 black pencil 0.56
4 yellow ashtray 2.75
>>> reindex={0:'frist',1:'second',2:'third',3:'fourth',4:'fifth'}
>>> frame.rename(reindex)
color item price
frist white ball 5.56
second red mug 4.20
third freen pen 1.30
fourth black pencil 0.56
fifth yellow ashtray 2.75
>>> recolumns={'item':'object','price':'value'}
>>> frame.rename(index=reindex,columns=recolumns)
color object value
frist white ball 5.56
second red mug 4.20
third freen pen 1.30
fourth black pencil 0.56
fifth yellow ashtray 2.75
>>> frame.rename(columns=recolumns)
color object value
0 white ball 5.56
1 red mug 4.20
2 freen pen 1.30
3 black pencil 0.56
4 yellow ashtray 2.75