8. 日月光华 Python数据分析-Pandas-合并数据集

2023-07-08  本文已影响0人  薛东弗斯
import pandas as pd
import numpy as np

data1 = pd.DataFrame({'one':['a','b','a','a','c','b'],'two':range(6)})    # 列明分别为one/two 键值即是列名
data2 = pd.DataFrame({'one':['a','b','c','d'],'two':range(10,14)})

data1
#   one two
# 0 a   0
# 1 b   1
# 2 a   2
# 3 a   3
# 4 c   4
# 5 b   5

data2
#   one two
# 0 a   10
# 1 b   11
# 2 c   12
# 3 d   13

data = pd.merge(data1, data2, on='one', how='right')  
# pd.merge 列与列之间进行横向合并
# on='one' 以one列为基准,根据one这一列进行合并
# how='right'  以右边的data2为准
# how=’left‘ 以坐标的data1为准
# how=outer  并集,没有的值用NA值不错
# how=inner 交集

data       # how=’right‘,以右边的data2中的one这一列为准。data1对应为准没有的部分以NaN填充
#   one two_x   two_y
# 0 a   0.0 10
# 1 a   2.0 10
# 2 a   3.0 10
# 3 b   1.0 11
# 4 b   5.0 11
# 5 c   4.0 12
# 6 d   NaN 13

data = pd.merge(data1, data2, on='one', how='left')   # 以左边为准,最后只保留data1中one这一列的值
data
# one   two_x   two_y
# 0 a   0   10
# 1 b   1   11
# 2 a   2   10
# 3 a   3   10
# 4 c   4   12
# 5 b   5   11
data1 = pd.DataFrame({'one1':['a','b','a','a','c','b'],'two1':range(6)})
data2 = pd.DataFrame({'one2':['a','b','c','d'],'two2':range(10,14)})

data1
#   one1    two1
# 0 a   0
# 1 b   1
# 2 a   2
# 3 a   3
# 4 c   4
# 5 b   5

data2
#   one2    two2
# 0 a   10
# 1 b   11
# 2 c   12
# 3 d   13

data = pd.merge(data1, data2, left_on='one1', right_on='one2')   # 默认是inner的合并,因此d这一行就欸有体现了
data
#   one1    two1    one2    two2
# 0 a   0   a   10
# 1 a   2   a   10
# 2 a   3   a   10
# 3 b   1   b   11
# 4 b   5   b   11
# 5 c   4   c   12

data = pd.merge(data1, data2, left_on='two1', right_index=True, how='outer')   # 根据index进行合并
data
#   one1    two1    one2    two2
# 0 a                 0 a   10.0
# 1 b                 1 b   11.0
# 2 a                 2 c   12.0
# 3 a                 3 d   13.0
# 4 c                    4  NaN NaN
# 5 b                 5 NaN NaN

data = pd.merge(data1, data2, left_on='two1', right_index=True) 
data
#   one1    two1    one2    two2
# 0 a   0   a   10
# 1 b   1   b   11
# 2 a   2   c   12
# 3 a   3   d   13

join,要求两个dataframe的index行数必须相等

data1
#   one1    two1
# 0 a   0
# 1 b   1
# 2 a   2
# 3 a   3

data2
#   one2    two2
# 0 a   10
# 1 b   11
# 2 c   12
# 3 d   13

data1.join(data2)   # 要求data1 data2 index相同(如果不同,以data1为准,针对data2没有的部分补充NaN),且没有重叠的列
#   one1    two1    one2    two2
# 0 a   0   a   10
# 1 b   1   b   11
# 2 a   2   c   12
# 3 a   3   d   13

assign 快速添加列

data1
#   one1    two1
# 0 a   0
# 1 b   1
# 2 a   2
# 3 a   3

data = data1.assign(three=np.arange(4))   # 添加一列,列名为three,该列对应的行数必须与data1相同
data
#   one1    two1    three
# 0 a   0   0
# 1 b   1   1
# 2 a   2   2
# 3 a   3   3

纵向合并,增加行

data1 = pd.DataFrame(np.random.randn(3,4), columns = ['a','b','c','d'])
data2 = pd.DataFrame(np.random.randn(2,3), columns = ['b','d','a'])
data1
#         a                b                        c                    d
# 0 0.151172            -0.173789   -0.484290   0.078217
# 1 -0.775621   -0.317611   -0.739203   -1.568968
# 2 -0.445466   1.300033    0.438614    -1.535612

data2
#         b             d                   a
# 0 0.538695            -0.081372   1.291073
# 1 -2.111805   -0.443657   -0.112507

data = pd.concat([data1, data2], ignore_index=True)
data
#   a                     b                 c                   d
# 0 0.151172            -0.173789   -0.484290   0.078217
# 1 -0.775621   -0.317611   -0.739203   -1.568968
# 2 -0.445466   1.300033            0.438614            -1.535612
# 3 1.291073            0.538695        NaN                 -0.081372
# 4 -0.112507   -2.111805   NaN                 -0.443657
上一篇下一篇

猜你喜欢

热点阅读