python

15 Pandas.DataFrame函数方法

2018-10-08  本文已影响36人  7125messi

1.abs(),返回DataFrame每个数值的绝对值,前提是所有元素均为数值型

import pandas as pd
import numpy as np
df=pd.read_excel('南京银行.xlsx',index_col='Date')
df1=df[:5]
df1.iat[0,1]=-df1.iat[0,1]
df1
            Open  High   Low  Close  Turnover    Volume
Date                                                   
2017-09-15  8.06 -8.08  8.03   8.04    195.43  24272800
2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600
df1.abs()
            Open  High   Low  Close  Turnover      Volume
Date                                                     
2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800.0
2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600.0
2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100.0
2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700.0
2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600.0

2.add(other, axis='columns', level=None, fill_value=None) 将某个序列或表中的元素与本表中的元素相加,默认匹配列元素

ar1=[8.1,8.2,8.0,8.15,200.00,32000000]
cl1=['Open','High','Low','Close','Turnover','Volume']
se1=pd.Series(data=ar1,index=cl1)
se1

Open               8.10
High               8.20
Low                8.00
Close              8.15
Turnover         200.00
Volume      32000000.00
dtype: float64
df1.add(se1)
Open   High    Low  Close  Turnover      Volume
Date                                                        
2017-09-15  16.16   0.12  16.03  16.19    395.43  56272800.0
2017-09-18  16.15  16.33  16.03  16.21    400.76  56867600.0
2017-09-19  16.13  16.26  15.94  16.15    633.76  86253100.0
2017-09-20  16.07  16.26  15.95  16.18    519.94  71909700.0
2017-09-21  16.12  16.30  15.99  16.19    441.94  62056600.0
df1.add(df1)

             Open   High    Low  Close  Turnover     Volume
Date                                                       
2017-09-15  16.12 -16.16  16.06  16.08    390.86   48545600
2017-09-18  16.10  16.26  16.06  16.12    401.52   49735200
2017-09-19  16.06  16.12  15.88  16.00    867.52  108506200
2017-09-20  15.94  16.12  15.90  16.06    639.88   79819400
2017-09-21  16.04  16.20  15.98  16.08    483.88   60113200

3.add_prefix()和add_suffix()为列名添加前缀或后缀

df1.add_prefix('list')

            listOpen  listHigh  listLow  listClose  listTurnover  listVolume
Date                                                                        
2017-09-15      8.06      8.08     8.03       8.04        195.43    24272800
2017-09-18      8.05      8.13     8.03       8.06        200.76    24867600
2017-09-19      8.03      8.06     7.94       8.00        433.76    54253100
2017-09-20      7.97      8.06     7.95       8.03        319.94    39909700
2017-09-21      8.02      8.10     7.99       8.04        241.94    30056600
df1.add_suffix('list')

            Openlist  Highlist  Lowlist  Closelist  Turnoverlist  Volumelist
Date                                                                        
2017-09-15      8.06      8.08     8.03       8.04        195.43    24272800
2017-09-18      8.05      8.13     8.03       8.06        200.76    24867600
2017-09-19      8.03      8.06     7.94       8.00        433.76    54253100
2017-09-20      7.97      8.06     7.95       8.03        319.94    39909700
2017-09-21      8.02      8.10     7.99       8.04        241.94    30056600

4.agg(func, axis=0, *args, **kwargs),合计运算,常用的函数有min,max,prod,mean,std,var,median等

#所有列只做一种运算
df1.agg(sum)
Open        4.013000e+01
High        4.043000e+01
Low         3.994000e+01
Close       4.017000e+01
Turnover    1.391830e+03
Volume      1.733598e+08
dtype: float64
#所有列做两种运算
df1.agg(['sum','min'])
      Open   High    Low  Close  Turnover     Volume
sum  40.13  40.43  39.94  40.17   1391.83  173359800
min   7.97   8.06   7.94   8.00    195.43   24272800
#不同列做不同运算
df1.agg({'Open':['sum','min'],'Close':['sum','max']})
     Close   Open
max   8.06    NaN
min    NaN   7.97
sum  40.17  40.13

5.align(),DataFrame与Series或DataFrame之间连接运算,常用的有内联,外联,左联,右联

df2=df[3:5]
df2
Out[68]: 
            Open  High   Low  Close  Turnover    Volume
Date                                                   
2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600
df1.align(df2,join='inner') #返回的为元组类型对象
(            Open  High   Low  Close  Turnover    Volume
 Date                                                   
 2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
 2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600,
             Open  High   Low  Close  Turnover    Volume
 Date                                                   
 2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
 2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600)
df1.align(df2,join='left')
Out[69]: 
(            Open  High   Low  Close  Turnover    Volume
 Date                                                   
 2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800
 2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
 2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
 2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
 2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600,
             Open  High   Low  Close  Turnover      Volume
 Date                                                     
 2017-09-15   NaN   NaN   NaN    NaN       NaN         NaN
 2017-09-18   NaN   NaN   NaN    NaN       NaN         NaN
 2017-09-19   NaN   NaN   NaN    NaN       NaN         NaN
 2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700.0
 2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600.0)
df1.align(df2,join='left')[0]
Out[70]: 
            Open  High   Low  Close  Turnover    Volume
Date                                                   
2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800
2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600

6.all()和any(),分别判断选定轴上的元素是否全部为真(非零且非空)或是否存在任意一个为真的元素,返回值为Boolean类型

df1.all(axis=0)
Out[72]: 
Open        True
High        True
Low         True
Close       True
Turnover    True
Volume      True
dtype: bool
df1.all(axis=1)
Out[73]: 
Date
2017-09-15    True
2017-09-18    True
2017-09-19    True
2017-09-20    True
2017-09-21    True
dtype: bool
df1.any()
Out[74]: 
Open        True
High        True
Low         True
Close       True
Turnover    True
Volume      True
dtype: bool

7.append(),在此表格尾部添加其他对象的行,返回一个新的对象(注:append在较新版本的pandas中已被弃用并移除,推荐改用pd.concat)

df2=df[5:7]
df2
Out[93]: 
            Open  High   Low  Close  Turnover    Volume
Date                                                   
2017-09-22  8.01  8.10  8.00   8.08    300.13  37212200
2017-09-25  8.06  8.07  7.97   7.99    262.30  32754500

df1.append(df2)
Out[94]: 
            Open  High   Low  Close  Turnover    Volume
Date                                                   
2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800
2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600
2017-09-22  8.01  8.10  8.00   8.08    300.13  37212200
2017-09-25  8.06  8.07  7.97   7.99    262.30  32754500
#更高效的方法
>>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
...           ignore_index=True)
   A
0  0
1  1
2  2
3  3
4  4

8.apply(func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds) 对于DataFrame的行或列应用某个函数

df1.apply(np.mean,axis=0)
Out[96]: 
Open        8.026000e+00
High        8.086000e+00
Low         7.988000e+00
Close       8.034000e+00
Turnover    2.783660e+02
Volume      3.467196e+07
dtype: float64
df1.apply(np.max,axis=1)
Out[97]: 
Date
2017-09-15    24272800.0
2017-09-18    24867600.0
2017-09-19    54253100.0
2017-09-20    39909700.0
2017-09-21    30056600.0
dtype: float64

9.applymap(func) 对DataFrame的元素应用某个函数

df1.applymap(lambda x:'%.3f' %x)
Out[100]: 
             Open   High    Low  Close Turnover        Volume
Date                                                         
2017-09-15  8.060  8.080  8.030  8.040  195.430  24272800.000
2017-09-18  8.050  8.130  8.030  8.060  200.760  24867600.000
2017-09-19  8.030  8.060  7.940  8.000  433.760  54253100.000
2017-09-20  7.970  8.060  7.950  8.030  319.940  39909700.000
2017-09-21  8.020  8.100  7.990  8.040  241.940  30056600.000

10.as_blocks()和as_matrix(),分别用于将DataFrame转化为以数据类型为键值的字典和将DataFrame转化为二维数组(注:这两个方法在较新版本的pandas中已被弃用,as_matrix()可用.values或.to_numpy()代替)

df1.as_blocks()
Out[105]: 
{'float64':             Open  High   Low  Close  Turnover
 Date                                         
 2017-09-15  8.06  8.08  8.03   8.04    195.43
 2017-09-18  8.05  8.13  8.03   8.06    200.76
 2017-09-19  8.03  8.06  7.94   8.00    433.76
 2017-09-20  7.97  8.06  7.95   8.03    319.94
 2017-09-21  8.02  8.10  7.99   8.04    241.94, 'int64':               Volume
 Date                
 2017-09-15  24272800
 2017-09-18  24867600
 2017-09-19  54253100
 2017-09-20  39909700
 2017-09-21  30056600}
df1.as_matrix()
Out[106]: 
array([[  8.06000000e+00,   8.08000000e+00,   8.03000000e+00,
          8.04000000e+00,   1.95430000e+02,   2.42728000e+07],
       [  8.05000000e+00,   8.13000000e+00,   8.03000000e+00,
          8.06000000e+00,   2.00760000e+02,   2.48676000e+07],
       [  8.03000000e+00,   8.06000000e+00,   7.94000000e+00,
          8.00000000e+00,   4.33760000e+02,   5.42531000e+07],
       [  7.97000000e+00,   8.06000000e+00,   7.95000000e+00,
          8.03000000e+00,   3.19940000e+02,   3.99097000e+07],
       [  8.02000000e+00,   8.10000000e+00,   7.99000000e+00,
          8.04000000e+00,   2.41940000e+02,   3.00566000e+07]])

11.asfreq(freq, method=None, how=None, normalize=False, fill_value=None),将时间序列转化为特定的频度

#创建一个具有4个分钟时间戳的序列
>>> index=pd.date_range('1/1/2017',periods=4,freq='T')
>>> series=pd.Series([0.0,None,2.0,3.0],index=index)
>>> df=pd.DataFrame({'S':series})
>>> df
                       S
2017-01-01 00:00:00  0.0
2017-01-01 00:01:00  NaN
2017-01-01 00:02:00  2.0
2017-01-01 00:03:00  3.0
#将序列升采样以30秒为间隔的时间序列
>>> df.asfreq(freq='30S')
                       S
2017-01-01 00:00:00  0.0
2017-01-01 00:00:30  NaN
2017-01-01 00:01:00  NaN
2017-01-01 00:01:30  NaN
2017-01-01 00:02:00  2.0
2017-01-01 00:02:30  NaN
2017-01-01 00:03:00  3.0
#再次升采样,并将填充值设为5.0,可以发现并不改变升采样之前的数值
>>> df.asfreq(freq='30S',fill_value=5.0)
                       S
2017-01-01 00:00:00  0.0
2017-01-01 00:00:30  5.0
2017-01-01 00:01:00  NaN
2017-01-01 00:01:30  5.0
2017-01-01 00:02:00  2.0
2017-01-01 00:02:30  5.0
2017-01-01 00:03:00  3.0
#再次升采样,提供一个方法,对于空值,用后面的一个值填充
>>> df.asfreq(freq='30S',method='bfill')
                       S
2017-01-01 00:00:00  0.0
2017-01-01 00:00:30  NaN
2017-01-01 00:01:00  NaN
2017-01-01 00:01:30  2.0
2017-01-01 00:02:00  2.0
2017-01-01 00:02:30  3.0
2017-01-01 00:03:00  3.0

12.asof(where, subset=None),返回指定时间点(或各时间点)之前最近一个非NaN值所在的行

>>> df.asof(index[0])
S    0.0
Name: 2017-01-01 00:00:00, dtype: float64
>>> df.asof(index)
                       S
2017-01-01 00:00:00  0.0
2017-01-01 00:01:00  0.0
2017-01-01 00:02:00  2.0
2017-01-01 00:03:00  3.0

13.assign(**kwargs),向DataFrame添加新的列,返回一个新的对象包括了原来的列和新增加的列

>>> df=pd.DataFrame({'A':range(1,11),'B':np.random.randn(10)})
>>> df
    A         B
0   1  0.540750
1   2  0.099605
2   3  0.165043
3   4 -1.379514
4   5  0.357865
5   6 -0.060789
6   7 -0.544788
7   8 -0.347995
8   9  0.372269
9  10 -0.212716
>>> df.assign(ln_A=lambda x:np.log(x.A))
    A         B      ln_A
0   1  0.540750  0.000000
1   2  0.099605  0.693147
2   3  0.165043  1.098612
3   4 -1.379514  1.386294
4   5  0.357865  1.609438
5   6 -0.060789  1.791759
6   7 -0.544788  1.945910
7   8 -0.347995  2.079442
8   9  0.372269  2.197225
9  10 -0.212716  2.302585
#assign返回的是新对象,并不修改原df,因此上一步添加的ln_A列没有保留下来;若需同时添加多列,可在同一次assign调用中传入多个关键字参数
>>> df.assign(abs_B=lambda x:np.abs(x.B))
    A         B     abs_B
0   1  0.540750  0.540750
1   2  0.099605  0.099605
2   3  0.165043  0.165043
3   4 -1.379514  1.379514
4   5  0.357865  0.357865
5   6 -0.060789  0.060789
6   7 -0.544788  0.544788
7   8 -0.347995  0.347995
8   9  0.372269  0.372269
9  10 -0.212716  0.212716

14.astype(dtype, copy=True, errors='raise', **kwargs) 将pandas对象数据类型设置为指定类型

>>> ser=pd.Series([5,6],dtype='int32')
>>> ser
0    5
1    6
dtype: int32
>>> ser.astype('int64')
0    5
1    6
dtype: int64
#转换为类目类型
>>> ser.astype('category')
0    5
1    6
dtype: category
Categories (2, int64): [5, 6]
>>> 
#转换为定制化排序的类目类型
>>> ser.astype('category',ordered=True,categories=[1,2])
0   NaN
1   NaN
dtype: category
Categories (2, int64): [1 < 2]

15. at_time()和between_time() 取某一时刻或某段时间相应的数据

df1.at_time('9:00AM')
Out[115]: 
Empty DataFrame
Columns: [Open, High, Low, Close, Turnover, Volume]
Index: []
df1.between_time('9:00AM','9:30AM')
Out[114]: 
Empty DataFrame
Columns: [Open, High, Low, Close, Turnover, Volume]
Index: []
df1.at_time('00:00AM')
Out[116]: 
            Open  High   Low  Close  Turnover    Volume
Date                                                   
2017-09-15  8.06  8.08  8.03   8.04    195.43  24272800
2017-09-18  8.05  8.13  8.03   8.06    200.76  24867600
2017-09-19  8.03  8.06  7.94   8.00    433.76  54253100
2017-09-20  7.97  8.06  7.95   8.03    319.94  39909700
2017-09-21  8.02  8.10  7.99   8.04    241.94  30056600

16.bfill(axis=None, inplace=False, limit=None, downcast=None)和fillna(method='bfill')效用等同

17.boxplot(column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds)

根据DataFrame的列元素或者可选分组绘制箱线图

df1.boxplot('Open')
Out[117]: <matplotlib.axes._subplots.AxesSubplot at 0x20374716860>
df1.boxplot(['Open','Close'])
Out[118]: <matplotlib.axes._subplots.AxesSubplot at 0x2037477da20>

18.mode(axis=0, numeric_only=False)取众数

Signature: df.mode(axis=0, numeric_only=False)
Docstring:
Gets the mode(s) of each element along the axis selected. Adds a row
for each mode per label, fills in gaps with nan.

Note that there could be multiple values returned for the selected
axis (when more than one item share the maximum frequency), which is
the reason why a dataframe is returned. If you want to impute missing
values with the mode in a dataframe ``df``, you can just do this:
``df.fillna(df.mode().iloc[0])``

Parameters
----------
axis : {0 or 'index', 1 or 'columns'}, default 0
    * 0 or 'index' : get mode of each column
    * 1 or 'columns' : get mode of each row
numeric_only : boolean, default False
    if True, only apply to numeric columns
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1, 1, 1, 2, 3],
                  'B':[3,2,1,7,7,4,4]})
df.mode()
Out[1]:
A   B
0   1.0 4
1   NaN 7
df = pd.DataFrame({'A': [1, 2, 1, 1, 1, 2, 3],
                  'B':[3,2,7,7,7,4,4]})
df.mode()
Out[2]:
A   B
0   1   7
df = pd.DataFrame({'A': [1, 2, 1, 1, 1, 2, 3],
                  'B':[3,1,1,7,7,4,4]})
df.mode()
Out[3]:
A   B
0   1.0 1
1   NaN 4
2   NaN 7
df = pd.DataFrame({'A': [1, 2, 2, 1, 1, 2, 3],
                  'B':[3,2,1,7,6,4,4]})
df.mode()
Out[5]:
A   B
0   1   4.0
1   2   NaN
df = pd.DataFrame({'A': [1, 2, 2, 1, 1, 2, 3],
                  'B':[3,2,1,7,7,4,4]})
df.mode()
Out[6]:
A   B
0   1   4
1   2   7
df = pd.DataFrame({'A': [1, 4, 2, 1, 3, 2, 3],
                  'B':[3,2,7,7,7,4,4]})
df.mode()
Out[8]:
A   B
0   1   7.0
1   2   NaN
2   3   NaN

19 pandas:对dataframe进行groupby后求众数mode

有如下一个dataframe,打算对a的每一个类别求b的众数(mode)
dir(df.groupby('a'))可以看到是没有mode函数的,因此不能直接使用df.groupby('a').mode().reset_index()

注意:.reset_index()是将Series对象转变成DataFrame

既然df.groupby('a')没有mode函数,又考虑到可能聚合函数agg能够接用外部函数,搜索网上的解决办法,经过尝试,在此总结并列出以下几种解决方案(当然也可以直接跳到最后一个解决方案)。

(1) 使用scipy.stats.mode():df中的B类别有两个众数,返回的结果B类别的众数取了较小的结果

>>> from scipy import stats
>>> df.groupby('a').agg(lambda x: stats.mode(x)[0][0]).reset_index()
   a  b
0  A  1
1  B  2

(2) 使用value_counts()

(1) 先看value_counts()的作用:可以看到得到的结果中的index是取值,内容是计数,并且index是降序排列的,因此取index[0]是取最大值,因此有两个众数以上的时候,会取到较大的结果

>>> ss = pd.Series([1,2,2,3,3])
>>> ss
0    1
1    2
2    2
3    3
4    3
dtype: int64
>>> ss.value_counts()
3    2
2    2
1    1
dtype: int64
>>> ss.value_counts().index[0]
3

(2) 应用到dataframe的groupby之后的聚合函数中:

>>> df.groupby('a').agg(lambda x: x.value_counts().index[0]).reset_index()
   a  b
0  A  1
1  B  3

(3)使用pd.Series.mode():该函数是返回Series的众数的,当众数有多个时,会返回一个list,里面包含了所有众数

>>> df.groupby('a').agg(pd.Series.mode).reset_index()
   a       b
0  A       1
1  B  [2, 3]

(4)使用pd.Series.mode()和np.mean():上述结果显然不是我想要的,但是如果对有多个众数的结果取均值作为新的众数,是较为满意的结果,则可以这样

>>> import numpy as np
>>> df.groupby('a').agg(lambda x: np.mean(pd.Series.mode(x))).reset_index()
  a    b
0  A  1.0
1  B  2.5

20 Pandas 描述统计函数

在进行统计描述时,pandas对三个数据对象的轴参数规定如下:
Series: 没有轴参数
DataFrame: “index” (axis=0, default), “columns” (axis=1)
Panel: “items” (axis=0), “major” (axis=1, default), “minor” (axis=2)
统计描述参数如下

df.count() #非空元素计算 
df.min() #最小值 
df.max() #最大值 
df.idxmin() #最小值的位置,类似于R中的which.min函数 
df.idxmax() #最大值的位置,类似于R中的which.max函数 
df.quantile(0.1) #10%分位数 
df.sum() #求和 
df.mean() #均值 
df.median() #中位数 
df.mode() #众数 
df.var() #方差 
df.std() #标准差 
df.mad() #平均绝对偏差 
df.skew() #偏度 
df.kurt() #峰度 
df.describe() #一次性输出多个描述性统计指标


 count
  Number of non-null observations
  观测值的个数


  sum
  Sum of values
  求和


  mean
  Mean of values
  求平均值


  mad
  Mean absolute deviation
  平均绝对方差


  median
  Arithmetic median of values
  中位数


  min
  Minimum
  最小值


  max
  Maximum
  最大值


  mode
  Mode
  众数


  abs
  Absolute Value
  绝对值


  prod
  Product of values
  乘积


  std
  Bessel-corrected sample standard deviation
  标准差


  var
  Unbiased variance
  方差


  sem
  Standard error of the mean
  标准误


  skew
  Sample skewness (3rd moment)
  偏度系数


  kurt
  Sample kurtosis (4th moment)
  峰度


  quantile
  Sample quantile (value at %)
  分位数


  cumsum
  Cumulative sum
  累加


  cumprod
  Cumulative product
  累乘


  cummax
  Cumulative maximum
  累最大值


  cummin
  Cumulative minimum
  累最小值


  cov()
  covariance
  协方差


  corr()
  correlation
  相关系数


  rank()
  rank by values
  排名


  pct_change()
  time change
  时间序列变

21 pandas replace函数使用小结

import numpy as np
import pandas as pd
df = pd.read_csv('emp.csv')
df
#Series对象值替换
s = df.iloc[2]#获取行索引为2数据
#单值替换
s.replace('?',np.nan)#用np.nan替换?
s.replace({'?':'NA'})#用NA替换?
#多值替换
s.replace(['?',r'$'],[np.nan,'NA'])#列表值替换
s.replace({'?':np.nan,'$':'NA'})#字典映射
#同缺失值填充方法类似
s.replace(['?','$'],method='pad')#向前填充
s.replace(['?','$'],method='ffill')#向前填充
s.replace(['?','$'],method='bfill')#向后填充
#limit参数控制填充次数
s.replace(['?','$'],method='bfill',limit=1)
#DataFrame对象值替换
#单值替换
df.replace('?',np.nan)#用np.nan替换?
df.replace({'?':'NA'})#用NA替换?
#按列指定单值替换
df.replace({'EMPNO':'?'},np.nan)#用np.nan替换EMPNO列中?
df.replace({'EMPNO':'?','ENAME':'.'},np.nan)#用np.nan替换EMPNO列中?和ENAME中.
#多值替换
df.replace(['?','.','$'],[np.nan,'NA','None'])##用np.nan替换?用NA替换. 用None替换$
df.replace({'?':'NA','$':None})#用NA替换? 用None替换$
df.replace({'?','$'},{'NA',None})#用NA替换? 用None替换$
#正则替换
df.replace(r'\?|\.|\$',np.nan,regex=True)#用np.nan替换?或.或$原字符
df.replace([r'\?',r'\$'],np.nan,regex=True)#用np.nan替换?和$
df.replace([r'\?',r'\$'],[np.nan,'NA'],regex=True)#用np.nan替换?用NA替换$符号
df.replace(regex={r'\?':None})
#value参数显示传递
df.replace(regex=[r'\?|\.|\$'],value=np.nan)#用np.nan替换?或.或$原字符
df['BrandName'].replace(
    to_replace=['ABC', 'AB'],
    value='A',
    inplace=True
)
上一篇 下一篇

猜你喜欢

热点阅读