Pandas 处理丢失数据
2019-08-15 本文已影响0人
李小夭
import pandas as pd
import numpy as np
dates = pd.date_range('20130101',periods = 6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index = dates,columns = ['A','B','C','D'])
df
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
A B C D
2013-01-01 0 NaN 2.0 3
2013-01-02 4 5.0 NaN 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
dropna:如有nan,按行(axis=0)或按列(axis=1)drop
how = {'any','all'}
print(df.dropna(axis = 0, how='any'))
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
print(df.dropna(axis = 1, how='any'))
A D
2013-01-01 0 3
2013-01-02 4 7
2013-01-03 8 11
2013-01-04 12 15
2013-01-05 16 19
2013-01-06 20 23
fillna:将nan替换成其他值
print(df.fillna(value=0))
A B C D
2013-01-01 0 0.0 2.0 3
2013-01-02 4 5.0 0.0 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
isnull:检查是否有nan
print(df.isnull())
A B C D
2013-01-01 False True False False
2013-01-02 False False True False
2013-01-03 False False False False
2013-01-04 False False False False
2013-01-05 False False False False
2013-01-06 False False False False
# 是否至少包含一个True(用于数据量较大排查是否存在至少一个nan值)
print(np.any(df.isnull()) == True)
True