9. 日月光华 Python数据分析-Pandas-数据清理

2023-07-09  本文已影响0人  薛东弗斯

重复值处理

import pandas as pd
import numpy as np

# Build a small frame that contains repeated rows.
data = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3],
})
data
#     k1  k2
# 0  one   1
# 1  one   1
# 2  one   2
# 3  two   3
# 4  two   3

# Flag each row that repeats an earlier row (all columns are compared).
data.duplicated()
# 0    False
# 1     True
# 2    False
# 3    False
# 4     True
# dtype: bool

# Count the duplicated rows: True counts as 1 when summed.
data.duplicated().sum()
# 2

# Drop the duplicated rows; a new frame is returned, `data` is unchanged.
data.drop_duplicates()
#     k1  k2
# 0  one   1
# 2  one   2
# 3  two   3

# Only column k1 is considered when deciding what is a duplicate.
data.drop_duplicates(subset='k1')
#     k1  k2
# 0  one   1
# 3  two   3

# Add a constant column.
data['k3'] = 1
data
#     k1  k2  k3
# 0  one   1   1
# 1  one   1   1
# 2  one   2   1
# 3  two   3   1
# 4  two   3   1

# Compare rows on k1 and k3 only.
data.duplicated(subset=['k1', 'k3'])
# 0    False
# 1     True
# 2     True
# 3    False
# 4     True
# dtype: bool

# keep='first' is the default: the first row of each duplicate group is
# kept. inplace=True modifies `data` itself instead of returning a copy.
data.drop_duplicates(subset=['k1', 'k3'], keep='first', inplace=True)
data
#     k1  k2  k3
# 0  one   1   1
# 3  two   3   1

数值替换

# Rebuild the five-row frame. The in-place drop_duplicates() call in the
# previous section left only rows 0 and 3, which would not reproduce the
# outputs shown below.
data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 2,
                     'k2': [1, 1, 2, 3, 3]})
data['k3'] = 1
data
#     k1  k2  k3
# 0  one   1   1
# 1  one   1   1
# 2  one   2   1
# 3  two   3   1
# 4  two   3   1

# Replace a value within a single column; a new Series is returned and
# `data` itself is not modified.
data.k1.replace('two', 'three')
# 0      one
# 1      one
# 2      one
# 3    three
# 4    three
# Name: k1, dtype: object

# Replace every 1 in the whole frame with 100, modifying `data` in place.
data.replace(1, 100, inplace=True)
data
#     k1   k2   k3
# 0  one  100  100
# 1  one  100  100
# 2  one    2  100
# 3  two    3  100
# 4  two    3  100

过滤缺失值

# Create a one-dimensional Series that contains two missing values.
data = pd.Series([2, np.nan, 4, np.nan, 8.5])
data
# 0    2.0
# 1    NaN
# 2    4.0
# 3    NaN
# 4    8.5
# dtype: float64

# Number of missing values: isnull() gives a boolean mask, sum() counts
# the True entries.
data.isnull().sum()
# 2

# Number of non-missing values. notnull() on its own returns a boolean
# Series, so it has to be summed to obtain the count shown here.
data.notnull().sum()
# 3

# Boolean indexing keeps only the non-missing entries.
data[data.notnull()]
# 0    2.0
# 2    4.0
# 4    8.5
# dtype: float64

# dropna() does the same filtering; inplace=True modifies `data` itself.
data.dropna(inplace=True)
data
# 0    2.0
# 2    4.0
# 4    8.5
# dtype: float64

# 4x3 frame of random normal values. The numbers differ on every run; the
# outputs below show one example run, only the NaN pattern is fixed.
data = pd.DataFrame(
    np.random.randn(4, 3),
    index=['a', 'b', 'c', 'd'],
    columns=['aa', 'bb', 'cc'],
)
# Blank out columns aa and bb from row b onwards, plus cc in row b.
data.loc['b':, ['aa', 'bb']] = np.nan
data.loc['b', 'cc'] = np.nan
data
#          aa        bb        cc
# a -0.184374 -2.293165 -0.710435
# b       NaN       NaN       NaN
# c       NaN       NaN -0.585972
# d       NaN       NaN  0.126403

# Boolean mask of the missing cells.
data.isna()
#       aa     bb     cc
# a  False  False  False
# b   True   True   True
# c   True   True  False
# d   True   True  False

# Default (how='any'): a row is dropped as soon as it has one missing value.
data.dropna()
#          aa        bb        cc
# a -0.184374 -2.293165 -0.710435

# how='all': a row is dropped only when every value in it is missing.
data.dropna(how='all')
#          aa        bb        cc
# a -0.184374 -2.293165 -0.710435
# c       NaN       NaN -0.585972
# d       NaN       NaN  0.126403

# Neither call above used inplace=True, so `data` is unchanged.
data
#          aa        bb        cc
# a -0.184374 -2.293165 -0.710435
# b       NaN       NaN       NaN
# c       NaN       NaN -0.585972
# d       NaN       NaN  0.126403

# Make column aa entirely missing.
data.loc['a', 'aa'] = np.nan
data
#    aa        bb        cc
# a NaN -2.293165 -0.710435
# b NaN       NaN       NaN
# c NaN       NaN -0.585972
# d NaN       NaN  0.126403

# Work on columns instead of rows (rows are the default axis): a column is
# dropped only when all of its values are NaN.
data.dropna(axis='columns', how='all')
#          bb        cc
# a -2.293165 -0.710435
# b       NaN       NaN
# c       NaN -0.585972
# d       NaN  0.126403

填充缺失值

# Frame left over from the dropna examples: column aa is entirely NaN.
# The non-NaN numbers are random and vary between runs.
data
#    aa        bb        cc
# a NaN -2.293165 -0.710435
# b NaN       NaN       NaN
# c NaN       NaN -0.585972
# d NaN       NaN  0.126403

# A dict gives one fill value per column; columns that are not listed
# (cc here) keep their NaN. A new frame is returned.
data.fillna(value={'aa': 1, 'bb': 100})
#     aa          bb        cc
# a  1.0   -2.293165 -0.710435
# b  1.0  100.000000       NaN
# c  1.0  100.000000 -0.585972
# d  1.0  100.000000  0.126403

# Fill the missing entries of a Series with the mean of the values that
# are present (mean() skips NaN). A new Series is returned.
data = pd.Series([2.0, np.nan, 4.0, np.nan, 8.5])
data.fillna(value=data.mean())
# 0    2.000000
# 1    4.833333
# 2    4.000000
# 3    4.833333
# 4    8.500000
# dtype: float64
上一篇下一篇

猜你喜欢

热点阅读