9. 日月光华 Python数据分析-Pandas-数据清理
2023-07-09 本文已影响0人
薛东弗斯
重复值处理
import pandas as pd
import numpy as np
# Demo frame: row 1 repeats row 0 and row 4 repeats row 3.
data = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3],
})
data
#     k1  k2
# 0  one   1
# 1  one   1
# 2  one   2
# 3  two   3
# 4  two   3

# Flag every row that is identical to an earlier row (all columns compared).
data.duplicated(keep='first')
# 0    False
# 1     True
# 2    False
# 3    False
# 4     True
# dtype: bool

# Summing the boolean mask counts the duplicated rows.
data.duplicated().sum()
# 2

# Drop the duplicated rows; this returns a new frame and leaves `data` as is.
data.drop_duplicates()
#     k1  k2
# 0  one   1
# 2  one   2
# 3  two   3

# Only column k1 is considered when deciding what counts as a duplicate.
data.drop_duplicates(subset='k1')
#     k1  k2
# 0  one   1
# 3  two   3

# Add a constant column so that duplicates can be judged on a column subset.
data = data.assign(k3=1)
data
#     k1  k2  k3
# 0  one   1   1
# 1  one   1   1
# 2  one   2   1
# 3  two   3   1
# 4  two   3   1

data.duplicated(subset=['k1', 'k3'])
# 0    False
# 1     True
# 2     True
# 3    False
# 4     True
# dtype: bool

# keep='first' is the default: the first ROW of each duplicate group survives.
# From here on `data` holds only the de-duplicated rows.
data = data.drop_duplicates(subset=['k1', 'k3'], keep='first')
data
#     k1  k2  k3
# 0  one   1   1
# 3  two   3   1
数值替换
# Rebuild the full five-row frame: the de-duplication step above left `data`
# with only rows 0 and 3, which would not match the outputs shown below.
data = pd.DataFrame({
    'k1': ['one'] * 3 + ['two'] * 2,
    'k2': [1, 1, 2, 3, 3],
    'k3': 1,
})
data
#     k1  k2  k3
# 0  one   1   1
# 1  one   1   1
# 2  one   2   1
# 3  two   3   1
# 4  two   3   1

# Replace values in a single column; the result is a new Series and `data`
# itself is not modified.
data['k1'].replace('two', 'three')
# 0      one
# 1      one
# 2      one
# 3    three
# 4    three
# Name: k1, dtype: object

# Replace every 1 in the whole frame with 100, modifying `data` in place.
data.replace(1, 100, inplace=True)
data
#     k1   k2   k3
# 0  one  100  100
# 1  one  100  100
# 2  one    2  100
# 3  two    3  100
# 4  two    3  100
过滤缺失值
# One-dimensional Series containing two missing values.
data = pd.Series([2, np.nan, 4, np.nan, 8.5])
data
# 0    2.0
# 1    NaN
# 2    4.0
# 3    NaN
# 4    8.5
# dtype: float64

# isnull() gives a boolean mask; summing it counts the missing entries.
n_missing = data.isnull().sum()
n_missing
# 2

# notnull() is the inverse mask; it has to be summed as well to get a count
# (without .sum() the result is a boolean Series, not the number 3).
n_present = data.notnull().sum()
n_present
# 3

# Boolean indexing with the notnull() mask keeps only the non-missing values.
data[data.notnull()]
# 0    2.0
# 2    4.0
# 4    8.5
# dtype: float64

# dropna() does the same filtering; inplace=True modifies `data` itself.
data.dropna(inplace=True)
data
# 0    2.0
# 2    4.0
# 4    8.5
# dtype: float64
# 4x3 frame of random normal values. The numbers in the output comments are
# one example run; they come from np.random.randn and differ on every run.
data = pd.DataFrame(
    np.random.randn(4, 3),
    index=['a', 'b', 'c', 'd'],
    columns=['aa', 'bb', 'cc'],
)
# Blank out columns aa/bb for rows b..d, and additionally cc for row b,
# so that row b is entirely missing.
data.loc['b':, ['aa', 'bb']] = np.nan
data.loc['b', 'cc'] = np.nan
data
#          aa        bb        cc
# a -0.184374 -2.293165 -0.710435
# b       NaN       NaN       NaN
# c       NaN       NaN -0.585972
# d       NaN       NaN  0.126403

data.isna()
#       aa     bb     cc
# a  False  False  False
# b   True   True   True
# c   True   True  False
# d   True   True  False

# Default (how='any'): a row is dropped as soon as it has one missing value.
data.dropna(how='any')
#          aa        bb        cc
# a -0.184374 -2.293165 -0.710435

# how='all': a row is dropped only when every value in it is missing.
data.dropna(how='all')
#          aa        bb        cc
# a -0.184374 -2.293165 -0.710435
# c       NaN       NaN -0.585972
# d       NaN       NaN  0.126403

# Neither dropna() call above was in place, so `data` is unchanged.
data
#          aa        bb        cc
# a -0.184374 -2.293165 -0.710435
# b       NaN       NaN       NaN
# c       NaN       NaN -0.585972
# d       NaN       NaN  0.126403

# Make column aa entirely missing.
data.loc['a', 'aa'] = np.nan
data
#          aa        bb        cc
# a       NaN -2.293165 -0.710435
# b       NaN       NaN       NaN
# c       NaN       NaN -0.585972
# d       NaN       NaN  0.126403

# axis=1 switches from rows (the default) to columns: with how='all' only a
# column whose values are all NaN is dropped.
data.dropna(axis='columns', how='all')
#          bb        cc
# a -2.293165 -0.710435
# b       NaN       NaN
# c       NaN -0.585972
# d       NaN  0.126403
填充缺失值
# `data` is the 4x3 frame left by the previous section (column aa all NaN).
# The numbers shown are one example run of random data and will differ.
data
#    aa        bb        cc
# a NaN -2.293165 -0.710435
# b NaN       NaN       NaN
# c NaN       NaN -0.585972
# d NaN       NaN  0.126403

# A dict gives a separate fill value per column; column cc is not listed,
# so its missing value stays NaN. The result is a new frame.
data.fillna(value={'aa': 1, 'bb': 100})
#     aa          bb        cc
# a  1.0   -2.293165 -0.710435
# b  1.0  100.000000       NaN
# c  1.0  100.000000 -0.585972
# d  1.0  100.000000  0.126403
# Fill the missing entries of a Series with the mean of the present values.
data = pd.Series([2, np.nan, 4, np.nan, 8.5])
mean_value = data.mean()  # NaN entries are skipped: (2 + 4 + 8.5) / 3
data.fillna(mean_value)
# 0    2.000000
# 1    4.833333
# 2    4.000000
# 3    4.833333
# 4    8.500000
# dtype: float64