11. 日月光华 Python数据分析-Pandas-布尔过滤和
2023-07-10 本文已影响0人
薛东弗斯
import numpy as np
import pandas as pd
# Draw 1000 standard-normal samples and wrap them in a Series.
samples = np.random.randn(1000)
data = pd.Series(samples)
data.describe()
# count 1000.000000
# mean 0.037036
# std 1.004643
# min -3.185389
# 25% -0.678306
# 50% 0.000934
# 75% 0.696469
# max 3.278374
# dtype: float64
# Boolean mask: True for every element whose absolute value is greater than 3.
outlier_mask = data.abs() > 3
outlier_mask
# 0 False
# 1 False
# 2 False
# 3 False
# 4 False
# ...
# 995 False
# 996 False
# 997 False
# 998 False
# 999 False
# Length: 1000, dtype: bool
# Boolean indexing keeps only the elements flagged by the mask.
data[outlier_mask]
# 290 3.260396
# 432 3.278374
# 842 -3.185389
# 864 3.155570
# 869 3.126912
# dtype: float64
# Build a 1000 x 4 frame of standard-normal samples and summarise each column.
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
# 0 1 2 3
# count 1000.000000 1000.000000 1000.000000 1000.000000
# mean -0.008929 -0.018384 0.010087 0.007836
# std 0.985952 1.016388 0.976499 0.999119
# min -3.635728 -3.422629 -2.759628 -3.317702
# 25% -0.652795 -0.716052 -0.655192 -0.674227
# 50% -0.020611 -0.042399 -0.033997 -0.020016
# 75% 0.670707 0.682946 0.665001 0.664010
# max 3.221941 3.473221 3.360565 3.454454
# Element-wise boolean mask: True where the absolute value exceeds 3.
(np.abs(data) > 3)
# 0 1 2 3
# 0 False False False False
# 1 False False False False
# 2 False False False False
# 3 False False False False
# 4 False False False False
# ... ... ... ... ...
# 995 False False False False
# 996 False False False False
# 997 False False False False
# 998 False False False False
# 999 False False False False
# 1000 rows × 4 columns
# Keep the rows that contain at least one value with absolute value > 3.
# any(axis=1) reduces across the columns of each row and is True as soon as
# one value matches. `axis` is passed by keyword because DataFrame.any()
# only accepts keyword arguments from pandas 2.0 on; `.any(1)` raises TypeError.
data[(np.abs(data) > 3).any(axis=1)]
# 0 1 2 3
# 145 -0.410839 -3.462612 -0.111049 1.429950
# 360 1.008677 -1.389130 -0.836342 -3.423012
# 432 -0.234295 -0.456313 3.513138 -0.874108
# 588 -0.850638 -3.243294 0.853198 0.043245
# 799 -1.458603 0.708436 1.633709 -3.273133
# 955 1.083687 0.254223 -1.068372 -3.545360
any([True, False, False])  # built-in any(): True if at least one element is True
# True
# all(axis=1) keeps a row only when every column satisfies the condition;
# `axis` is spelled as a keyword here too, for consistency with any() above.
data[(np.abs(data) > 1).all(axis=1)]
# 0 1 2 3
# 251 1.320847 1.332232 -1.012046 1.707501
# 313 -1.540200 -1.720505 -1.470236 -1.548243
# 345 -1.718806 -1.088927 -1.060789 -2.083825
# 590 1.590940 -1.526003 2.246264 1.192285
# 628 -1.336490 1.139275 -1.697524 1.188453
# 978 1.037838 -1.842687 1.069139 -1.607545
all([True, False, False])  # built-in all(): True only if every element is True
# False
# np.sign returns the sign of each element: 1 for values > 0, -1 for values < 0
# (and 0 for exactly 0).
np.sign(data).head()
# 0 1 2 3
# 0 -1.0 -1.0 -1.0 -1.0
# 1 1.0 1.0 1.0 1.0
# 2 -1.0 1.0 1.0 1.0
# 3 1.0 -1.0 1.0 1.0
# 4 1.0 -1.0 1.0 1.0
# Continuous data is often discretised (binned) to make analysis easier.
data = pd.DataFrame(np.random.randint(1, 100, (50, 2)))
bins = [0, 20, 40, 60, 80, 100]  # explicit bin edges
# Bin column 0; right=False makes each interval closed on the left and open on the right.
cats = pd.cut(data.iloc[:, 0], bins=bins, right=False)
cats
# 0 [0, 20)
# 1 [0, 20)
# 2 [60, 80)
# 3 [40, 60)
# 4 [20, 40)
# 5 [60, 80)
# 6 [40, 60)
# 7 [0, 20)
# 8 [40, 60)
# 9 [40, 60)
# 10 [40, 60)
# 11 [80, 100)
# 12 [0, 20)
# 13 [0, 20)
# 14 [80, 100)
# 15 [60, 80)
# 16 [20, 40)
# 17 [40, 60)
# 18 [40, 60)
# 19 [20, 40)
# 20 [80, 100)
# 21 [0, 20)
# 22 [40, 60)
# 23 [0, 20)
# 24 [20, 40)
# 25 [80, 100)
# 26 [0, 20)
# 27 [60, 80)
# 28 [80, 100)
# 29 [0, 20)
# 30 [20, 40)
# 31 [80, 100)
# 32 [20, 40)
# 33 [80, 100)
# 34 [0, 20)
# 35 [20, 40)
# 36 [0, 20)
# 37 [60, 80)
# 38 [80, 100)
# 39 [80, 100)
# 40 [40, 60)
# 41 [20, 40)
# 42 [0, 20)
# 43 [80, 100)
# 44 [20, 40)
# 45 [40, 60)
# 46 [60, 80)
# 47 [20, 40)
# 48 [20, 40)
# 49 [80, 100)
# Name: 0, dtype: category
# Categories (5, interval[int64, left]): [[0, 20) < [20, 40) < [40, 60) < [60, 80) < [80, 100)]
cats.value_counts()  # number of values that fall in each bin, sorted from most to least frequent
# [0, 20) 12
# [20, 40) 11
# [80, 100) 11
# [40, 60) 10
# [60, 80) 6
# Name: 0, dtype: int64
pd.cut(data.iloc[:, 0], 4)  # an integer asks for that many equal-width bins over the data range
# 0 (0.903, 25.25]
# 1 (0.903, 25.25]
# 2 (49.5, 73.75]
# 3 (49.5, 73.75]
# 4 (0.903, 25.25]
# 5 (73.75, 98.0]
# 6 (49.5, 73.75]
# 7 (0.903, 25.25]
# 8 (25.25, 49.5]
# 9 (49.5, 73.75]
# 10 (49.5, 73.75]
# 11 (73.75, 98.0]
# 12 (0.903, 25.25]
# 13 (0.903, 25.25]
# 14 (73.75, 98.0]
# 15 (49.5, 73.75]
# 16 (25.25, 49.5]
# 17 (49.5, 73.75]
# 18 (25.25, 49.5]
# 19 (25.25, 49.5]
# 20 (73.75, 98.0]
# 21 (0.903, 25.25]
# 22 (25.25, 49.5]
# 23 (0.903, 25.25]
# 24 (0.903, 25.25]
# 25 (73.75, 98.0]
# 26 (0.903, 25.25]
# 27 (49.5, 73.75]
# 28 (73.75, 98.0]
# 29 (0.903, 25.25]
# 30 (25.25, 49.5]
# 31 (73.75, 98.0]
# 32 (25.25, 49.5]
# 33 (73.75, 98.0]
# 34 (0.903, 25.25]
# 35 (25.25, 49.5]
# 36 (0.903, 25.25]
# 37 (49.5, 73.75]
# 38 (73.75, 98.0]
# 39 (73.75, 98.0]
# 40 (25.25, 49.5]
# 41 (0.903, 25.25]
# 42 (0.903, 25.25]
# 43 (73.75, 98.0]
# 44 (0.903, 25.25]
# 45 (49.5, 73.75]
# 46 (49.5, 73.75]
# 47 (25.25, 49.5]
# 48 (0.903, 25.25]
# 49 (73.75, 98.0]
# Name: 0, dtype: category
# Categories (4, interval[float64, right]): [(0.903, 25.25] < (25.25, 49.5] < (49.5, 73.75] < (73.75, 98.0]]
# pd.qcut bins by quantiles: 4 bins that each hold roughly the same number of samples.
pd.qcut(data.iloc[:, 0], 4).value_counts()
# (0.999, 20.5] 13
# (72.75, 98.0] 13
# (20.5, 45.5] 12
# (45.5, 72.75] 12
# Name: 0, dtype: int64