Pandas03:DataFrame&Series

2020-03-28  本文已影响0人  罗泽坤

selecting and indexing data

一、Construction of Series and DataFrame

这里只列举常用操作,具体内容请参见官方文档。
from pandas import DataFrame,Series
x1 = Series([1,2,3,4])
x2 = Series(data=[1,2,3,4],index=['a','b','c','d'])
Dict = {'a':1,'b':2,'c':3,'d':4}
x3 = Series(Dict)
print(x1)
print(x2)
print(x3)
0    1
1    2
2    3
3    4
dtype: int64
a    1
b    2
c    3
d    4
dtype: int64
a    1
b    2
c    3
d    4
dtype: int64
# dataframe使用
from pandas import Series, DataFrame
data = {'Chinese': [66, 95, 93, 90,80], 'Math': [30, 98, 96, 77, 90], 'English': [65, 85, 92, 88, 90]}
df1 = DataFrame(data)
df2 = DataFrame(data, index=['ZhangFei', 'GuanYu', 'LiuBei', 'DianWei', 'XuChu'], columns=['Chinese', 'Math', 'English'])
print(df1)
print(df2)

# 对列名进行更换
df2.rename(columns={'Chinese': '语文', 'English': '英语', 'Math': '数学'}, inplace = True)
print(df2.isnull())  #检验是否有缺失值
# 输出df2的统计概要:计数、均值、标准差、最小值、最大值以及各分位数
print(df2.describe())
   Chinese  Math  English
0       66    30       65
1       95    98       85
2       93    96       92
3       90    77       88
4       80    90       90
          Chinese  Math  English
ZhangFei       66    30       65
GuanYu         95    98       85
LiuBei         93    96       92
DianWei        90    77       88
XuChu          80    90       90
             语文     数学     英语
ZhangFei  False  False  False
GuanYu    False  False  False
LiuBei    False  False  False
DianWei   False  False  False
XuChu     False  False  False
              语文         数学         英语
count   5.000000   5.000000   5.000000
mean   84.800000  78.200000  84.000000
std    11.987493  28.163807  10.931606
min    66.000000  30.000000  65.000000
25%    80.000000  77.000000  85.000000
50%    90.000000  90.000000  88.000000
75%    93.000000  96.000000  90.000000
max    95.000000  98.000000  92.000000

二、indexing

import pandas as pd
import numpy as np

datas = pd.date_range('28/3/2020',periods = 8)
df = pd.DataFrame(data = np.random.randn(8,4),index=datas,columns=['A','B','C','D'])
print(df)
a = df['A']   # 以单个列名索引DataFrame,得到的是Series
print()
print(a)
print()
print(a[datas[5]])
                   A         B         C         D
2020-03-28  0.040467  2.502838  1.750702  0.752269
2020-03-29 -0.591560 -0.607783  0.390446  0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31  2.107192  1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
2020-04-03 -1.193671  2.798576  3.594377 -0.016369
2020-04-04  1.592835 -0.351965  1.728636  0.547841

2020-03-28    0.040467
2020-03-29   -0.591560
2020-03-30   -0.201351
2020-03-31    2.107192
2020-04-01   -0.263185
2020-04-02   -0.668462
2020-04-03   -1.193671
2020-04-04    1.592835
Freq: D, Name: A, dtype: float64

-0.6684622593090315
print(df)
print(df[['A','B']])  # indexing A B 两列
print(df[:3]) #对行进行切片,支持反向和步长
                   A         B         C         D
2020-03-28  0.040467  2.502838  1.750702  0.752269
2020-03-29 -0.591560 -0.607783  0.390446  0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31  2.107192  1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
2020-04-03 -1.193671  2.798576  3.594377 -0.016369
2020-04-04  1.592835 -0.351965  1.728636  0.547841
                   A         B
2020-03-28  0.040467  2.502838
2020-03-29 -0.591560 -0.607783
2020-03-30 -0.201351 -1.134350
2020-03-31  2.107192  1.267613
2020-04-01 -0.263185 -0.974481
2020-04-02 -0.668462 -0.668671
2020-04-03 -1.193671  2.798576
2020-04-04  1.592835 -0.351965
                   A         B         C         D
2020-03-28  0.040467  2.502838  1.750702  0.752269
2020-03-29 -0.591560 -0.607783  0.390446  0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
# 使用loc索引器按标签选取行列组合;对Series而言则是按标签索引行
print(df)
print(df.loc[:,['A','B']])  
#此种形式错误print(df[[:],['A','B']])
                   A         B         C         D
2020-03-28  0.040467  2.502838  1.750702  0.752269
2020-03-29 -0.591560 -0.607783  0.390446  0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31  2.107192  1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
2020-04-03 -1.193671  2.798576  3.594377 -0.016369
2020-04-04  1.592835 -0.351965  1.728636  0.547841
                   A         B
2020-03-28  0.040467  2.502838
2020-03-29 -0.591560 -0.607783
2020-03-30 -0.201351 -1.134350
2020-03-31  2.107192  1.267613
2020-04-01 -0.263185 -0.974481
2020-04-02 -0.668462 -0.668671
2020-04-03 -1.193671  2.798576
2020-04-04  1.592835 -0.351965

利用索引修改series和DataFrame值

sa = pd.Series(range(4),list('abcd'))
dfa = df.copy()
print(sa)
print(dfa)
a    0
b    1
c    2
d    3
dtype: int64
                   A         B         C         D
2020-03-28  0.040467  2.502838  1.750702  0.752269
2020-03-29 -0.591560 -0.607783  0.390446  0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31  2.107192  1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
2020-04-03 -1.193671  2.798576  3.594377 -0.016369
2020-04-04  1.592835 -0.351965  1.728636  0.547841
sa['a']=100
print(sa)
dfa['A'] = list(range(len(dfa.index)))
print(dfa)
a    100
b      1
c      2
d      3
dtype: int64
            A         B         C         D
2020-03-28  0  2.502838  1.750702  0.752269
2020-03-29  1 -0.607783  0.390446  0.759989
2020-03-30  2 -1.134350 -0.912988 -0.313645
2020-03-31  3  1.267613 -0.421305 -0.286911
2020-04-01  4 -0.974481 -0.655299 -0.672898
2020-04-02  5 -0.668671  0.525188 -0.134835
2020-04-03  6  2.798576  3.594377 -0.016369
2020-04-04  7 -0.351965  1.728636  0.547841
x = pd.DataFrame({'y':[1,2,3],'z':[4,5,6]})
print(x)
print(x.iloc[1:]) #iloc[1:]与DataFrame[1:]等价,都是按位置对行切片
x.iloc[1]=[20,50]
print(x)
   y  z
0  1  4
1  2  5
2  3  6
   y  z
1  2  5
2  3  6
    y   z
0   1   4
1  20  50
2   3   6
#boolean indexing
print(df)
print(df>0)  # 返回bool类型
                   A         B         C         D
2020-03-28  0.040467  2.502838  1.750702  0.752269
2020-03-29 -0.591560 -0.607783  0.390446  0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31  2.107192  1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
2020-04-03 -1.193671  2.798576  3.594377 -0.016369
2020-04-04  1.592835 -0.351965  1.728636  0.547841
                A      B      C      D
2020-03-28   True   True   True   True
2020-03-29  False  False   True   True
2020-03-30  False  False  False  False
2020-03-31   True   True  False  False
2020-04-01  False  False  False  False
2020-04-02  False  False   True  False
2020-04-03  False   True   True  False
2020-04-04   True  False   True   True
dfA = df['A']
print(dfA)
print(dfA>0)
2020-03-28    0.040467
2020-03-29   -0.591560
2020-03-30   -0.201351
2020-03-31    2.107192
2020-04-01   -0.263185
2020-04-02   -0.668462
2020-04-03   -1.193671
2020-04-04    1.592835
Freq: D, Name: A, dtype: float64
2020-03-28     True
2020-03-29    False
2020-03-30    False
2020-03-31     True
2020-04-01    False
2020-04-02    False
2020-04-03    False
2020-04-04     True
Freq: D, Name: A, dtype: bool
print(df)
print(df.loc['2020-03-29'])
print(df.loc[:,df.loc['2020-03-29']>0])
                   A         B         C         D
2020-03-28  0.040467  2.502838  1.750702  0.752269
2020-03-29 -0.591560 -0.607783  0.390446  0.759989
2020-03-30 -0.201351 -1.134350 -0.912988 -0.313645
2020-03-31  2.107192  1.267613 -0.421305 -0.286911
2020-04-01 -0.263185 -0.974481 -0.655299 -0.672898
2020-04-02 -0.668462 -0.668671  0.525188 -0.134835
2020-04-03 -1.193671  2.798576  3.594377 -0.016369
2020-04-04  1.592835 -0.351965  1.728636  0.547841
A   -0.591560
B   -0.607783
C    0.390446
D    0.759989
Name: 2020-03-29 00:00:00, dtype: float64
                   C         D
2020-03-28  1.750702  0.752269
2020-03-29  0.390446  0.759989
2020-03-30 -0.912988 -0.313645
2020-03-31 -0.421305 -0.286911
2020-04-01 -0.655299 -0.672898
2020-04-02  0.525188 -0.134835
2020-04-03  3.594377 -0.016369
2020-04-04  1.728636  0.547841
iloc与loc的差别:
    loc只能通过index和columns中对应的标签值进行索引和切片;
    而iloc按照从0开始的整数位置为行和列编号,因此可以直接通过位置下标
    进行索引和切片。例子如下:
S1 = pd.Series(data=list(range(0,8,2)),index=list('abcd'))
print(S1)
df1 = pd.DataFrame(data=np.random.randn(4,6),index=list('abcd'),columns=list(range(2,14,2)))
print(df1)
a    0
b    2
c    4
d    6
dtype: int64
         2         4         6         8         10        12
a -2.445292  0.048598  0.050947 -0.713184  2.017222 -1.389391
b  1.909918 -1.212520  0.552249  1.115173 -0.024809 -0.192347
c -0.776439  0.877586  0.569017 -1.741527 -0.022756  0.154204
d  0.537282  2.366709 -0.606037  0.860133 -0.707234 -0.297887
print(S1.iloc[1:])
#此方式不行print(S1.loc[1:])
print(df1.iloc[1:4,2:5])
#此方式不行print(df1.loc[1:4,2:5])
print(df1.loc['b':,6:10])  #与上面的iloc[1:4,2:5]等价(loc按标签切片,包含终点标签10)
b    2
c    4
d    6
dtype: int64
         6         8         10
b  0.552249  1.115173 -0.024809
c  0.569017 -1.741527 -0.022756
d -0.606037  0.860133 -0.707234
         6         8         10
b  0.552249  1.115173 -0.024809
c  0.569017 -1.741527 -0.022756
d -0.606037  0.860133 -0.707234

GitHub

https://github.com/luozekun1230/MyPyhonProgram/tree/master/Pandas

上一篇 下一篇

猜你喜欢

热点阅读