Pandas学习笔记1

2019-04-20  本文已影响0人  山雾幻华
import pandas as pd
import numpy as np
s = pd.Series([1,3,6,np,np.nan,44,1])
dates = pd.date_range('20160101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
df2 = pd.DataFrame({"A":1.,"B":pd.Categorical(["test",'tesst'])})
print('S',s)
print('data',dates)
print('df',df)
print('df1',df1)
print('df1',df2)
S 0                                                    1
1                                                    3
2                                                    6
3    <module 'numpy' from 'E:\\program\\Anaconda3\\...
4                                                  NaN
5                                                   44
6                                                    1
dtype: object
data DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')
df                    a         b         c         d
2016-01-01 -1.018450  0.116236  0.171505 -0.989516
2016-01-02  0.608064 -0.523510  0.335354 -0.895313
2016-01-03  2.804357 -0.105679 -0.821816 -0.340312
2016-01-04  1.239038 -1.592973 -0.060665 -1.785517
2016-01-05 -0.307189 -1.255811 -0.420571  1.545740
2016-01-06 -1.469857  2.646400  0.101558  0.438751
df1    0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
df1      A      B
0  1.0   test
1  1.0  tesst
print("df2.dtypes",df2.dtypes)
print("df2.index",df2.index)
print("df2.columns",df2.columns)
print("df2.values",df2.values)
print("df2.describe",df2.describe())
print("df2.T",df2.T)
print("df2.sort_index",df2.sort_index(axis=1,ascending=False))
print("df2.sort_index",df2.sort_values(by='B'))
df2.dtypes A     float64
B    category
dtype: object
df2.index RangeIndex(start=0, stop=2, step=1)
df2.columns Index(['A', 'B'], dtype='object')
df2.values [[1.0 'test']
 [1.0 'tesst']]
df2.describe          A
count  2.0
mean   1.0
std    0.0
min    1.0
25%    1.0
50%    1.0
75%    1.0
max    1.0
df2.T       0      1
A     1      1
B  test  tesst
df2.sort_index        B    A
0   test  1.0
1  tesst  1.0
df2.sort_index      A      B
1  1.0  tesst
0  1.0   test

选择数据

dates=pd.date_range('20130101',periods=6)
df=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)
print(df['A'],df.A)
print(df[0:3],'\n\n',df["20130102":"20130104"])
# select by label:loc
print(df.loc['20130102'])
print(df.loc[:,["A","B"]])
# select by position:iloc
print(df.iloc[3,1])
print(df.iloc[3:5,1])
# mixed selection:ix
print(df.ix[:3,['A','C']])
#Boolean indexing
print(df[df.A>8])
             A   B   C   D
2013-01-01   0   1   2   3
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
2013-01-01     0
2013-01-02     4
2013-01-03     8
2013-01-04    12
2013-01-05    16
2013-01-06    20
Freq: D, Name: A, dtype: int32 2013-01-01     0
2013-01-02     4
2013-01-03     8
2013-01-04    12
2013-01-05    16
2013-01-06    20
Freq: D, Name: A, dtype: int32
            A  B   C   D
2013-01-01  0  1   2   3
2013-01-02  4  5   6   7
2013-01-03  8  9  10  11 

              A   B   C   D
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
A    4
B    5
C    6
D    7
Name: 2013-01-02 00:00:00, dtype: int32
             A   B
2013-01-01   0   1
2013-01-02   4   5
2013-01-03   8   9
2013-01-04  12  13
2013-01-05  16  17
2013-01-06  20  21
13
2013-01-04    13
2013-01-05    17
Freq: D, Name: B, dtype: int32
            A   C
2013-01-01  0   2
2013-01-02  4   6
2013-01-03  8  10
             A   B   C   D
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23


E:\program\Anaconda3\lib\site-packages\ipykernel_launcher.py:13: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]

设置值

df.iloc[2,2] = 11111
df['Z'] = 0
df["E"]=pd.Series([1,2,3,4,5,6],index=pd.date_range('20130101',periods=6))
print(df)

             A   B      C   D  Z  E
2013-01-01   0   1      2   3  0  1
2013-01-02   4   5      6   7  0  2
2013-01-03   8   9  11111  11  0  3
2013-01-04  12  13     14  15  0  4
2013-01-05  16  17     18  19  0  5
2013-01-06  20  21     22  23  0  6

处理丢失值

dates=pd. date_range('20130101', periods=6)
df=pd. DataFrame(np. arange(24). reshape((6,4)), index=dates, columns=['A','B','C','D'])
df.iloc[0,1]=np. nan
df.iloc[1,2]=np. nan 
print(df)
print(df.dropna())#丢掉行
print(df.dropna(axis=0,how='any'))#丢掉行how={fany','all}

print(df.fillna(value=0))#填充0

print(np.any(df.isna()) == True)#判断有无null
             A     B     C   D
2013-01-01   0   NaN   2.0   3
2013-01-02   4   5.0   NaN   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
             A     B     C   D
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
             A     B     C   D
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
             A     B     C   D
2013-01-01   0   0.0   2.0   3
2013-01-02   4   5.0   0.0   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
True

导入导出

read_csv
read_excel
read_hdf
read_sql
read_json
read_msgpack(experimental)
read_html
read_gbg(experimental)
read_stata
read_sas
read_clijpboard
read_pickle

to_csv
to_excel
to_hdf
to_sql
to_json
to_msgpack(experimental)
to_html
to_gbg(experimental)
to_stata
to_sas
to_clijpboard
to_pickle

合并

# concatenating 
df1=pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2=pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print(res)

# join,['inner','outer']
df1=pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2=pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'])
res1 = pd.concat([df1,df2],join='outer',ignore_index=True)
res2 = pd.concat([df1,df2],join='inner',ignore_index=True)
print(res1,res2)

# join_axes 
dfl=pd. DataFrame(np. ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2=pd. DataFrame(np. ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
res=pd. concat([ df1, df2], axis=1, join_axes=[ df1. index])
print(res)

# append 
df1=pd. DataFrame(np. ones((3,4))*0, columns=['a','b','c','d'])
df2=pd. DataFrame(np. ones((3,4))*1, columns=['a','b','c','d'])
df3=pd. DataFrame(np. ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
res=dfl. append(df2, ignore_index=True)
res=dfl. append([ df2, df3])
print(res)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0


E:\program\Anaconda3\lib\site-packages\ipykernel_launcher.py:11: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  # This is added back by InteractiveShellApp.init_path()


     a    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
5  NaN  1.0  1.0  1.0  1.0      b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0
     a    b    c    d    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
0  1.0  1.0  1.0  1.0  NaN
1  1.0  1.0  1.0  1.0  NaN
2  1.0  1.0  1.0  1.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0


E:\program\Anaconda3\lib\site-packages\pandas\core\frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  sort=sort)
# merging two df by key/keys.(may be used in database)
# simple example 
left=pd. DataFrame({"key":['KO','K1','K2','K3'],
                    'A':['A0','A1','A2','A3'], 
                    'B':['BO','B1','B2','B3']})
right=pd. DataFrame({"key":['K0','K1','K2','K3'],
                     'C':['CO','C1','C2','C3'],
                     'D':['DO','D1','D2','D3']})
print(left,right)

res = pd.merge(left,right,on='key')
print(res)

# consider two keys
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                             'key2': ['K0', 'K1', 'K0', 'K1'],
                             'A': ['A0', 'A1', 'A2', 'A3'],
                             'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                              'key2': ['K0', 'K0', 'K0', 'K0'],
                              'C': ['C0', 'C1', 'C2', 'C3'],
                              'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')  # default for how='inner'
print(res)
# how = ['left', 'right', 'outer', 'inner']
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)

# indicator
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
print(df2)
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
# give the indicator a custom name
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')

# merged by index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                                  'B': ['B0', 'B1', 'B2']},
                                  index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                                     'D': ['D0', 'D2', 'D3']},
                                      index=['K0', 'K2', 'K3'])
print(left)
print(right)
# left_index and right_index
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)

# handle overlapping
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)

# join function in pandas is similar with merge. If know merge, you will understand join
  key   A   B
0  KO  A0  BO
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3   key   C   D
0  K0  CO  DO
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
  key   A   B   C   D
0  K1  A1  B1  C1  D1
1  K2  A2  B2  C2  D2
2  K3  A3  B3  C3  D3
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
   col1 col_left
0     0        a
1     1        b
   col1  col_right
0     1          2
1     2          2
2     2          2
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2
    k  age_boy  age_girl
0  K0        1         4
1  K0        1         5

plt画图

import matplotlib.pyplot as plt
# plot data

# Series
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
##data.plot()

# DataFrame
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', scatter', hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)

plt.show() 

参考

https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/

上一篇下一篇

猜你喜欢

热点阅读