Pandas

pandas使用习惯示例总结续

2019-01-18  本文已影响115人  筝韵徽
import pandas as pd
import numpy as np

pandas使用习惯总结续

df = pd.read_csv('data/sample_data.csv',index_col=0)
foo_s=df[['food','score']]
foo_s
image.png
foo_s.loc[foo_s['food'].isin(['Steak','Lamb']),'score']=99
foo_s
E:\software\anaconda3\lib\site-packages\pandas\core\indexing.py:543: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
image.png
df
image.png

此中情况最好使用copy

food_score = df[['food','score']].copy()
food_score.loc[food_score['food'].isin(['Steak','Lamb']),'score']=100
food_score
image.png
df.query('age > 30')
image.png
df.loc[df['age']>30]
image.png
df2=df.copy()
df2=df2.rename(columns={'food':'favor food'})
df2
image.png
df2.query('favor food =="Steak"')
Traceback (most recent call last):


  File "E:\software\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)


  File "<ipython-input-19-3be4a208753a>", line 1, in <module>
    df2.query('favor food =="Steak"')


  File "E:\software\anaconda3\lib\site-packages\pandas\core\frame.py", line 2847, in query
    res = self.eval(expr, **kwargs)


  File "E:\software\anaconda3\lib\site-packages\pandas\core\frame.py", line 2962, in eval
    return _eval(expr, inplace=inplace, **kwargs)


  File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\eval.py", line 291, in eval
    truediv=truediv)


  File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\expr.py", line 739, in __init__
    self.terms = self.parse()


  File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\expr.py", line 756, in parse
    return self._visitor.visit(self.expr)


  File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\expr.py", line 317, in visit
    raise e


  File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\expr.py", line 311, in visit
    node = ast.fix_missing_locations(ast.parse(clean))


  File "E:\software\anaconda3\lib\ast.py", line 35, in parse
    return compile(source, filename, mode, PyCF_ONLY_AST)


  File "<unknown>", line 1
    favor food =="Steak"
             ^
SyntaxError: invalid syntax
college = pd.read_csv('data/college.csv',index_col='INSTNM')
pd.options.display.max_columns = 100
college.head()
image.png
c_ugds=college.loc[:,'UGDS_WHITE':'UGDS_UNKN']
c_ugds.head()
image.png
race_mean=c_ugds.mean()
race_mean.head()
UGDS_WHITE    0.510207
UGDS_BLACK    0.189997
UGDS_HISP     0.161635
UGDS_ASIAN    0.033544
UGDS_AIAN     0.013813
dtype: float64
diff=c_ugds - race_mean
diff.head()
image.png
race_school=c_ugds.min(axis='columns')
race_school.head()
INSTNM
Alabama A & M University               0.0000
University of Alabama at Birmingham    0.0007
Amridge University                     0.0000
University of Alabama in Huntsville    0.0002
Alabama State University               0.0006
dtype: float64
(c_ugds - race_school).head()
image.png

看,都是Nan,需要改变axis,如下

c_ugds.sub(race_school,axis='index').head()
image.png

在需要具体制定axis时使用sub,add,gt等
使用DataFrame/Series示例

sum(college['UGDS'])
nan
college['UGDS'].sum()
16200904.0

上例中为什么一个是nan,那是因为该列中有nan值,如果没有nan值,那么结果一样如下

ugds=college['UGDS'].dropna()
ugds.head()
INSTNM
Alabama A & M University                4206.0
University of Alabama at Birmingham    11383.0
Amridge University                       291.0
University of Alabama in Huntsville     5451.0
Alabama State University                4811.0
Name: UGDS, dtype: float64
sum(ugds)
16200904.0
ugds.sum()
16200904.0
%timeit sum(ugds)
558 µs ± 54.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit ugds.sum()
285 µs ± 16.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

python内置函数与pandas内置函数,在操作dataframe,series有巨大的性能差距

%timeit -n 5 c_ugds.apply(lambda x:x.max())
5.96 ms ± 2.77 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)
%timeit -n 5 c_ugds.max()
2.7 ms ± 358 µs per loop (mean ± std. dev. of 7 runs, 5 loops each)
%timeit -n 5 c_ugds.apply(lambda x:x.max(),axis='columns')
1.46 s ± 129 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)
%timeit -n 5 c_ugds.max(axis='columns')
2.87 ms ± 383 µs per loop (mean ± std. dev. of 7 runs, 5 loops each)

后两个性能明显有巨大差异
当没有pandas内置函数能达到需求时,使用apply

earnings_debt = college[['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']]
earnings_debt.head()
image.png
earnings_debt.dtypes
MD_EARN_WNE_P10       object
GRAD_DEBT_MDN_SUPP    object
dtype: object
earnings_debt.astype('float')
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-49-66a0075d85a0> in <module>()
----> 1 earnings_debt.astype('float')


E:\software\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    176                 else:
    177                     kwargs[new_arg_name] = new_arg_value
--> 178             return func(*args, **kwargs)
    179         return wrapper
    180     return _deprecate_kwarg


E:\software\anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
   4999             # else, only a single dtype is given
   5000             new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5001                                          **kwargs)
   5002             return self._constructor(new_data).__finalize__(self)
   5003 


E:\software\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs)
   3712 
   3713     def astype(self, dtype, **kwargs):
-> 3714         return self.apply('astype', dtype=dtype, **kwargs)
   3715 
   3716     def convert(self, **kwargs):


E:\software\anaconda3\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
   3579 
   3580             kwargs['mgr'] = self
-> 3581             applied = getattr(b, f)(**kwargs)
   3582             result_blocks = _extend_blocks(applied, result_blocks)
   3583 


E:\software\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs)
    573     def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
    574         return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 575                             **kwargs)
    576 
    577     def _astype(self, dtype, copy=False, errors='raise', values=None,


E:\software\anaconda3\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
    662 
    663                 # _astype_nansafe works fine with 1-d only
--> 664                 values = astype_nansafe(values.ravel(), dtype, copy=True)
    665                 values = values.reshape(self.shape)
    666 


E:\software\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy)
    728 
    729     if copy:
--> 730         return arr.astype(dtype, copy=True)
    731     return arr.view(dtype)
    732 


ValueError: could not convert string to float: 'PrivacySuppressed'
pd.to_numeric(earnings_debt)
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-50-ea781e156878> in <module>()
----> 1 pd.to_numeric(earnings_debt)


E:\software\anaconda3\lib\site-packages\pandas\core\tools\numeric.py in to_numeric(arg, errors, downcast)
    118         values = np.array([arg], dtype='O')
    119     elif getattr(arg, 'ndim', 1) > 1:
--> 120         raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    121     else:
    122         values = arg


TypeError: arg must be a list, tuple, 1-d array, or Series

如上述情况,现有函数达不到数据类型转换的效果,使用apply

earnings_debt=earnings_debt.apply(pd.to_numeric,errors='coerce')
earnings_debt.head()
image.png
earnings_debt.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7535 entries, Alabama A & M University to Excel Learning Center-San Antonio South
Data columns (total 2 columns):
MD_EARN_WNE_P10       5591 non-null float64
GRAD_DEBT_MDN_SUPP    5993 non-null float64
dtypes: float64(2)
memory usage: 496.6+ KB
state=college.groupby('STABBR').agg({'SATMTMID':'max'})
state.head()
image.png
college.groupby('STABBR')['SATMTMID'].agg('max').head()
STABBR
AK    503.0
AL    590.0
AR    600.0
AS      NaN
AZ    580.0
Name: SATMTMID, dtype: float64
college.groupby('STABBR')['SATMTMID'].max().head()
STABBR
AK    503.0
AL    590.0
AR    600.0
AS      NaN
AZ    580.0
Name: SATMTMID, dtype: float64
college[['STABBR','SATMTMID']].groupby('STABBR').max().head()
image.png
col_stats = college.groupby(['STABBR', 'RELAFFIL']) \
                   .agg({'UGDS': ['min', 'max'], 
                        'SATMTMID': ['median', 'max']})
col_stats.head(10)
image.png
col_stats.columns = ['min ugds', 'max ugds', 'median satmtmid', 'max satmtmid']
col_stats = col_stats.reset_index()
col_stats.head()
image.png
def top5(s):
    s=s.sort_values(ascending=False)
    tp5_total=s.iloc[:5].sum()
    total=s.sum()
    return tp5_total/total
college.groupby('STABBR').agg({'UGDS':top5}).head(10)
image.png
%timeit college.groupby('STABBR').agg({'UGDS':top5})
99 ms ± 3.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
def top5_2(s):
    tp5_total=s.iloc[:5].sum()
    total=s.sum()
    return tp5_total/total
college.sort_values('UGDS',ascending=False).groupby('STABBR').agg({'UGDS':top5_2}).head(10)
image.png
%timeit college.sort_values('UGDS',ascending=False).groupby('STABBR').agg({'UGDS':top5_2})
53.2 ms ± 2.19 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
top5=college.sort_values('UGDS',ascending=False).groupby('STABBR').head()
top5
image.png
top5_total=top5.groupby('STABBR').agg({'UGDS':'sum'})
top5_total.head()
image.png
total=college.groupby('STABBR').agg({'UGDS':'sum'})
total.head()
image.png
(top5_total / total).head()
image.png
%%timeit
college_top5 = college.sort_values('UGDS', ascending=False) \
                      .groupby('STABBR').head()
top5_total = college_top5.groupby('STABBR').agg({'UGDS': 'sum'})
total = college.groupby('STABBR').agg({'UGDS': 'sum'})
top5_total / total
16.2 ms ± 4.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

由53.2 ms减少到了16.2 ms

movie = pd.read_csv('data/movie.csv')
movie.head()
image.png
act1 = movie.melt(id_vars=['title'], 
                  value_vars=['actor1', 'actor2', 'actor3'], 
                  var_name='actor number',
                  value_name='actor name')
act1.head()
image.png
stacked = movie.set_index('title')[['actor1', 'actor2', 'actor3']].stack()
stacked.head()
title                                           
Avatar                                    actor1         CCH Pounder
                                          actor2    Joel David Moore
                                          actor3           Wes Studi
Pirates of the Caribbean: At World's End  actor1         Johnny Depp
                                          actor2       Orlando Bloom
dtype: object
stacked.reset_index(name='actor name').head(10)
image.png
act1.pivot(index='title', columns='actor number', values='actor name').head()
image.png
stacked.unstack().head()
image.png
emp = pd.read_csv('data/employee.csv')
emp.head()
image.png
emp.pivot_table(index='RACE', columns='GENDER', values='BASE_SALARY')
image.png
race_gen_sal = emp.groupby(['RACE', 'GENDER']).agg({'BASE_SALARY': 'mean'})
race_gen_sal
image.png
race_gen_sal.unstack('GENDER')
image.png




上一篇 下一篇

猜你喜欢

热点阅读