pandas使用习惯示例总结续
2019-01-18 本文已影响115人
筝韵徽
import pandas as pd
import numpy as np
pandas使用习惯总结续
df = pd.read_csv('data/sample_data.csv',index_col=0)
foo_s=df[['food','score']]
foo_s
image.png
- 修改food列为steak,lamb的值
foo_s.loc[foo_s['food'].isin(['Steak','Lamb']),'score']=99
foo_s
E:\software\anaconda3\lib\site-packages\pandas\core\indexing.py:543: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self.obj[item] = s
image.png
df
image.png
此中情况最好使用copy
food_score = df[['food','score']].copy()
food_score.loc[food_score['food'].isin(['Steak','Lamb']),'score']=100
food_score
image.png
- .ix都用。iloc,.loc代替
- .query 当clomun的那么有空格时,不能运行
df.query('age > 30')
image.png
df.loc[df['age']>30]
image.png
df2=df.copy()
df2=df2.rename(columns={'food':'favor food'})
df2
image.png
df2.query('favor food =="Steak"')
Traceback (most recent call last):
File "E:\software\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2961, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-19-3be4a208753a>", line 1, in <module>
df2.query('favor food =="Steak"')
File "E:\software\anaconda3\lib\site-packages\pandas\core\frame.py", line 2847, in query
res = self.eval(expr, **kwargs)
File "E:\software\anaconda3\lib\site-packages\pandas\core\frame.py", line 2962, in eval
return _eval(expr, inplace=inplace, **kwargs)
File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\eval.py", line 291, in eval
truediv=truediv)
File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\expr.py", line 739, in __init__
self.terms = self.parse()
File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\expr.py", line 756, in parse
return self._visitor.visit(self.expr)
File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\expr.py", line 317, in visit
raise e
File "E:\software\anaconda3\lib\site-packages\pandas\core\computation\expr.py", line 311, in visit
node = ast.fix_missing_locations(ast.parse(clean))
File "E:\software\anaconda3\lib\ast.py", line 35, in parse
return compile(source, filename, mode, PyCF_ONLY_AST)
File "<unknown>", line 1
favor food =="Steak"
^
SyntaxError: invalid syntax
- 使用+, -, *, /, <, >, <=, >=, ==, != 减少使用dd, sub, mul, div, lt, gt, le, ge, eq, ne,除非要更改默认的axis
college = pd.read_csv('data/college.csv',index_col='INSTNM')
pd.options.display.max_columns = 100
college.head()
image.png
c_ugds=college.loc[:,'UGDS_WHITE':'UGDS_UNKN']
c_ugds.head()
image.png
race_mean=c_ugds.mean()
race_mean.head()
UGDS_WHITE 0.510207
UGDS_BLACK 0.189997
UGDS_HISP 0.161635
UGDS_ASIAN 0.033544
UGDS_AIAN 0.013813
dtype: float64
diff=c_ugds - race_mean
diff.head()
image.png
race_school=c_ugds.min(axis='columns')
race_school.head()
INSTNM
Alabama A & M University 0.0000
University of Alabama at Birmingham 0.0007
Amridge University 0.0000
University of Alabama in Huntsville 0.0002
Alabama State University 0.0006
dtype: float64
(c_ugds - race_school).head()
image.png
看,都是Nan,需要改变axis,如下
c_ugds.sub(race_school,axis='index').head()
image.png
在需要具体制定axis时使用sub,add,gt等
使用DataFrame/Series示例
sum(college['UGDS'])
nan
college['UGDS'].sum()
16200904.0
上例中为什么一个是nan,那是因为该列中有nan值,如果没有nan值,那么结果一样如下
ugds=college['UGDS'].dropna()
ugds.head()
INSTNM
Alabama A & M University 4206.0
University of Alabama at Birmingham 11383.0
Amridge University 291.0
University of Alabama in Huntsville 5451.0
Alabama State University 4811.0
Name: UGDS, dtype: float64
sum(ugds)
16200904.0
ugds.sum()
16200904.0
%timeit sum(ugds)
558 µs ± 54.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit ugds.sum()
285 µs ± 16.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
python内置函数与pandas内置函数,在操作dataframe,series有巨大的性能差距
- apply示例性能对比
%timeit -n 5 c_ugds.apply(lambda x:x.max())
5.96 ms ± 2.77 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)
%timeit -n 5 c_ugds.max()
2.7 ms ± 358 µs per loop (mean ± std. dev. of 7 runs, 5 loops each)
%timeit -n 5 c_ugds.apply(lambda x:x.max(),axis='columns')
1.46 s ± 129 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)
%timeit -n 5 c_ugds.max(axis='columns')
2.87 ms ± 383 µs per loop (mean ± std. dev. of 7 runs, 5 loops each)
后两个性能明显有巨大差异
当没有pandas内置函数能达到需求时,使用apply
earnings_debt = college[['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']]
earnings_debt.head()
image.png
earnings_debt.dtypes
MD_EARN_WNE_P10 object
GRAD_DEBT_MDN_SUPP object
dtype: object
earnings_debt.astype('float')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-49-66a0075d85a0> in <module>()
----> 1 earnings_debt.astype('float')
E:\software\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
176 else:
177 kwargs[new_arg_name] = new_arg_value
--> 178 return func(*args, **kwargs)
179 return wrapper
180 return _deprecate_kwarg
E:\software\anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
4999 # else, only a single dtype is given
5000 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5001 **kwargs)
5002 return self._constructor(new_data).__finalize__(self)
5003
E:\software\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs)
3712
3713 def astype(self, dtype, **kwargs):
-> 3714 return self.apply('astype', dtype=dtype, **kwargs)
3715
3716 def convert(self, **kwargs):
E:\software\anaconda3\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3579
3580 kwargs['mgr'] = self
-> 3581 applied = getattr(b, f)(**kwargs)
3582 result_blocks = _extend_blocks(applied, result_blocks)
3583
E:\software\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs)
573 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
574 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 575 **kwargs)
576
577 def _astype(self, dtype, copy=False, errors='raise', values=None,
E:\software\anaconda3\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
662
663 # _astype_nansafe works fine with 1-d only
--> 664 values = astype_nansafe(values.ravel(), dtype, copy=True)
665 values = values.reshape(self.shape)
666
E:\software\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy)
728
729 if copy:
--> 730 return arr.astype(dtype, copy=True)
731 return arr.view(dtype)
732
ValueError: could not convert string to float: 'PrivacySuppressed'
pd.to_numeric(earnings_debt)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-50-ea781e156878> in <module>()
----> 1 pd.to_numeric(earnings_debt)
E:\software\anaconda3\lib\site-packages\pandas\core\tools\numeric.py in to_numeric(arg, errors, downcast)
118 values = np.array([arg], dtype='O')
119 elif getattr(arg, 'ndim', 1) > 1:
--> 120 raise TypeError('arg must be a list, tuple, 1-d array, or Series')
121 else:
122 values = arg
TypeError: arg must be a list, tuple, 1-d array, or Series
如上述情况,现有函数达不到数据类型转换的效果,使用apply
earnings_debt=earnings_debt.apply(pd.to_numeric,errors='coerce')
earnings_debt.head()
image.png
earnings_debt.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7535 entries, Alabama A & M University to Excel Learning Center-San Antonio South
Data columns (total 2 columns):
MD_EARN_WNE_P10 5591 non-null float64
GRAD_DEBT_MDN_SUPP 5993 non-null float64
dtypes: float64(2)
memory usage: 496.6+ KB
- group & agg示例用法
state=college.groupby('STABBR').agg({'SATMTMID':'max'})
state.head()
image.png
college.groupby('STABBR')['SATMTMID'].agg('max').head()
STABBR
AK 503.0
AL 590.0
AR 600.0
AS NaN
AZ 580.0
Name: SATMTMID, dtype: float64
college.groupby('STABBR')['SATMTMID'].max().head()
STABBR
AK 503.0
AL 590.0
AR 600.0
AS NaN
AZ 580.0
Name: SATMTMID, dtype: float64
college[['STABBR','SATMTMID']].groupby('STABBR').max().head()
image.png
- 多层次索引,搞成单层次,二维表的形式处理,更便捷
col_stats = college.groupby(['STABBR', 'RELAFFIL']) \
.agg({'UGDS': ['min', 'max'],
'SATMTMID': ['median', 'max']})
col_stats.head(10)
image.png
col_stats.columns = ['min ugds', 'max ugds', 'median satmtmid', 'max satmtmid']
col_stats = col_stats.reset_index()
col_stats.head()
image.png
- 在groupby中使用apply会有重大性能缺失,小心,示例
def top5(s):
s=s.sort_values(ascending=False)
tp5_total=s.iloc[:5].sum()
total=s.sum()
return tp5_total/total
college.groupby('STABBR').agg({'UGDS':top5}).head(10)
image.png
%timeit college.groupby('STABBR').agg({'UGDS':top5})
99 ms ± 3.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
- 性能优化 如下不在top5函数中排序
def top5_2(s):
tp5_total=s.iloc[:5].sum()
total=s.sum()
return tp5_total/total
college.sort_values('UGDS',ascending=False).groupby('STABBR').agg({'UGDS':top5_2}).head(10)
image.png
%timeit college.sort_values('UGDS',ascending=False).groupby('STABBR').agg({'UGDS':top5_2})
53.2 ms ± 2.19 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
- 使用pandas内置函数优化 如下
top5=college.sort_values('UGDS',ascending=False).groupby('STABBR').head()
top5
image.png
top5_total=top5.groupby('STABBR').agg({'UGDS':'sum'})
top5_total.head()
image.png
total=college.groupby('STABBR').agg({'UGDS':'sum'})
total.head()
image.png
(top5_total / total).head()
image.png
%%timeit
college_top5 = college.sort_values('UGDS', ascending=False) \
.groupby('STABBR').head()
top5_total = college_top5.groupby('STABBR').agg({'UGDS': 'sum'})
total = college.groupby('STABBR').agg({'UGDS': 'sum'})
top5_total / total
16.2 ms ± 4.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
由53.2 ms减少到了16.2 ms
- melt vs stack 示例
movie = pd.read_csv('data/movie.csv')
movie.head()
image.png
act1 = movie.melt(id_vars=['title'],
value_vars=['actor1', 'actor2', 'actor3'],
var_name='actor number',
value_name='actor name')
act1.head()
image.png
stacked = movie.set_index('title')[['actor1', 'actor2', 'actor3']].stack()
stacked.head()
title
Avatar actor1 CCH Pounder
actor2 Joel David Moore
actor3 Wes Studi
Pirates of the Caribbean: At World's End actor1 Johnny Depp
actor2 Orlando Bloom
dtype: object
stacked.reset_index(name='actor name').head(10)
image.png
act1.pivot(index='title', columns='actor number', values='actor name').head()
image.png
stacked.unstack().head()
image.png
- pivot_table vs groupby then unstack
emp = pd.read_csv('data/employee.csv')
emp.head()
image.png
emp.pivot_table(index='RACE', columns='GENDER', values='BASE_SALARY')
image.png
race_gen_sal = emp.groupby(['RACE', 'GENDER']).agg({'BASE_SALARY': 'mean'})
race_gen_sal
image.png
race_gen_sal.unstack('GENDER')
image.png