Some simple data analysis with pandas
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import string
import pandas_datareader.data as web  # needed for web.get_data_yahoo below
from scipy.stats import percentileofscore
from datetime import *
pd.options.display.max_columns = 777
# Create a PeriodIndex from the year and quarter columns
data = pd.read_csv(r'pydata-notebook-master\examples\macrodata.csv')
index = pd.PeriodIndex(year=data['year'], quarter=data['quarter'], freq='Q-DEC')
data.index = index
rng = pd.date_range('2019/1/1', periods=100, freq='D')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts = ts.resample('M', kind='period').mean()
# print(ts[:5])
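# Sketch (not in the original): with kind='period' the resampled index is a monthly
# PeriodIndex rather than month-end timestamps, which a quick check would show.
# print(ts.index[:3])  # e.g. PeriodIndex(['2019-01', '2019-02', '2019-03'], dtype='period[M]')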
''' Downsampling '''
new = pd.date_range('2019/1/1', freq='M', periods=12)
ps = pd.Series(np.arange(len(new)), index=new)
pv = pd.Series(np.arange(len(new)), index=new)
# ohlc: the four values open, high, low, close
# ps = ps.resample('5min', closed='right', label='right').ohlc()
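# OHLC sketch (assumption: as in the textbook example, OHLC bars are meant for
# downsampling an intraday series, so a hypothetical minute-frequency series is used here):
# minute_ts = pd.Series(np.arange(12), index=pd.date_range('2019/1/1', periods=12, freq='T'))
# minute_ts.resample('5min', closed='right', label='right').ohlc()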
# Grouped sampling
ps = ps.groupby(lambda x: x.weekday()).sum()
pv = pv.groupby(lambda x: x.month).sum()
# Upsampling
frame = pd.DataFrame(np.random.randn(2, 4), index=pd.date_range('2019/3/1', freq='W-WED', periods=2))
fr = frame.resample('D').ffill(limit=2)
tr = frame.resample('W-THU').sum()
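# Upsampling sketch (illustration only): asfreq() leaves the new daily slots as NaN,
# whereas the ffill(limit=2) above carries each weekly value forward at most two days.
# frame.resample('D').asfreq()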
# Resampling with periods
ftime = pd.DataFrame(np.random.randn(24, 4),
                     index=pd.period_range('2018/1', '2019/12', freq='M'),
                     columns=['Colorado', 'Texas', 'New York', 'Ohio']
                     )
ann_ftime = ftime.resample('A-DEC').mean()
q_ftime = ann_ftime.resample('Q-DEC', convention='end').ffill()
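# Convention sketch (illustration only): convention controls which sub-period each
# annual value lands on; 'start' places it on Q1, 'end' (used above) on Q4.
# ann_ftime.resample('Q-DEC', convention='start').ffill()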
print('--------------------------------')
close_px_all = pd.read_csv(r'pydata-notebook-master\examples\stock_px.csv', index_col=0, parse_dates=True)
close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]
# close_px.index = close_px_all.index
cls = close_px.resample('B').ffill()
# cls.loc['1990'].plot(color='orange', title='Yahoo Finance', alpha=0.5)
# cls[['AAPL', 'MSFT']].loc['1990':'1997'].plot(color='red', alpha=0.6)
# aapl_px = close_px['AAPL'].resample('Q-DEC').ffill()
# aapl_px.loc['1990':'2000'].plot(color='blue', alpha=0.7)
# close_px['AAPL'].rolling(250).mean().plot()
# close_px['AAPL'].loc['2000'].rolling(3).mean().plot()
# close_px.rolling(40).mean().plot(logy=True)
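# Rolling-window sketch (not in the original): rolling(40).mean() stays NaN until 40
# observations have accumulated; min_periods relaxes that warm-up requirement.
# close_px.rolling(40, min_periods=10).mean().plot(logy=True)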
# Exponentially weighted calculations
fig, axes = plt.subplots(2, 1, figsize=(12, 7))
aapl_px = close_px['AAPL'].loc['2000':'2005']
ma60 = aapl_px.rolling(60, min_periods=50).mean()
ewma60 = aapl_px.ewm(span=60).mean()
# print(ewma60[:3])
aapl_px.plot(style='-', ax=axes[0])
ma60.plot(style='--', ax=axes[0])
aapl_px.plot(style='-', ax=axes[1])
ewma60.plot(style='--', ax=axes[1])
axes[0].set_title('Simple MA')
axes[1].set_title('Exponentially-weighted MA')
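# EWMA span sketch (illustration only): span=60 corresponds to a smoothing factor
# alpha = 2 / (span + 1), so the same curve can be written with an explicit alpha.
# ewma60_alt = aapl_px.ewm(alpha=2 / (60 + 1)).mean()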
spx = close_px_all['SPX']
spx_rets = spx / spx.shift(1) - 1
returns = close_px.pct_change()
# min_periods: minimum number of observations required in the window
corr = returns['AAPL'].rolling(125, min_periods=100).corr(spx_rets)
# Compare several columns against SPX at once
spx_corr = returns.rolling(125, min_periods=100).corr(spx_rets)
# corr.plot()
# spx_corr.plot()
'''
# Percentile rank of a 2% return
score_at = lambda x: percentileofscore(x, 0.02)
result = returns['AAPL'].rolling(250).apply(score_at)
'''
# result.plot()
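# percentileofscore sketch (made-up numbers): for a window x, percentileofscore(x, 0.02)
# gives the percentage of values in x at or below 2%; e.g.
# percentileofscore([-0.01, 0.0, 0.01, 0.03], 0.02) returns 75.0.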
plt.xlabel('Time Line')
plt.xticks(rotation=90, fontsize=8)
plt.ylabel('Number Information')
# plt.title('Yahoo Finance')
plt.tight_layout()
frame = pd.DataFrame(np.random.randn(5, 4), index=pd.date_range('2019/4/1', freq='D', periods=5))
f1 = pd.DataFrame(frame[0], index=frame.index)
f2 = pd.DataFrame(frame[1], index=frame.index.shift(1))
f3 = f2.reindex(f1.index, method='ffill')
infl = pd.date_range('2019/1/1 09:30', '2019/1/1 15:50', freq='T')
rng = infl.append([infl + pd.offsets.BDay(i) for i in range(1, 4)])
tsrng = pd.Series(np.arange(len(rng)), index=rng)
# print(tsrng[time(10,0)])
indexer = np.sort(np.random.permutation(len(tsrng))[700:])
irr_ts = tsrng.copy()
irr_ts.iloc[indexer] = np.nan
print(irr_ts.at_time(time(10, 0)))
select_time = pd.date_range('2019/1/1 10:00', freq='B', periods=4)
irr_sel = irr_ts.asof(select_time)
print(irr_sel)
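# asof sketch: for each timestamp in select_time, asof returns the last non-NaN value
# observed at or before that moment, which is how the NaN gaps inserted above are skipped.
# irr_ts.asof(pd.Timestamp('2019/1/2 10:00'))  # scalar form for a single timestamp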
print(np.ones((3,4))*2)
# Computing a return index and cumulative returns
price = web.get_data_yahoo('AAPL', '2018-01-01')['Adj Close']
# Cumulative percentage return between two dates
price_adj = price['2018-10-01'] / price['2018-03-01'] - 1
# Return index, method 1: percentage change from one observation to the next
returns = price.pct_change()
# Return index via cumprod
re_index = (1 + returns).cumprod()
re_index.iloc[0] = 1
# Period returns derived from the return index
m_returns = re_index.resample('BM').last().pct_change()
# Method 2
m_ret = (1 + returns).resample('M', kind='period').prod() - 1
print(m_ret)
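# Compounding sketch (made-up numbers): (1 + returns).cumprod() chains per-period
# returns, e.g. returns of 1% then 2% give a final index of 1.01 * 1.02 = 1.0302.
# (1 + pd.Series([0.01, 0.02])).cumprod()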
# Grouped data analysis
N = 1000
# Build a random uppercase string of length n
def rand(n):
    choices = string.ascii_uppercase
    return ''.join([random.choice(choices) for _ in range(n)])
# Generate N random ticker strings
tickers = np.array([rand(5) for _ in range(N)])
M = 500
df = pd.DataFrame({'Momentum': np.random.randn(M) / 200 - 0.2,
                   'Value': np.random.randn(M) / 200 + 0.21,
                   'Short': np.random.randn(M) / 200 - 0.12,
                   }, index=tickers[:M])
ids_names = np.array(['FINANCIAL', 'TECH'])
# Generate N random industry picks
sampler = np.random.randint(0, len(ids_names), N)
# Build a Series aligned with tickers
industries = pd.Series(ids_names[sampler], index=tickers, name='industry')
# Grouping, method 1
# df['bygroup'] = industries
# by_indus = df.groupby('bygroup')
# Grouping, method 2
by_indus = df.groupby(industries)
def zscore(group):
    return (group - group.mean()) / group.std()
# by_zscore = by_indus.apply(zscore)
# zs_group = by_zscore.groupby(industries).agg(['mean', 'std'])
# Rank within each industry (average ranking), then aggregate
id_rank = by_indus.rank(ascending=False)
rank_group = id_rank.groupby(industries).agg(['min', 'max'])
ran_std = by_indus.apply(lambda x: zscore(x.rank()))
print(ran_std)
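# zscore sanity-check sketch (illustration only): after by_indus.apply(zscore), every
# column within each industry has mean ~0 and std ~1, which the commented-out
# zs_group aggregation above would confirm.
# by_indus.apply(zscore).groupby(industries).agg(['mean', 'std'])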
The data files used in this analysis:
Link: https://pan.baidu.com/s/11ryc7VoD3VbtDMl6-kkvig
Extraction code: 1n0e