金融时间序列处理
太忙,好久不写笔记了。这两天有空,把该整理的好好整理一下。
一、datetime库
from datetime import datetime
now = datetime.now()
print(now)
print(type(now))
2017-06-07 23:13:03.505630
<class 'datetime.datetime'>
diff = datetime(2017, 3, 4, 17) - datetime(2017, 2, 18, 15)
print(type(diff))
print(diff)
print('经历了{}天, {}秒。'.format(diff.days, diff.seconds))
<class 'datetime.timedelta'>
14 days, 2:00:00
经历了14天, 7200秒。
str >> datetime
# strptime
dt_str = '2017-02-18'
dt_obj2 = datetime.strptime(dt_str, '%Y-%m-%d')
print(type(dt_obj2))
print(dt_obj2)
<class 'datetime.datetime'>
2017-02-18 00:00:00
时间戳解析的用法:
# dateutil.parser.parse
from dateutil.parser import parse
dt_str2 = '18-02-2017'
dt_obj3 = parse(dt_str2)
print(type(dt_obj3))
print(dt_obj3)
<class 'datetime.datetime'>
2017-02-18 00:00:00
pandas的时间戳:
# pd.to_datetime
import pandas as pd
s_obj = pd.Series(['2017/02/18', '2017/02/19', '2017-02-25', '2017-02-26'], name='course_time')
print(s_obj)
0 2017/02/18
1 2017/02/19
2 2017-02-25
3 2017-02-26
Name: course_time, dtype: object
s_obj2 = pd.to_datetime(s_obj)
print(s_obj2)
0 2017-02-18
1 2017-02-19
2 2017-02-25
3 2017-02-26
Name: course_time, dtype: datetime64[ns]
# 处理缺失值
s_obj3 = pd.Series(['2017/02/18', '2017/02/19', '2017-02-25', '2017-02-26'] + [None],
name='course_time')
print(s_obj3)
0 2017/02/18
1 2017/02/19
2 2017-02-25
3 2017-02-26
4 None
Name: course_time, dtype: object
s_obj4 = pd.to_datetime(s_obj3)
print(s_obj4) # NaT -> Not a Time
0 2017-02-18
1 2017-02-19
2 2017-02-25
3 2017-02-26
4 NaT
Name: course_time, dtype: datetime64[ns]
二、Pandas时间序列
创建
from datetime import datetime
import pandas as pd
import numpy as np
# 指定index为datetime的list
date_list = [datetime(2017, 2, 18), datetime(2017, 2, 19),
datetime(2017, 2, 25), datetime(2017, 2, 26),
datetime(2017, 3, 4), datetime(2017, 3, 5)]
time_s = pd.Series(np.random.randn(6), index=date_list)
print(time_s)
print(type(time_s.index))
2017-02-18 -0.230989
2017-02-19 -0.398082
2017-02-25 -0.309926
2017-02-26 -0.179672
2017-03-04 0.942698
2017-03-05 1.053092
dtype: float64
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
# pd.date_range()
dates = pd.date_range('2017-02-18', # 起始日期
periods=5, # 生成的时间点个数
freq='W-SAT') # 频率
print(dates)
print(pd.Series(np.random.randn(5), index=dates))
#从2017-02-18开始每周六的数据,连续五周
DatetimeIndex(['2017-02-18', '2017-02-25', '2017-03-04', '2017-03-11',
'2017-03-18'],
dtype='datetime64[ns]', freq='W-SAT')
2017-02-18 -1.680280
2017-02-25 0.908664
2017-03-04 0.145318
2017-03-11 -2.940363
2017-03-18 0.152681
Freq: W-SAT, dtype: float64
索引
太简单就不抄文档了。
过滤
time_s
2017-02-18 -0.230989
2017-02-19 -0.398082
2017-02-25 -0.309926
2017-02-26 -0.179672
2017-03-04 0.942698
2017-03-05 1.053092
dtype: float64
time_s.truncate(before='2017-2-25')
2017-02-25 -0.309926
2017-02-26 -0.179672
2017-03-04 0.942698
2017-03-05 1.053092
dtype: float64
time_s.truncate(after='2017-2-25')
2017-02-18 -0.230989
2017-02-19 -0.398082
2017-02-25 -0.309926
dtype: float64
生成日期范围
# 传入开始、结束日期,默认按天(freq='D')生成该时间段内的时间点
date_index = pd.date_range('2017/02/18', '2017/03/18')
print(date_index)
DatetimeIndex(['2017-02-18', '2017-02-19', '2017-02-20', '2017-02-21',
'2017-02-22', '2017-02-23', '2017-02-24', '2017-02-25',
'2017-02-26', '2017-02-27', '2017-02-28', '2017-03-01',
'2017-03-02', '2017-03-03', '2017-03-04', '2017-03-05',
'2017-03-06', '2017-03-07', '2017-03-08', '2017-03-09',
'2017-03-10', '2017-03-11', '2017-03-12', '2017-03-13',
'2017-03-14', '2017-03-15', '2017-03-16', '2017-03-17',
'2017-03-18'],
dtype='datetime64[ns]', freq='D')
# 只传入开始或结束日期时,还需要通过 periods 指定要生成的时间点个数
print(pd.date_range(start='2017/02/18', periods=10, freq='4D'))
DatetimeIndex(['2017-02-18', '2017-02-22', '2017-02-26', '2017-03-02',
'2017-03-06', '2017-03-10', '2017-03-14', '2017-03-18',
'2017-03-22', '2017-03-26'],
dtype='datetime64[ns]', freq='4D')
print(pd.date_range(end='2017/03/18', periods=10))
DatetimeIndex(['2017-03-09', '2017-03-10', '2017-03-11', '2017-03-12',
'2017-03-13', '2017-03-14', '2017-03-15', '2017-03-16',
'2017-03-17', '2017-03-18'],
dtype='datetime64[ns]', freq='D')
# 规范化时间戳
print(pd.date_range(start='2017/02/18 12:13:14', periods=10))
print(pd.date_range(start='2017/02/18 12:13:14', periods=10, normalize=True))
DatetimeIndex(['2017-02-18 12:13:14', '2017-02-19 12:13:14',
'2017-02-20 12:13:14', '2017-02-21 12:13:14',
'2017-02-22 12:13:14', '2017-02-23 12:13:14',
'2017-02-24 12:13:14', '2017-02-25 12:13:14',
'2017-02-26 12:13:14', '2017-02-27 12:13:14'],
dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2017-02-18', '2017-02-19', '2017-02-20', '2017-02-21',
'2017-02-22', '2017-02-23', '2017-02-24', '2017-02-25',
'2017-02-26', '2017-02-27'],
dtype='datetime64[ns]', freq='D')
频率与偏移量
print(pd.date_range('2017/02/18', '2017/03/18', freq='2D'))
DatetimeIndex(['2017-02-18', '2017-02-20', '2017-02-22', '2017-02-24',
'2017-02-26', '2017-02-28', '2017-03-02', '2017-03-04',
'2017-03-06', '2017-03-08', '2017-03-10', '2017-03-12',
'2017-03-14', '2017-03-16', '2017-03-18'],
dtype='datetime64[ns]', freq='2D')
# 偏移量通过加法连接
sum_offset = pd.tseries.offsets.Week(2) + pd.tseries.offsets.Hour(12)
print(sum_offset)
print(pd.date_range('2017/02/18', '2017/03/18', freq=sum_offset))
14 days 12:00:00
DatetimeIndex(['2017-02-18 00:00:00', '2017-03-04 12:00:00'], dtype='datetime64[ns]', freq='348H')
移动数据
ts = pd.Series(np.random.randn(5), index=pd.date_range('20170218', periods=5, freq='W-SAT'))
print(ts)
2017-02-18 -0.208622
2017-02-25 0.616093
2017-03-04 -0.424725
2017-03-11 -0.361475
2017-03-18 0.761274
Freq: W-SAT, dtype: float64
print(ts.shift(1))
#print(ts.shift(-1))
2017-02-18 NaN
2017-02-25 -0.208622
2017-03-04 0.616093
2017-03-11 -0.424725
2017-03-18 -0.361475
Freq: W-SAT, dtype: float64
三、重采样
import pandas as pd
import numpy as np
date_rng = pd.date_range('20170101', periods=100, freq='D')
ser_obj = pd.Series(range(len(date_rng)), index=date_rng)
print(ser_obj.head(10))
2017-01-01 0
2017-01-02 1
2017-01-03 2
2017-01-04 3
2017-01-05 4
2017-01-06 5
2017-01-07 6
2017-01-08 7
2017-01-09 8
2017-01-10 9
Freq: D, dtype: int32
# 统计每个月的数据总和
resample_month_sum = ser_obj.resample('M').sum()
# 统计每个月的数据平均
resample_month_mean = ser_obj.resample('M').mean()
print('按月求和:', resample_month_sum)
print('按月求均值:', resample_month_mean)
按月求和: 2017-01-31 465
2017-02-28 1246
2017-03-31 2294
2017-04-30 945
Freq: M, dtype: int32
按月求均值: 2017-01-31 15.0
2017-02-28 44.5
2017-03-31 74.0
2017-04-30 94.5
Freq: M, dtype: float64
降采样
five_day_sum_sample = ser_obj.resample('5D').sum()
five_day_mean_sample = ser_obj.resample('5D').mean()
five_day_ohlc_sample = ser_obj.resample('5D').ohlc()
print('降采样,sum')
print(five_day_sum_sample)
降采样,sum
2017-01-01 10
2017-01-06 35
2017-01-11 60
2017-01-16 85
2017-01-21 110
2017-01-26 135
2017-01-31 160
2017-02-05 185
2017-02-10 210
2017-02-15 235
2017-02-20 260
2017-02-25 285
2017-03-02 310
2017-03-07 335
2017-03-12 360
2017-03-17 385
2017-03-22 410
2017-03-27 435
2017-04-01 460
2017-04-06 485
Freq: 5D, dtype: int32