作业-用t分布求置信区间
2017-08-14 本文已影响148人
pnjoe
本课作业
- 基本作业
- 用 t 分布求房屋平均面积的 95% 置信区间,数据为 house_size.csv
import scipy.stats
import numpy as np
import pandas as pd
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
# Load the sample. The CSV has no header row, so pandas assigns integer
# column labels; the first column is taken as the house-size series.
data = pd.read_csv("house_size.csv", header=None)
house = data.iloc[:, 0]
知识点:直方图的bins设多少合适呢?
维基百科的 Freedman–Diaconis rule 帮你解答:组距(每个 bin 的宽度)$= 2\cdot\mathrm{IQR}/\sqrt[3]{n}$,其中 $\mathrm{IQR}$ 为四分位距,$n$ 为样本容量。
# Freedman–Diaconis rule: bin *width* = 2 * IQR / n^(1/3).
IQR = house.quantile(0.75) - house.quantile(0.25)
bin_size = 2 * IQR / len(house) ** (1.0 / 3)

# The rule yields a width, while an integer `bins` passed to plt.hist is a
# *count*, so convert: count = ceil(data range / width).  If the IQR is zero
# the width is zero too; fall back to 10 bins in that case.
if bin_size > 0:
    n_bins = max(1, int(np.ceil((house.max() - house.min()) / bin_size)))
else:
    n_bins = 10

# Quick look at the sample; not needed for the confidence-interval answer.
plt.rcParams['font.sans-serif']=['SimHei']
plt.hist(house, bins=n_bins, rwidth=0.9)
# axvline spans the whole axis height, so the mean marker does not depend on
# a hard-coded maximum bar height.
plt.axvline(house.mean(), color="r", linestyle="dashed", label="平均值"+str(house.mean()))
plt.title('100个房屋面积样本分布 直方图')
plt.xlabel('房屋面积平方数㎡')
plt.legend()
plt.show()
解法一:传统公式解法
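知识点:t 分布置信区间公式:$\bar{x} \pm t_{\alpha/2,\,n-1}\cdot\dfrac{s}{\sqrt{n}}$,其中 $\bar{x}$ 为样本均值,$s$ 为样本标准差,$n$ 为样本容量,$t_{\alpha/2,\,n-1}$ 为自由度 $n-1$ 的 t 分布上侧 $\alpha/2$ 分位点。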
# Solution 1: classical t-based confidence interval for the mean,
#     mean ± t(alpha/2, n-1) * s / sqrt(n)
alpha = 1 - 0.95
sample_size = len(house)
sample_mean = house.mean()
sample_std = house.std()  # pandas Series.std uses ddof=1 by default

# isf is the inverse survival function: the value with upper-tail
# probability alpha/2 under a t distribution with n-1 degrees of freedom.
t_score = scipy.stats.t.isf(alpha / 2, df=sample_size - 1)

# Margin of error: half-width of the interval.
ME = t_score * sample_std / np.sqrt(sample_size)
lower_limit, upper_limit = sample_mean - ME, sample_mean + ME

print('95%% Confidence Interval: ( %.2f, %.2f)' % (lower_limit, upper_limit))
# 上面一段代码运行结果
95% Confidence Interval: ( 283.20, 318.50)
解法二:用 bootstrap 模拟 3 万个样本,生成抽样分布
def bootstrap(data, times=1):
    """Build a bootstrap sampling distribution of the mean.

    Draws ``times`` resamples from ``data`` with replacement, each the same
    size as ``data``, and records the mean of every resample.

    Args:
        data: 1-D array-like of numeric observations (list, ndarray,
            pandas Series, ...).
        times: Number of bootstrap resamples to draw.

    Returns:
        A tuple ``(Samples_mean, S_mean, S_std)`` where ``Samples_mean`` is
        an array of length ``times`` holding each resample's mean,
        ``S_mean`` is the mean of those values and ``S_std`` is their
        standard deviation as computed by ``np.std`` (ddof=0).
    """
    # Convert once up front so the loop does not redo it on every draw.
    values = np.asarray(data)
    sample_size = len(values)

    samples_mean = np.empty(times)
    for i in range(times):
        # np.random.choice samples with replacement by default.
        resample = np.random.choice(values, size=sample_size)
        samples_mean[i] = np.mean(resample)

    return samples_mean, np.mean(samples_mean), np.std(samples_mean)
# Plot the bootstrap sampling distribution of the mean.
plt.rcParams['font.sans-serif']=['SimHei']
plt.figure(figsize=(8,5))
Samples_mean, S_mean, S_std = bootstrap(house, 30000)
# `normed` was removed from plt.hist in Matplotlib 3.1; `density=True` is the
# replacement and scales the bars so the histogram integrates to 1, making
# it comparable with the pdf drawn below.
plt.hist(Samples_mean, bins=43, density=True, rwidth=0.9)

# Overlay a normal curve with the same mean and standard deviation as the
# bootstrap means, evaluated over ±3.5 standard deviations around the mean.
norm = scipy.stats.norm(S_mean, S_std)
x = np.arange(S_mean - S_std * 3.5, S_mean + S_std * 3.5, 1)
y = norm.pdf(x)
# The curve is N(S_mean, S_std), not the standard normal N(0, 1), so the
# legend label says "normal fit".
plt.plot(x, y, 'r--', label='normal fit', alpha=0.7)
plt.title('用bootstrap生成3万样本,每个样本大小为100.做成的抽样分布图',fontsize=16)
plt.xlabel('房屋面积平方数㎡',fontsize=14)
plt.legend()
plt.show()

# Percentile bootstrap 95% interval: the 2.5th and 97.5th percentiles of the
# resampled means (left as the final expression so a notebook displays it).
np.percentile(Samples_mean, [2.5, 97.5])
# 运行结果
array([ 283.63975, 318.21025])
与上面用传统公式解法得到的答案非常接近。