藐姑射之山

大师兄的Python机器学习笔记:统计学基础之底层代码实现(一)

2020-01-20  本文已影响0人  superkmi

大师兄的Python机器学习笔记:统计学基础之底层代码实现(二)

一、中心趋势度量(Measure of Central Tendency)

1.众数(mode)
def calculate_mode(data):
    """Return every mode (most frequent value) of ``data`` as a list.

    Args:
        data: An iterable of hashable values.

    Returns:
        A list holding each value whose frequency equals the highest
        frequency found, in order of first appearance. An empty input
        yields an empty list.
    """
    # Local import keeps this snippet self-contained.
    from collections import Counter

    frequencies = Counter(data)
    if not frequencies:
        return []
    # Compare against the largest *count*; taking max() of the mapping itself
    # would return the largest key instead.
    highest = max(frequencies.values())
    return [value for value, count in frequencies.items() if count == highest]
>>>if __name__ == '__main__':
>>>    test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>    print('众数:',calculate_mode(test_data))
众数: [1]
>>>import numpy as numpy
>>>def descriptive_mode_numpy(list):
>>>    # [第1步] 获取 所有不重复的变量值 在 变量值列表 中的 出现频数
>>>    frequency_dict=numpy.bincount(list)
>>>    # [第2步] 获取 出现频率 最高的变量值
>>>    return numpy.argmax(frequency_dict)
>>>if __name__ == '__main__':
>>>    test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>    print('众数:',descriptive_mode_numpy(test_data))
众数: 1
2.中位数(Median)
def calculate_median(data):
    """Return the median of ``data``.

    Args:
        data: A non-empty iterable of numbers. It is not modified.

    Returns:
        The middle element for an odd number of values, or the mean of the
        two middle elements for an even number of values.

    Raises:
        ValueError: If ``data`` is empty.
    """
    # Sort a copy so the caller's sequence keeps its order (and so tuples or
    # other non-list iterables are accepted).
    ordered = sorted(data)
    size = len(ordered)
    if size == 0:
        raise ValueError('median of empty data is undefined')

    middle = size // 2
    if size % 2 == 1:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2
>>>if __name__ == '__main__':
>>>    test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>    print('中位数:',calculate_median(test_data))
中位数: 2
>>>import numpy as numpy
>>>test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>median = numpy.median(test_data)
>>>print('中位数:',median)
中位数: 2.0
3.分位数(Quantile)
def calculate_quantile(data):
    """Split ``data`` around its median.

    Calling this again on each returned half yields the lower and upper
    quartiles as the third element of the result.

    Args:
        data: A non-empty iterable of numbers. It is not modified.

    Returns:
        A tuple ``(lower_half, upper_half, median)``. For an odd number of
        values the median element is excluded from both halves; for an even
        number the median is the mean of the two middle values.

    Raises:
        ValueError: If ``data`` is empty.
    """
    ordered = sorted(data)
    if not ordered:
        raise ValueError('quantile of empty data is undefined')

    middle, is_odd = divmod(len(ordered), 2)  # quotient and remainder
    if is_odd:
        return ordered[:middle], ordered[middle + 1:], ordered[middle]
    median = (ordered[middle - 1] + ordered[middle]) / 2
    return ordered[:middle], ordered[middle:], median
>>>if __name__ == '__main__':
>>>    test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>    l_half,r_half,q2 = calculate_quantile(test_data)
>>>    quantile_l = calculate_quantile(l_half)[2]
>>>    quantile_h = calculate_quantile(r_half)[2]
>>>    print('下四分位数:',quantile_l)
>>>    print('上四分位数:',quantile_h)
下四分位数: 1
上四分位数: 3
>>>import numpy as np
>>>test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>quantile = np.percentile(test_data,(25,75),interpolation='midpoint')
>>>print('下四分位数:', quantile[0])
>>>print('上四分位数:', quantile[1])
下四分位数: 1.0
上四分位数: 3.0
4.平均数

1)简单平均数(mean)

  • 公式:\overline{x}=\frac{\sum_{i=1}^nx_i}{n}
  • 纯python代码实现:
def calculate_mean(data):
    """Return the arithmetic mean of ``data`` as a float.

    Args:
        data: A non-empty sized collection whose items can be converted
            with ``float()``.

    Returns:
        The sum of the converted items divided by ``len(data)``.

    Raises:
        ZeroDivisionError: If ``data`` is empty.
    """
    # Use the builtin sum() directly rather than shadowing it with a local.
    total = sum(float(item) for item in data)
    return total / len(data)
>>>if __name__ == '__main__':
>>>   test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>   print('平均数:',calculate_mean(test_data))
平均数: 2.3333333333333335
  • 使用numpy包:
>>>import numpy as np
>>>test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>mean = np.mean(test_data)
>>>print('平均数:', mean)
平均数: 2.3333333333333335

2)加权平均数(Weighted mean)

  • 加权平均数中每个点对于平均数的贡献并不是相等的,有些点要比其他的点更加重要。
  • 公式:\overline{x}=\frac{\sum_{i=1}^nw_ix_i}{\sum_{i=1}^nw_i}
  • w:权重
  • 纯python代码实现:
def calculate_weighted_mean(data):
    """Return the mean of ``data`` weighted by element position.

    The weight of the element at index ``i`` is ``i`` itself, so the first
    element has weight 0 and later elements count progressively more.

    Args:
        data: A sequence of numbers with at least two elements.

    Returns:
        ``sum(i * data[i]) / sum(i)`` over all indices ``i``.

    Raises:
        ZeroDivisionError: If the weights sum to zero, i.e. ``data`` has
            fewer than two elements.
    """
    weight_total = sum(range(len(data)))
    if weight_total == 0:
        raise ZeroDivisionError("weights sum to zero, can't be normalized")

    # Accumulate the weighted sum first and divide once at the end.
    weighted_sum = sum(index * value for index, value in enumerate(data))
    return weighted_sum / weight_total
>>>if __name__ == '__main__':
>>>    test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>    print('加权平均数:',calculate_weighted_mean(test_data))
加权平均数: 2.4095238095238094
  • 使用numpy包:
>>>import numpy as np
>>>def calculate_weighted_mean_np(data):
>>>    t = np.arange(len(data)) 
>>>    result = np.average(data,weights=t)
>>>    return result
>>>if __name__ == '__main__':
>>>   test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>   print('加权平均数:', calculate_weighted_mean_np(test_data))
加权平均数: 2.4095238095238094

3)几何平均数(Geometric mean)

  • 几何平均数通过值的乘积来指示一组数字的集中趋势或典型值。
  • 公式:\overline{G}=\sqrt[n]{x_1x_2...x_n}
  • 纯python代码实现:
>>>def calculate_geometric_mean(data):
>>>    product = 1
>>>    for item in data:
>>>        product = product*float(item)
>>>    result = product**(1/len(data))
>>>    return result
>>>if __name__ == '__main__':
>>>    test_data = [1,2,3,4,1,2,3,1,2,1,5,6,1,1,2]
>>>    print('几何平均数:',calculate_geometric_mean(test_data))
几何平均数: 1.916473929999829
  • 使用numpy包:
>>>import numpy as np
>>>geometric_mean = np.power(np.prod(test_data),1/len(test_data))
>>>print('几何平均数:', geometric_mean)
几何平均数: 1.916473929999829

二、离散程度度量(Measure of dispersion)

1.异众比率(variation ratio)
>>>import numpy as np
>>>def calculate_frequency_of_mode(data):
>>>    frequency_dict = np.bincount(data)
>>>    return frequency_dict[np.argmax(frequency_dict)]

>>>def calculate_variation_ratio(data):
>>>    # 计算众数的频数
>>>    frequency_of_mode = calculate_frequency_of_mode(data)
>>>    # 计算异众比率
>>>    result = 1-(frequency_of_mode)/len(data)
>>>    return result
>>>if __name__ == '__main__':
>>>    test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>    print('异众比率:',calculate_variation_ratio(test_data))
异众比率: 0.6
2.平均绝对偏差(Mean Absolute Deviation)
>>>import numpy as np
>>>def calculate_mean_absolute_deviation(data):
>>>    # 求平均值
>>>    mean = np.mean(data)
>>>    # 求平均差
>>>    result = sum([abs(x - mean) for x in data])/len(data)
>>>    return result
>>>if __name__ == '__main__':
>>>    test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>    print('平均绝对偏差:',calculate_mean_absolute_deviation(test_data))
平均绝对偏差: 1.2444444444444442
3.方差(Variance)
>>>import numpy as np
>>>def calculate_variance(data):
>>>    # 求平均值
>>>    mean = np.mean(data)
>>>    # 求方差
>>>    result = sum([(x - mean)**2 for x in data])/len(data)
>>>    return result
>>>if __name__ == '__main__':
>>>    test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>    print('方差:',calculate_variance(test_data))
方差: 2.3555555555555556
>>>import numpy as np
>>>test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>variance = np.var(test_data)
>>>print('方差:',variance)
方差: 2.3555555555555556
4.标准差(Standard Deviation)
>>>import numpy as np
>>>def calculate_variance(data):
>>>    # 求平均值
>>>    mean = np.mean(data)
>>>    # 求方差
>>>    result = sum([(x - mean)**2 for x in data])/len(data)
>>>    return result

>>>def calculate_standard_deviation(data):
>>>    # 求平均值
>>>    mean = np.mean(data)
>>>    # 求方差
>>>    variance = sum([(x - mean) ** 2 for x in data]) / len(data)
>>>    # 求标准差
>>>    result = variance**(1/2)
>>>    return result

>>>if __name__ == '__main__':
>>>    test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>    print('标准差:',calculate_standard_deviation(test_data))
标准差: 1.5347819244295118
>>>import numpy as np
>>>test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>standard_deviation = np.std(test_data)
>>>print('标准差:',standard_deviation)
标准差: 1.5347819244295118
5.标准分数(z-score)
>>>import numpy as np
>>>def calculate_zscore(x,data):
>>>    # 求平均值
>>>    mean = np.mean(data)
>>>    # 求标准差
>>>    std = np.std(data)
>>>    # 计算z-score
>>>    result = (x-mean)/std
>>>    return result
>>>if __name__ == '__main__':
>>>    test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>    print('标准分数:',calculate_zscore(test_data[0],test_data))
标准分数: -0.8687444855261388
6.四分位距(interquartile range)
>>>import numpy as np
>>>def calculate_QPR(data):
>>>    # 获取上下四分位数
>>>    Q_L = np.quantile(data,0.25,interpolation='lower')
>>>    Q_U = np.quantile(data,0.75,interpolation='higher')
>>>    result = Q_U - Q_L
>>>    return result
>>>if __name__ == '__main__':
>>>    test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>    print('四分位距:',calculate_QPR(test_data))
四分位距: 2
7.离散系数(coefficient of variation)
>>>import numpy as np
>>>def calculate_coefficient_of_variation(data):
>>>    # 计算标准差
>>>    std = np.std(data)
>>>    # 计算平均值
>>>    mean = np.mean(data)
>>>    # 计算离散系数
>>>    result = std/abs(mean)
>>>    return result
>>>if __name__ == '__main__':
>>>    test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>    print('离散系数:',calculate_coefficient_of_variation(test_data))
离散系数: 0.6577636818983621

三、数据分布的形状(Shape of distribution)

1.偏态系数(skewness)
>>>import numpy as np
>>>def calculate_skewness(data):
>>>    l = len(data)
>>>    # 计算平均值
>>>    mean = np.mean(data)
>>>    # 计算三阶原点矩 E[x^3]
>>>    mu_3 = sum([x**3 for x in data])/l
>>>    # 计算标准差
>>>    std = np.std(data)
>>>    # 计算偏态
>>>    result = (mu_3 - 3*mean*std**2-mean**3)/std**3
>>>    return result
>>>if __name__ == '__main__':
>>>    test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>    print('偏态系数:',calculate_skewness(test_data))
偏态系数: 1.0900284582544935
>>>import pandas as pd
>>>test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>test_data = pd.Series(test_data)
>>>skewness = test_data.skew()
>>>print('偏态系数:',skewness)
偏态系数: 1.2150779271256849
2.峰态系数(kurtosis)
def calculate_kurtosis(data):
    """Return the kurtosis of ``data`` as fourth central moment / std**4.

    The mean and standard deviation come from ``np.mean`` and ``np.std``
    called with their default arguments, and 3 is not subtracted from the
    ratio, so this is not an excess kurtosis.

    Args:
        data: A sized iterable of numbers.

    Returns:
        The fourth central moment divided by the fourth power of the
        standard deviation.
    """
    count = len(data)
    center = np.mean(data)
    spread = np.std(data)

    # Accumulate each value's share of the fourth central moment.
    fourth_moment = 0
    for value in data:
        fourth_moment += ((value - center) ** 4) / count

    return fourth_moment / (spread ** 4)


if __name__ == '__main__':
    sample = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
    print('峰态系数:', calculate_kurtosis(sample))
峰态系数: 3.105197579209683
>>>import pandas as pd
>>>test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
>>>test_data = pd.Series(test_data)
>>>kurtosis = test_data.kurt()
>>>print(kurtosis)
0.6895144727113385

四、数据分布特征(Data distribution characteristics)

1.切比雪夫法则(Chebyshev's Inequality)
>>>import numpy as np
def calculate_range(mean, std, k):
    """Return the closed interval ``mean ± k*std`` as a ``(low, high)`` tuple.

    The bounds are left untruncated: rounding them to integers would move
    the interval edges and misclassify values that lie near them.
    """
    return (mean - k * std, mean + k * std)


def calculate_data_distribution_characteristics(data):
    """Report the cumulative share of ``data`` within k standard deviations.

    Each value is assigned the smallest integer ``k >= 1`` for which it lies
    inside ``[mean - k*std, mean + k*std]``; the shares are then accumulated
    in increasing order of ``k``.

    Args:
        data: An iterable of finite numbers (integers or floats).

    Returns:
        A dict mapping each ``k`` that at least one value was assigned to,
        in ascending order, to a string of the form ``'{k}标准差内:{share}'``.
        Empty input yields an empty dict.

    Raises:
        ValueError: If ``data`` contains NaN or infinite values.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return {}
    if not np.isfinite(values).all():
        raise ValueError('data must contain only finite numbers')

    std = np.std(values)
    mean = np.mean(values)

    # Count how many values first fall inside the k-th interval.
    counts = {}
    for item in values:
        k = 1
        # With std == 0 every value equals the mean, so k stays 1; skipping
        # the search also avoids looping over an interval that never grows.
        if std > 0:
            while True:
                low, high = calculate_range(mean, std, k)
                if low <= item <= high:
                    break
                k += 1
        counts[k] = counts.get(k, 0) + 1

    # Turn per-interval counts into cumulative percentages.
    result = {}
    running_total = 0
    for k in sorted(counts):
        running_total += counts[k]
        result[k] = '{}标准差内:{:.0%}'.format(k, float(running_total / len(values)))
    return result


if __name__ == '__main__':
    test_data = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 5, 6, 1, 1, 2]
    print(calculate_data_distribution_characteristics(test_data))
# Output:
# {1: '1标准差内:80%', 2: '2标准差内:93%', 3: '3标准差内:100%'}
2.经验法则(Empirical Rule)
>>>import numpy as np
def calculate_range(mean, std, k):
    """Return the closed interval ``mean ± k*std`` as a ``(low, high)`` tuple.

    The bounds are left untruncated: rounding them to integers would move
    the interval edges and misclassify values that lie near them.
    """
    return (mean - k * std, mean + k * std)


def calculate_data_distribution_characteristics(data):
    """Report the cumulative share of ``data`` within k standard deviations.

    Each value is assigned the smallest integer ``k >= 1`` for which it lies
    inside ``[mean - k*std, mean + k*std]``; the shares are then accumulated
    in increasing order of ``k``.

    Args:
        data: An iterable of finite numbers (integers or floats).

    Returns:
        A dict mapping each ``k`` that at least one value was assigned to,
        in ascending order, to a string of the form ``'{k}标准差内:{share}'``.
        Empty input yields an empty dict.

    Raises:
        ValueError: If ``data`` contains NaN or infinite values.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return {}
    if not np.isfinite(values).all():
        raise ValueError('data must contain only finite numbers')

    std = np.std(values)
    mean = np.mean(values)

    # Count how many values first fall inside the k-th interval.
    counts = {}
    for item in values:
        k = 1
        # With std == 0 every value equals the mean, so k stays 1; skipping
        # the search also avoids looping over an interval that never grows.
        if std > 0:
            while True:
                low, high = calculate_range(mean, std, k)
                if low <= item <= high:
                    break
                k += 1
        counts[k] = counts.get(k, 0) + 1

    # Turn per-interval counts into cumulative percentages.
    result = {}
    running_total = 0
    for k in sorted(counts):
        running_total += counts[k]
        result[k] = '{}标准差内:{:.0%}'.format(k, float(running_total / len(values)))
    return result


if __name__ == '__main__':
    test_data_e = [1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,5,5]
    print(calculate_data_distribution_characteristics(test_data_e))
# Output:
# {1: '1标准差内:47%', 2: '2标准差内:93%', 3: '3标准差内:100%'}

参考资料



本文作者:大师兄(superkmi)

上一篇下一篇

猜你喜欢

热点阅读