Implementing Common Distance Metrics in Python

2019-05-03  山雾幻华
import numpy as np
import scipy.spatial.distance as dist
from scipy.spatial.distance import pdist

def pp_ps(inX, dataSet, function):
    # Express a point-to-dataset distance in terms of a point-to-point
    # function; returns a 1-D array of distances.
    distances = np.array([function(inX, dataSet[i]) for i in range(dataSet.shape[0])])
    return distances
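
A quick usage sketch with made-up data (the lambda stands in for any of the point-to-point functions defined below):

point = np.array([1.0, 2.0])
samples = np.array([[0.0, 0.0], [1.0, 1.0], [3.0, 4.0]])
# Chebyshev distance from `point` to every row of `samples`.
print(pp_ps(point, samples, lambda a, b: np.max(np.abs(a - b))))  # [2. 1. 2.]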

Minkowski Distance

\mathrm{L}_p(\mathbf{x}_i, \mathbf{x}_j)=\sqrt[p]{\sum_{l=1}^{n}\left|x_{i}^{(l)}-x_{j}^{(l)}\right|^{p}}

def minkowski_distance_1(vector1, vector2, p):
    # Point-to-point Minkowski distance via scipy's pdist.
    return pdist(np.vstack((vector1, vector2)), 'minkowski', p=p)
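
np.linalg.norm computes the same quantity with ord=p; a minimal point-to-dataset variant in the style of the helpers below (the name minkowski_distance_2 is my own):

def minkowski_distance_2(inX, dataSet, p):
    # Minkowski distance from a point to every row of a dataset;
    # ord=p selects the L_p norm along each row.
    return np.linalg.norm(inX - dataSet, axis=1, ord=p)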

Euclidean Distance

\mathrm{L}_2(\mathbf{x}_i, \mathbf{x}_j) = \sqrt{\sum_{l=1}^{n}\left(x_{i}^{(l)}-x_{j}^{(l)}\right)^{2}}

def euclidean_distance_1(inX, dataSet):
    # Euclidean distance from a point to every row of a dataset.
    sub = inX - dataSet
    squ_sub = sub**2
    sum_squ_sub = np.sum(squ_sub, axis=1)
    distances = sum_squ_sub**0.5
    return distances
def euclidean_distance_2(inX, dataSet):
    # Same as above via np.linalg.norm (the L2 norm is the default).
    return np.linalg.norm(inX - dataSet, axis=1)
def euclidean_distance_3(vector1, vector2):
    # Point-to-point Euclidean distance via pdist.
    return pdist(np.vstack((vector1, vector2)), 'euclidean')

Standardized Euclidean Distance

\mathrm{L}_2'(\mathbf{x}_i, \mathbf{x}_j) = \sqrt{\sum_{l=1}^{n}\frac{\left(x_{i}^{(l)}-x_{j}^{(l)}\right)^{2}}{s_{l}^{2}}}
where s_l is the standard deviation of the l-th component over the sample set.

def standardized_euclidean_distance_1(vector1, vector2):
    # Point-to-point standardized Euclidean distance. V is the variance
    # vector: V[i] is the variance of the i-th component. With V=None
    # (the default), scipy estimates it from the data automatically.
    return pdist(np.vstack((vector1, vector2)), 'seuclidean', V=None)
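
A direct transcription of the formula, with the variance vector passed in explicitly (the function name is my own):

def standardized_euclidean_distance_2(vector1, vector2, variances):
    # Divide each squared component difference by that component's variance.
    return np.sqrt(np.sum((vector1 - vector2) ** 2 / variances))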

Manhattan Distance

\mathrm{L}_{1}(\mathbf{x}_i, \mathbf{x}_j)=\sum_{l=1}^{n}\left|x_{i}^{(l)}-x_{j}^{(l)}\right|

def manhattan_distance_1(inX, dataSet):
    # Manhattan distance from a point to every row of a dataset.
    sub = inX - dataSet
    abs_sub = np.abs(sub)
    distances = np.sum(abs_sub, axis=1)
    return distances
def manhattan_distance_2(inX, dataSet):
    # Same as above via the L1 norm.
    return np.linalg.norm(inX - dataSet, axis=1, ord=1)
def manhattan_distance_3(vector1, vector2):
    # Point-to-point Manhattan distance ('cityblock' in scipy).
    return pdist(np.vstack((vector1, vector2)), 'cityblock')

Chebyshev Distance

\mathrm{L}_{\infty}(\mathbf{x}_i, \mathbf{x}_j)=\max _{l}\left|x_{i}^{(l)}-x_{j}^{(l)}\right|

def chebyshev_distance_1(inX, dataSet):
    # Chebyshev distance from a point to every row of a dataset.
    sub = inX - dataSet
    abs_sub = np.abs(sub)
    distances = np.max(abs_sub, axis=1)
    return distances
def chebyshev_distance_2(inX, dataSet):
    # Same as above via the L-infinity norm.
    return np.linalg.norm(inX - dataSet, axis=1, ord=np.inf)
def chebyshev_distance_3(vector1, vector2):
    # Point-to-point Chebyshev distance via pdist.
    return pdist(np.vstack((vector1, vector2)), 'chebyshev')

Mahalanobis Distance

D_M(\mathbf{x}_i, \mathbf{x}_j)=\sqrt{\left(\mathbf{x}_i-\mathbf{x}_j\right)^{T} S^{-1}\left(\mathbf{x}_i-\mathbf{x}_j\right)}
where S is the covariance matrix of the sample set.

# Method 1: compute directly from the formula
def mahalanobis_distance_1(x, y):
    X = np.vstack([x, y])
    XT = X.T
    S = np.cov(X)           # covariance matrix of the two dimensions
    SI = np.linalg.inv(S)   # inverse of the covariance matrix
    # Mahalanobis distance between every pair of samples; e.g. with
    # 10 samples there are C(10, 2) = 45 pairwise distances.
    n = XT.shape[0]
    d1 = []
    for i in range(0, n):
        for j in range(i + 1, n):
            delta = XT[i] - XT[j]
            d = np.sqrt(np.dot(np.dot(delta, SI), delta.T))
            d1.append(d)
    return d1

# Method 2: use scipy
def mahalanobis_distance_2(x, y):
    X = np.vstack([x, y])
    XT = X.T
    d2 = pdist(XT, 'mahalanobis')
    return d2
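
A quick sanity check with made-up data: x and y hold two features observed over ten samples, so the two methods should agree.

x = np.array([3.0, 5.0, 2.0, 8.0, 7.0, 4.0, 6.0, 1.0, 9.0, 5.0])
y = np.array([4.0, 6.0, 1.0, 7.0, 9.0, 3.0, 5.0, 2.0, 8.0, 6.0])
print(mahalanobis_distance_1(x, y)[:3])  # first three pairwise distances
print(mahalanobis_distance_2(x, y)[:3])  # should match method 1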

Bhattacharyya Distance

D_{B}(p, q)=-\ln (BC(p, q))
where BC(p, q)=\sum_{x \in X} \sqrt{p(x) q(x)}

def bhattacharyya_distance_1(vector1, vector2):
    # Bhattacharyya distance between two discrete probability
    # distributions (each vector should sum to 1).
    BC = np.sum(np.sqrt(vector1 * vector2))
    return -np.log(BC)
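
A small check with two made-up normalized histograms; identical distributions give BC = 1 and hence distance 0.

p = np.array([0.1, 0.4, 0.5])
q = np.array([0.2, 0.3, 0.5])
print(bhattacharyya_distance_1(p, p))  # ~0.0 for identical distributions
print(bhattacharyya_distance_1(p, q))  # small positive value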

Hamming Distance

The Hamming distance between two equal-length strings s1 and s2 is the minimum number of substitutions needed to turn one into the other. For example, the Hamming distance between "1111" and "1001" is 2.

def hamming_distance_1(vector1, vector2):
    # Point-to-point Hamming distance: the number of differing components.
    return np.shape(np.nonzero(vector1 - vector2)[0])[0]
def hamming_distance_2(vector1, vector2):
    # Note: scipy's 'hamming' returns the *fraction* of differing
    # components, not the raw count.
    return pdist(np.vstack((vector1, vector2)), 'hamming')
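
The definition above is stated for strings while the helpers work on numeric vectors; a minimal string version (my own addition):

def hamming_distance_str(s1, s2):
    # Hamming distance between two equal-length strings.
    if len(s1) != len(s2):
        raise ValueError("strings must be of equal length")
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

print(hamming_distance_str("1111", "1001"))  # 2, as in the example above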

Pearson Correlation Coefficient

\rho=\frac{\operatorname{Cov}(X, Y)}{\sigma_{X} \sigma_{Y}}

from scipy.stats import pearsonr
x = [0.5, 0.4, 0.6, 0.3, 0.6, 0.2, 0.7, 0.5]
y = [0.6, 0.4, 0.4, 0.3, 0.7, 0.2, 0.5, 0.6]
pearsonr(x, y)
# Output: (r, p)
# r: the correlation coefficient, in [-1, 1]
# p: the p-value; the smaller it is, the more significant the correlation
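
The same coefficient can be read off np.corrcoef, which returns the full 2x2 correlation matrix; this cross-check reuses the sample data above.

r = np.corrcoef(x, y)[0, 1]
print(r)  # matches the first value returned by pearsonr(x, y)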

Information Entropy

H=-\sum_{i=1}^{n} p\left(x_{i}\right) \log _{2} p\left(x_{i}\right)
where n is the number of classes and p(x_i) is the probability of class i.

data1 = np.array(['a', 'b', 'c', 'a', 'a', 'b'])
# Compute the information entropy of a label array.
def entropy(x):
    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    return ent
print(entropy(data1))  # ~1.459 for data1
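
scipy.stats.entropy performs the same computation given the class probabilities directly; the probabilities below correspond to data1 ('a' 3/6, 'b' 2/6, 'c' 1/6).

from scipy.stats import entropy as scipy_entropy

probs = [3/6, 2/6, 1/6]
print(scipy_entropy(probs, base=2))  # matches entropy(data1), ~1.459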

Cosine Similarity (Cosine)

\cos (\theta)=\frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|} = \frac{\sum_{i=1}^{n} A_{i} \times B_{i}}{\sqrt{\sum_{i=1}^{n}\left(A_{i}\right)^{2}} \times \sqrt{\sum_{i=1}^{n}\left(B_{i}\right)^{2}}}

def cosine_distance_1(vector1, vector2):
    # Cosine *similarity* between two points, in [-1, 1].
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
def cosine_distance_2(vector1, vector2):
    # Note: scipy's 'cosine' metric returns the cosine *distance*,
    # i.e. 1 - similarity, not the similarity itself.
    return pdist(np.vstack((vector1, vector2)), 'cosine')
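
A quick check of the similarity/distance relationship with made-up vectors; the two results should sum to 1.

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
print(cosine_distance_1(a, b))  # similarity: 0.5
print(cosine_distance_2(a, b))  # distance: 1 - 0.5 = 0.5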

Jaccard Similarity Coefficient

J(A, B)=\frac{|A \cap B|}{|A \cup B|}, and the Jaccard distance is 1-J(A, B).

def jaccard_similarity_coefficient(vector1, vector2):
    # Point-to-point Jaccard distance; scipy treats the vectors as
    # boolean indicators and returns the dissimilarity 1 - J.
    return dist.pdist(np.array([vector1, vector2]), 'jaccard')
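
A small made-up example with binary indicator vectors: the union covers 4 positions and the intersection 2, so the distance is 1 - 2/4 = 0.5.

u = np.array([1, 0, 1, 1, 0])
v = np.array([1, 1, 0, 1, 0])
print(jaccard_similarity_coefficient(u, v))  # [0.5]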

Classic Bayes' Formula

P(B \mid A)=\frac{P(A \mid B)\, P(B)}{P(A)}
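
A worked numeric example with made-up probabilities, just to exercise the formula:

p_b = 0.01          # hypothetical P(B)
p_a_given_b = 0.9   # hypothetical P(A|B)
p_a = 0.05          # hypothetical P(A)
print(p_a_given_b * p_b / p_a)  # P(B|A) = 0.18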

Canberra Distance

d(\mathbf{x}, \mathbf{y})=\sum_{l=1}^{n} \frac{\left|x^{(l)}-y^{(l)}\right|}{\left|x^{(l)}\right|+\left|y^{(l)}\right|}

def canberra_distance_1(vector1, vector2):
    # Point-to-point Canberra distance via scipy.
    return dist.pdist(np.array([vector1, vector2]), 'canberra')
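
A direct transcription of the formula for comparison (the name canberra_distance_2 is my own); 0/0 terms are conventionally treated as 0.

def canberra_distance_2(vector1, vector2):
    # Sum |x - y| / (|x| + |y|) per component, skipping 0/0 terms.
    num = np.abs(vector1 - vector2)
    den = np.abs(vector1) + np.abs(vector2)
    mask = den != 0
    return np.sum(num[mask] / den[mask])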