2018-12-04-机器学习作业-降维
2018-12-04 本文已影响0人
HollyMeng
数据集:https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
题目:
注:第三题(ISOMAP)若构造出的近邻图不连通,则增大最近邻的数量;对数据集1取6-NN。
代码:
%% AML-2018 homework: dimensionality reduction
% Loads one train/test split, reduces the features with PCA, truncated SVD
% and ISOMAP to 10/20/30 dimensions, and reports nearest-neighbour accuracy
% for each setting.
clear;clc;
data_train=importdata('./two_datasets/sonar-train.txt');
data_test=importdata('./two_datasets/sonar-test.txt');
% data_train=importdata('./two_datasets/splice-train.txt');
% data_test=importdata('./two_datasets/splice-test.txt');
[m_train,n_train]=size(data_train);
[m_test,n_test]=size(data_test);
% The last column holds the label; every other column is a feature.
x_train=data_train(:,1:n_train-1);
x_test=data_test(:,1:n_test-1);
label_train=data_train(:,n_train);
label_test=data_test(:,n_test);

dims=10:10:30;                       % embedding dimensions to evaluate
accuracy_PCA=zeros(1,numel(dims));
accuracy_SVD=zeros(1,numel(dims));
accuracy_ISOMAP=zeros(1,numel(dims));

%% PCA
KNN=1;
for idx=1:numel(dims)
    k=dims(idx);
    fprintf('PCA: k=%d\n',k);
    W=PCA_cm(x_train,k);
    % W=pca(x_train); % the built-in pca gave worse results than the hand-written one
    Z_train=x_train*W;
    Z_test=x_test*W;
    accuracy_PCA(idx)=Evaluate_acc(Z_train,Z_test,label_train,label_test,KNN);
end

%% SVD
% Both splits must be projected onto the SAME basis. The basis is the set of
% right singular vectors of the training data; x_train*Vk is exactly what
% svd_cm(x_train,k) returns. Running a separate SVD on the test data would put
% the test features in an unrelated coordinate system.
KNN=1;
[~,~,V]=svd(x_train,'econ');         % loop-invariant, so computed once
for idx=1:numel(dims)
    k=dims(idx);
    Vk=V(:,1:k);
    Dk_train=x_train*Vk;
    Dk_test=x_test*Vk;
    accuracy_SVD(idx)=Evaluate_acc(Dk_train,Dk_test,label_train,label_test,KNN);
end

%% ISOMAP
% Train and test points are embedded jointly and split apart afterwards.
KNN=6;                               % neighbourhood size used to build the graph
X=[x_train;x_test];
for idx=1:numel(dims)
    k=dims(idx);
    Iso=ISOMAP(X,KNN,k);
    I_train=Iso(1:m_train,:);
    I_test=Iso(m_train+1:m_train+m_test,:);
    accuracy_ISOMAP(idx)=Evaluate_acc(I_train,I_test,label_train,label_test,1);
end
function U=PCA_cm(X,k)
% PCA_CM  Leading principal directions of a data matrix.
%   U = PCA_cm(X,k) treats each row of X (m-by-n) as one sample and returns
%   an n-by-k matrix whose columns are the k leading eigenvectors of the
%   biased (1/m) sample covariance of X. Project data with X*U.
%
%   Raises PCA_cm:badK when k is not an integer in [1,n].
[m,n]=size(X);
if ~isscalar(k) || k~=floor(k) || k<1 || k>n
    error('PCA_cm:badK','k must be an integer between 1 and %d.',n);
end
% mean(X,1) keeps the mean per column even when X has a single row
% (plain mean(X) would average across the row instead).
mu=mean(X,1);
% Centre first and then form the covariance: mathematically equal to
% X'*X/m - mu'*mu but without the cancellation error of that form.
Xc=X-repmat(mu,m,1);
C=(Xc'*Xc)/m;
% C is symmetric positive semi-definite, so its left singular vectors are its
% eigenvectors, already ordered by decreasing eigenvalue.
[u,~,~]=svd(C);
U=u(:,1:k);
function [Dk,Pk]=svd_cm(X,k)
% SVD_CM  Rank-k SVD features of a data matrix.
%   Dk = svd_cm(X,k) returns the m-by-k coordinates of the rows of X
%   (m-by-n) in the basis of its k leading right singular vectors, i.e.
%   Dk = X*Pk = U(:,1:k)*S(1:k,1:k).
%
%   [Dk,Pk] = svd_cm(X,k) also returns that n-by-k basis Pk so that other
%   data with the same columns can be projected onto it with Y*Pk.
%
%   Raises svd_cm:badK when k is not an integer in [1,min(m,n)].
r=min(size(X));
if ~isscalar(k) || k~=floor(k) || k<1 || k>r
    error('svd_cm:badK','k must be an integer between 1 and %d.',r);
end
% The economy-size SVD avoids building the full m-by-m left factor.
[q,sigma,p]=svd(X,'econ');
Pk=p(:,1:k);
% (q_k*sigma_k*p_k')*p_k collapses to q_k*sigma_k because p_k has orthonormal
% columns, so the rank-k reconstruction never has to be formed.
Dk=q(:,1:k)*sigma(1:k,1:k);
function Z=ISOMAP(X,KNN,k)
% ISOMAP  Isometric feature mapping of the rows of X.
%   Z = ISOMAP(X,KNN,k) treats each row of X (m-by-n) as one point and
%   returns an m-by-k embedding obtained in three steps:
%     1. connect every point to its KNN nearest neighbours (Euclidean
%        distance) and make the resulting graph undirected,
%     2. take shortest-path lengths in that graph as geodesic distances,
%     3. apply classical MDS to the geodesic distance matrix.
%
%   Raises ISOMAP:badKNN / ISOMAP:badK for out-of-range arguments and
%   ISOMAP:disconnected when the neighbourhood graph is not connected
%   (increase KNN in that case).
%
%   Uses pdist/squareform (Statistics Toolbox) and graph/distances.
m=size(X,1);
if ~isscalar(KNN) || KNN~=floor(KNN) || KNN<1 || KNN>m-1
    error('ISOMAP:badKNN','KNN must be an integer between 1 and %d.',m-1);
end
if ~isscalar(k) || k~=floor(k) || k<1 || k>m
    error('ISOMAP:badK','k must be an integer between 1 and %d.',m);
end

%% k-nearest-neighbour graph
D=squareform(pdist(X));              % pairwise Euclidean distances
masked=D;
masked(1:m+1:end)=Inf;               % a point is never its own neighbour
% Sorting (instead of repeatedly searching for the row minimum) picks exactly
% KNN neighbours per point even when several distances are tied.
[~,order]=sort(masked,2);
nbr=order(:,1:KNN);
src=repmat((1:m)',1,KNN);
% Store each undirected edge once as (smaller index, larger index) so that
% mutual neighbours do not create duplicate edges.
pairs=unique(sort([src(:),nbr(:)],2),'rows');
w=D(sub2ind([m,m],pairs(:,1),pairs(:,2)));
G=graph(pairs(:,1),pairs(:,2),w,m);

%% geodesic distances
geo=distances(G);                    % all-pairs shortest paths in one call
if any(isinf(geo(:)))
    error('ISOMAP:disconnected',...
        'The %d-NN graph is not connected; increase KNN.',KNN);
end

%% classical MDS
J=eye(m)-ones(m)/m;                  % centring matrix
B=-0.5*J*(geo.^2)*J;                 % double-centred Gram matrix
B=(B+B')/2;                          % remove round-off asymmetry before eig
[V,L]=eig(B);
[lambda,idx]=sort(diag(L),'descend');
% Geodesic distances need not be Euclidean, so B may have negative
% eigenvalues; those carry no real coordinate and are clamped to zero.
lambda=max(lambda(1:k),0);
Z=V(:,idx(1:k))*diag(sqrt(lambda));
% 1NN
% K-NN
function accuracy=Evaluate_acc(Z_train,Z_test,label_train,label_test,KNN)
% EVALUATE_ACC  Accuracy of a KNN-nearest-neighbour classifier.
%   accuracy = Evaluate_acc(Z_train,Z_test,label_train,label_test,KNN)
%   classifies every row of Z_test by majority vote among the labels of its
%   KNN closest rows of Z_train (Euclidean distance) and returns the
%   fraction of predictions that equal label_test.
%
%   Ties in distance keep the earlier training row first; ties in the vote
%   are resolved in favour of the smallest label value.
%
%   Raises Evaluate_acc:badKNN when KNN is not an integer in [1,m_train].
%   Uses pdist2 (Statistics Toolbox).
m_train=size(Z_train,1);
m_test=size(Z_test,1);
if ~isscalar(KNN) || KNN~=floor(KNN) || KNN<1 || KNN>m_train
    error('Evaluate_acc:badKNN',...
        'KNN must be an integer between 1 and %d.',m_train);
end
predicted=zeros(size(label_test));
% One call gives the whole m_test-by-m_train distance matrix.
D=pdist2(Z_test,Z_train,'euclidean');
% sort is stable, so equally distant training rows keep their original order.
[~,order]=sort(D,2);
for i=1:m_test
    votes=label_train(order(i,1:KNN));
    % mode returns the most frequent label and, among equally frequent
    % labels, the smallest one; for KNN==1 it is simply the nearest label.
    predicted(i)=mode(votes);
end
accuracy=sum(label_test==predicted)/m_test;
1.PCA
实验结果:accuracy
数据集1:[0.5825 0.5631 0.5631]
数据集2:[ 0.7582 0.7628 0.7356]
2.SVD
实验结果:accuracy
数据集1:[0.6505 0.6019 0.6408]
数据集2:[ 0.5425 0.5113 0.5195]
3.ISOMAP
实验结果:accuracy
数据集1:[0.4563 0.4272 0.4175]
数据集2:未做,有bug