R语言模型

数量生态学笔记||聚类外部验证

2018-08-02  本文已影响0人  周运来就是我

对每一种聚类算法的结果我们都会遇到一个真实的问题:就是这个结果能否反映真实的生态情况。我们已经看到内部准则(如轮廓法或其他聚类质量指数) 都仅仅是依赖物种数据,还不足以选择最佳聚类结果。选择最终的聚类结果有时也需要基于生态学解释。生态学解释可视为样方聚类的外部验证。

利用外部独立的解释变量验证聚类结果(作为响应数据)可以用判别式分析。当然也可以将样方分组当做因子对解释变量(作为响应数据)进行方差分析,了解解释变量在各组间是否有显著差异。

# 加载所需的程序包
library(ade4)
library(vegan)  # 应该先加载ade4后加载vegan以避免冲突
library(gclus)
library(cluster)
library(RColorBrewer)
library(labdsv)
library(mvpart)
library(MVPARTwrap) # MVPARTwrap这个程序包必须从本地zip文件安装
# 导入CSV格式的数据

rm(list = ls())
setwd("D:\\Users\\Administrator\\Desktop\\RStudio\\数量生态学\\DATA")

spe <- read.csv("DoubsSpe.csv", row.names=1)
env <- read.csv("DoubsEnv.csv", row.names=1)
spa <- read.csv("DoubsSpa.csv", row.names=1)
# 删除无物种数据的样方8
spe <- spe[-8,]
env <- env[-8,]
spa <- spa[-8,]
# 物种多度数据:先计算样方之间的弦距离矩阵,然后进行单连接聚合聚类
# **************************************************************
spe.norm <- decostand(spe, "normalize")
spe.ch <- vegdist(spe.norm, "euc")
spe.ch.single <- hclust(spe.ch, method="single")
# 预转化后物种数据k-均值划分
# ****************************
spe.kmeans <- kmeans(spe.norm, centers=4, nstart=100)

下面的案例展示一样方聚类簇为因子去对解释变量进行方差分析。首先检验某一环境变量是否符合方差分析假设(残差正态性和方差齐性),然后用传统的单因素方差分析或非参数的Kruskal-Wallis检验解释变量在组间是否有显著差异。

# 基于k-均值划分结果(分4组)的样方聚类簇与4个环境变量的关系
# *************************************************************
attach(env)
# 定量环境变量箱线图
# 海拔、坡度、氧含量和铵浓度
par(mfrow=c(2,2))
boxplot(sqrt(alt) ~ spe.kmeans$cluster, main="海拔", las=1, 
  ylab="sqrt(alt)", col=2:5, varwidth=TRUE)
boxplot(log(pen) ~ spe.kmeans$cluster, main="坡度", las=1, 
  ylab="log(pen)", col=2:5, varwidth=TRUE)
boxplot(oxy ~ spe.kmeans$cluster, main="氧含量", las=1, 
  ylab="oxy", col=2:5, varwidth=TRUE)
boxplot(sqrt(amm) ~ spe.kmeans$cluster, main="铵浓度", las=1, 
  ylab="sqrt(amm)", col=2:5, varwidth=TRUE)
> # 检验方差分析的假设
> # 检验残差的正态性
> shapiro.test(resid(lm(sqrt(alt) ~ as.factor(spe.kmeans$cluster))))

    Shapiro-Wilk normality test

data:  resid(lm(sqrt(alt) ~ as.factor(spe.kmeans$cluster)))
W = 0.96238, p-value = 0.3756

> shapiro.test(resid(lm(log(pen) ~ as.factor(spe.kmeans$cluster))))

    Shapiro-Wilk normality test

data:  resid(lm(log(pen) ~ as.factor(spe.kmeans$cluster)))
W = 0.95182, p-value = 0.204

> shapiro.test(resid(lm(log(pen) ~ as.factor(spe.kmeans$cluster))))

    Shapiro-Wilk normality test

data:  resid(lm(log(pen) ~ as.factor(spe.kmeans$cluster)))
W = 0.95182, p-value = 0.204

> shapiro.test(resid(lm(oxy ~ as.factor(spe.kmeans$cluster))))

    Shapiro-Wilk normality test

data:  resid(lm(oxy ~ as.factor(spe.kmeans$cluster)))
W = 0.93941, p-value = 0.09674

> shapiro.test(resid(lm(sqrt(amm) ~ as.factor(spe.kmeans$cluster))))

    Shapiro-Wilk normality test

data:  resid(lm(sqrt(amm) ~ as.factor(spe.kmeans$cluster)))
W = 0.95398, p-value = 0.2319

> #检验结果表明sqrt(alt)、log(pen)、oxy和sqrt(amm)的残差是正态分布。
> #也尝试为其他的环境变量寻找好的标准化转化。
> # 检验方差齐性
> bartlett.test(sqrt(alt), as.factor(spe.kmeans$cluster))

    Bartlett test of homogeneity of variances

data:  sqrt(alt) and as.factor(spe.kmeans$cluster)
Bartlett's K-squared = 16.953, df = 3, p-value = 0.0007227

> bartlett.test(log(pen), as.factor(spe.kmeans$cluster))

    Bartlett test of homogeneity of variances

data:  log(pen) and as.factor(spe.kmeans$cluster)
Bartlett's K-squared = 2.9832, df = 3, p-value = 0.3942

> bartlett.test(oxy, as.factor(spe.kmeans$cluster))

    Bartlett test of homogeneity of variances

data:  oxy and as.factor(spe.kmeans$cluster)
Bartlett's K-squared = 1.8258, df = 3, p-value = 0.6093

> bartlett.test(sqrt(amm), as.factor(spe.kmeans$cluster))

    Bartlett test of homogeneity of variances

data:  sqrt(amm) and as.factor(spe.kmeans$cluster)
Bartlett's K-squared = 5.7203, df = 3, p-value = 0.126

> #变量sqrt(alt)的方差不齐,所以参数检验的方差分析不适用
> #可被参数检验的变量的方差分析 
> summary(aov(log(pen) ~ as.factor(spe.kmeans$cluster)))
                              Df Sum Sq Mean Sq F value  Pr(>F)   
as.factor(spe.kmeans$cluster)  3  18.05   6.018   7.283 0.00114 **
Residuals                     25  20.66   0.826                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> summary(aov(oxy ~ as.factor(spe.kmeans$cluster)))
                              Df Sum Sq Mean Sq F value   Pr(>F)    
as.factor(spe.kmeans$cluster)  3 103.54   34.51   26.27 6.75e-08 ***
Residuals                     25  32.84    1.31                     
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> summary(aov(sqrt(amm) ~ as.factor(spe.kmeans$cluster)))
                              Df Sum Sq Mean Sq F value   Pr(>F)    
as.factor(spe.kmeans$cluster)  3 2.6126  0.8709   49.55 1.15e-10 ***
Residuals                     25 0.4394  0.0176                     
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> #坡度、含氧量和铵浓度在不同聚类簇之间是否差异显著?
> # 变量alt的Kruskal-Wallis检验
> kruskal.test(alt ~ as.factor(spe.kmeans$cluster))

    Kruskal-Wallis rank sum test

data:  alt by as.factor(spe.kmeans$cluster)
Kruskal-Wallis chi-squared = 21.526, df = 3, p-value = 8.186e-05

> #海拔在不同聚类簇之间是否有显著差异?
> detach(env)
双类型比较(列联表分析)
> # 两种聚类的列联表 
> # ******************
> # 基于环境变量(见第2章)的样方聚类
> env2 <- env[,-1]
> env.de <- vegdist(scale(env2), "euc")
> env.kmeans <- kmeans(env.de, centers=4, nstart=100)
> env.KM.4 <- env.kmeans$cluster
> # 比较两种聚类的结果
> table(as.factor(spe.kmeans$cluster), as.factor(env.kmeans$cluster))
   
    1 2 3 4
  1 0 4 7 1
  2 1 0 0 2
  3 5 3 0 0
  4 0 4 2 0
> #两种聚类结果是否相同?
> # 用卡方检验分析两种聚类之间的差异 
> chisq.test(as.factor(spe.kmeans$cluster), as.factor(env.kmeans$cluster))

    Pearson's Chi-squared test

data:  as.factor(spe.kmeans$cluster) and as.factor(env.kmeans$cluster)
X-squared = 30.227, df = 9, p-value = 0.0004014

Warning message:
In chisq.test(as.factor(spe.kmeans$cluster), as.factor(env.kmeans$cluster)) :
  Chi-squared近似算法有可能不准
> # 改用置换的方法进行卡方检验分析 
> chisq.test(as.factor(spe.kmeans$cluster), as.factor(env.kmeans$cluster), 
+   simulate.p.value=TRUE)

    Pearson's Chi-squared test with simulated p-value (based on 2000 replicates)

data:  as.factor(spe.kmeans$cluster) and as.factor(env.kmeans$cluster)
X-squared = 30.227, df = NA, p-value = 0.0004998

> # k-均值样方聚类簇平均多度
> # ***********************
> groups <- as.factor(spe.kmeans$cluster)
> as.factor(spe.kmeans$cluster)
 1  2  3  4  5  6  7  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 
 1  1  1  1  4  1  1  4  1  1  1  1  1  1  4  4  4  4  3  3  3  2  2  2  3  3  3  3  3 
Levels: 1 2 3 4
> as.factor(env.kmeans$cluster)
 1  2  3  4  5  6  7  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 
 4  3  3  3  3  3  3  3  3  2  2  2  2  3  2  2  2  2  2  2  2  4  1  4  1  1  1  1  1 
Levels: 1 2 3 4
上一篇下一篇

猜你喜欢

热点阅读