
Chapter 2: Multinomial (unordered) and ordinal (ordered) logistic regression

2020-07-16  养猪场小老板
Code example
Data format: the hsbdemo.dta dataset (one row per student; the variables used below include ses, prog and write).
# Chapter 2: code begins
> ## Use multinom() from the nnet package to fit a multinomial logistic regression model:
> library(foreign)
> library(nnet)
> library(ggplot2)
> library(reshape2)
> ml <- read.dta("hsbdemo.dta")  # read the Stata .dta data file
> #View(ml)  # inspect the raw data
> with(ml, table(ses, prog))  # cross-tabulation: ses in rows, prog in columns
        prog
ses      general academic vocation
  low         16       19       12
  middle      20       44       31
  high         9       42        7
> # with() evaluates the expression inside the data frame ml
> # do.call(function, argument-list): here rbind stacks the per-group results into a matrix
> with(ml, do.call(rbind, tapply(write, prog, function(x) c(M = mean(x), SD = sd(x)))))  # mean and SD of write within each prog group
                M       SD
general  51.33333 9.397775
academic 56.25714 7.943343
vocation 46.76000 9.318754
> ml$prog2 <- relevel(ml$prog, ref = "academic")  # set "academic" as the reference category
> test <- multinom(prog2 ~ ses + write, data = ml)  # fit the multinomial model
# weights:  15 (8 variable)
initial  value 219.722458 
iter  10 value 179.982880
final  value 179.981726 
converged
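# Note (added for orientation): "8 variable" means 8 free parameters are estimated
# (two non-baseline outcomes x four coefficients each, matching the 2 x 4 coefficient
# matrix below); the "15 weights" figure is nnet's internal bookkeeping.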
> summary(test)
Call:
multinom(formula = prog2 ~ ses + write, data = ml)

Coefficients:
         (Intercept)  sesmiddle    seshigh      write
general     2.852198 -0.5332810 -1.1628226 -0.0579287
vocation    5.218260  0.2913859 -0.9826649 -0.1136037

# The rows general and vocation are contrasts against the baseline outcome academic;
# the columns sesmiddle and seshigh are contrasts against the reference level ses = low

Std. Errors:
         (Intercept) sesmiddle   seshigh      write
general     1.166441 0.4437323 0.5142196 0.02141097
vocation    1.163552 0.4763739 0.5955665 0.02221996

Residual Deviance: 359.9635 
AIC: 375.9635 
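# Written out, the fitted model is a pair of log-odds equations with academic as the baseline:
#   ln[P(general)/P(academic)]  = 2.852 - 0.533*sesmiddle - 1.163*seshigh - 0.058*write
#   ln[P(vocation)/P(academic)] = 5.218 + 0.291*sesmiddle - 0.983*seshigh - 0.114*write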
> # 2-tailed z test
> z <- summary(test)$coefficients/summary(test)$standard.errors  # Wald z statistics
> z
         (Intercept)  sesmiddle   seshigh     write
general     2.445214 -1.2018081 -2.261334 -2.705562
vocation    4.484769  0.6116747 -1.649967 -5.112689
> p <- (1 - pnorm(abs(z), 0, 1)) * 2  # two-sided p values
> p
          (Intercept) sesmiddle    seshigh        write
general  0.0144766100 0.2294379 0.02373856 6.818902e-03
vocation 0.0000072993 0.5407530 0.09894976 3.176045e-07
# general vs academic: seshigh and write are statistically significant
# vocation vs academic: write is statistically significant
> # extract the coefficients from the model and exponentiate
> exp(coef(test))  # exponentiated coefficients: relative risk ratios (often read as ORs)
         (Intercept) sesmiddle   seshigh     write
general     17.32582 0.5866769 0.3126026 0.9437172
vocation   184.61262 1.3382809 0.3743123 0.8926116
# Interpretation: for each one-unit increase in write, the relative risk of general vs academic is multiplied by 0.9437172,
# and the relative risk of vocation vs academic is multiplied by 0.8926116
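# A small add-on sketch (not shown in the original output): Wald 95% confidence
# intervals for these ratios, built from the coefficient and standard-error matrices
# already extracted above.
co <- summary(test)$coefficients
se <- summary(test)$standard.errors
exp(co - 1.96 * se)  # lower limits
exp(co + 1.96 * se)  # upper limits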
> head(pp <- fitted(test))
   academic   general  vocation
1 0.1482764 0.3382454 0.5134781
2 0.1202017 0.1806283 0.6991700
3 0.4186747 0.2368082 0.3445171
4 0.1726885 0.3508384 0.4764731
5 0.1001231 0.1689374 0.7309395
6 0.3533566 0.2377976 0.4088458
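# Optional check (a sketch added here, not in the original post): the fitted
# probabilities above can be reproduced by hand from the coefficient matrix.
# Row 1 of the data appears to have ses = "low" and write = 35 (the low / write = 35
# row of the lpp table further below shows the same probabilities).
b_general  <- c(2.852198, -0.5332810, -1.1628226, -0.0579287)   # (Intercept), sesmiddle, seshigh, write
b_vocation <- c(5.218260,  0.2913859, -0.9826649, -0.1136037)
x <- c(1, 0, 0, 35)   # intercept, sesmiddle = 0, seshigh = 0, write = 35
eta <- c(academic = 0, general = sum(b_general * x), vocation = sum(b_vocation * x))
exp(eta) / sum(exp(eta))   # about 0.148, 0.338, 0.513, matching row 1 of fitted(test)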

> dses <- data.frame(ses = c("low", "middle", "high"), write = mean(ml$write))
# here write is fixed at its mean, so each person's writing contribution is held constant and only ses varies
> predict(test, newdata = dses, "probs")  # predicted probabilities at the mean of write
   academic   general  vocation
1 0.4396845 0.3581917 0.2021238
2 0.4777488 0.2283353 0.2939159
3 0.7009007 0.1784939 0.1206054
> dwrite <- data.frame(ses = rep(c("low", "middle", "high"), each = 41), write = rep(c(30:70),3))
> # store the predicted probabilities for each value of ses and write
> pp.write <- cbind(dwrite, predict(test, newdata = dwrite, type = "probs", se = TRUE))
> # calculate the mean probabilities within each level of ses
> by(pp.write[, 3:5], pp.write$ses, colMeans)
pp.write$ses: high
 academic   general  vocation 
0.6164315 0.1808037 0.2027648 
-------------------------------------------------------- 
pp.write$ses: low
 academic   general  vocation 
0.3972977 0.3278174 0.2748849 
-------------------------------------------------------- 
pp.write$ses: middle
 academic   general  vocation 
0.4256198 0.2010864 0.3732938 
> #melt data set to long for ggplot2
> lpp <- melt(pp.write, id.vars = c("ses", "write"), value.name = "probability")
> head(lpp)  # view first few rows
  ses write variable probability
1 low    30 academic  0.09843588
2 low    31 academic  0.10716868
3 low    32 academic  0.11650390
4 low    33 academic  0.12645834
5 low    34 academic  0.13704576
6 low    35 academic  0.14827643
> # plot predicted probabilities across write values for each level of ses
> # facetted by program type
> ggplot(lpp, aes(x = write, y = probability, colour = ses)) + geom_line() + facet_grid(variable ~., scales = "free")



> ## The mlogit package offers another way to fit multinomial logit models (Fishing data example)
> install.packages("mlogit")
> library(Formula)
> library(maxLik)
> library(miscTools)
> library(mlogit)
> data("Fishing", package = "mlogit")
> Fish <- mlogit.data(Fishing,shape = "wide",choice = "mode")
> summary(mlogit(mode ~ 0|income, data = Fish))

Call:
mlogit(formula = mode ~ 0 | income, data = Fish, method = "nr")

Frequencies of alternatives:choice
  beach    boat charter    pier 
0.11337 0.35364 0.38240 0.15059 

nr method
4 iterations, 0h:0m:0s 
g'(-H)^-1g = 8.32E-07 
gradient close to zero 

Coefficients :
                       Estimate  Std. Error z-value  Pr(>|z|)    
(Intercept):boat     7.3892e-01  1.9673e-01  3.7560 0.0001727 ***
(Intercept):charter  1.3413e+00  1.9452e-01  6.8955 5.367e-12 ***
(Intercept):pier     8.1415e-01  2.2863e-01  3.5610 0.0003695 ***
income:boat          9.1906e-05  4.0664e-05  2.2602 0.0238116 *  
income:charter      -3.1640e-05  4.1846e-05 -0.7561 0.4495908    
income:pier         -1.4340e-04  5.3288e-05 -2.6911 0.0071223 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Log-Likelihood: -1477.2
McFadden R^2:  0.013736 
Likelihood ratio test : chisq = 41.145 (p.value = 6.0931e-09)
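As a small follow-up sketch (not part of the original output; the object name fm is just illustrative), the coefficients can be exponentiated into odds ratios relative to the reference alternative beach:

fm <- mlogit(mode ~ 0 | income, data = Fish)
exp(coef(fm))  # e.g. exp of income:boat is the change in the odds of boat vs beach per extra unit of income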

2.2 Ordinal logistic regression, i.e. ordered categorical (rank) data

Ordinal logistic regression: a case study

If ordered categorical data can reasonably be collapsed into a binary outcome, do so; otherwise it must be analysed as ordinal or multinomial data (the glm() calls near the end of this script illustrate the binary-collapse idea).



Variables in ologit.dta: apply, how likely the student is to apply to graduate school (unlikely / somewhat likely / very likely, an ordered factor); pared, whether a parent holds a graduate degree (1 = yes, 0 = no); public, whether the undergraduate institution is public or private (1 = public, 0 = private); gpa, the student's grade point average.

## The MASS package provides polr(), which fits ordered logit or ordered probit regression
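For orientation: with the logit link, polr() fits the proportional-odds model logit P(apply <= j) = zeta_j - (b1*pared + b2*public + b3*gpa), i.e. one common set of slopes b1..b3 plus a separate cut-point zeta_j for each of the two boundaries between the three apply levels; the common slopes are what make the model "ordered".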

require(foreign)
require(ggplot2)
require(MASS)
require(Hmisc)
require(reshape2)

dat <- read.dta("ologit.dta")
#View(dat)
head(dat)
## one at a time, table apply, pared, and public
lapply(dat[, c("apply", "pared", "public")], table)
## three way cross tabs (xtabs) and flatten the table
ftable(xtabs(~ public + apply + pared, data = dat))
summary(dat$gpa)
sd(dat$gpa)

ggplot(dat, aes(x = apply, y = gpa)) +
  geom_boxplot(size = .75) +
  geom_jitter(alpha = .5) +
  facet_grid(pared ~ public, margins = TRUE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

## fit ordered logit model and store results 'm'
m <- polr(apply ~ pared + public + gpa, data = dat, Hess=TRUE)
## view a summary of the model
summary(m)

## store table
(ctable <- coef(summary(m)))
## calculate and store p values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
## combined table
(ctable <- cbind(ctable, "p value" = p))

(ci <- confint(m)) # default method gives profiled CIs
confint.default(m) # CIs assuming normality
## odds ratios
exp(coef(m))
## OR and CI
exp(cbind(OR = coef(m), ci))
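## Interpretation note: these are proportional odds ratios; e.g. the exponentiated
## pared coefficient is the multiplicative change in the odds of being in a higher
## (more likely to apply) category when a parent has a graduate degree, assumed
## equal across both cut-points.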


## sf() computes the empirical log-odds (logits) of Y >= each level; comparing these
## across predictor groups is an informal check of the proportional odds
## (parallel slopes) assumption
sf <- function(y) {
  c('Y>=1' = qlogis(mean(y >= 1)),
    'Y>=2' = qlogis(mean(y >= 2)),
    'Y>=3' = qlogis(mean(y >= 3)))
}
(s <- with(dat, summary(as.numeric(apply) ~ pared + public + gpa, fun=sf)))

## the same idea with plain binary logistic models: dichotomise apply at each
## cut-point and compare the pared coefficients
glm(I(as.numeric(apply) >= 2) ~ pared, family="binomial", data = dat)

glm(I(as.numeric(apply) >= 3) ~ pared, family="binomial", data = dat)

## shift the logits so the Y>=2 column is zero; the Y>=3 column then shows the
## distance between the two cut-points within each group
s[, 4] <- s[, 4] - s[, 3]
s[, 3] <- s[, 3] - s[, 3]
s # print

## plot these relative logits; roughly similar distances across the groups of each
## predictor support the proportional odds assumption
plot(s, which = 1:3, pch = 1:3, xlab = 'logit', main = ' ', xlim = range(s[, 3:4]))

newdat <- data.frame(
  pared = rep(0:1, 200),
  public = rep(0:1, each = 200),
  gpa = rep(seq(from = 1.9, to = 4, length.out = 100), 4))
newdat <- cbind(newdat, predict(m, newdat, type = "probs"))

##show first few rows
head(newdat)

lnewdat <- melt(newdat, id.vars = c("pared", "public", "gpa"),
                variable.name = "Level", value.name="Probability")
## view first few rows
head(lnewdat)

ggplot(lnewdat, aes(x = gpa, y = Probability, colour = Level)) +
  geom_line() + facet_grid(pared ~ public, labeller="label_both")