27-第一次R语言作业
0.安装并加载需要用到的R包
> library(pacman)
> p_load(tidyverse,openxlsx)
1.请将数据读入R,并指出各个变量的格式
> df <- read.xlsx("C:/Users/Admin/Documents/R/hw/hw1.xlsx",rowNames = TRUE,sheet = 1,startRow = 1) %>% tbl_df()
> #查看数据结构,显示所有列数据类型
> str(df)
Result: Classes 'tbl_df', 'tbl' and 'data.frame': 200 obs. of 7 variables:
Result: $ Age : num 32.5 34.6 37.7 28.7 32.6 ...
Result: $ Years_at_Employer: num 9.39 11.97 12.46 1.39 7.49 ...
Result: $ Years_at_Address : num 0.2976 1.4851 0.0854 1.8376 0.2341 ...
Result: $ Income : num 37844 65765 61002 19953 24970 ...
Result: $ Credit_Card_Debt : num -3247 -15598 -11402 -1233 -1136 ...
Result: $ Automobile_Debt : num -4795 -17632 -7910 -2408 -397 ...
Result: $ Is_Default : num 0 1 NA 0 0 1 0 0 0 0 ...
结论:
样本数据集共200条记录,8个变量,全部为数值型(num)变量。
2.请作出Income对Years_at_Employer的散点图,你发现了哪些趋势和现象
> ggplot(df, aes(Income, Years_at_Employer)) +
+ geom_point(na.rm = T, col = "dodgerblue", size = 1) +
+ labs(x = "Income(ten thousands)",
+ y = "Years_at_Employer(years)",
+ title = "") +
+ scale_x_continuous(breaks = c(0e+00, 1e+05, 2e+05, 3e+05, 4e+05),
+ labels = c("0", "10", "20", "30", "40")) +
+ theme_minimal()
散点图
结论:
(1)人员Income主要集中在100000以下,Years_at_Employer主要集中在15年以下;
(2)Income与Years_at_Employer大体成正相关关系。
3.请找出Years_at_Employer至Automobile_Debt列的缺失值,并删除相应的行
> #查看Years_at_Employer至Automobile_Debt列的列标
> names(df)
Result: [1] "Age" "Years_at_Employer" "Years_at_Address" "Income"
Result: [5] "Credit_Card_Debt" "Automobile_Debt" "Is_Default"
> #提取Years_at_Employer至Automobile_Debt列
> df1 <- df[,2:7]; names(df1)
Result: [1] "Years_at_Employer" "Years_at_Address" "Income" "Credit_Card_Debt"
Result: [5] "Automobile_Debt" "Is_Default"
> #删除有缺失值的行
> df1 <- na.omit(df1)
>
> #统计还有多少个缺失值
> print(paste0("该数据集还有", as.character(sum(is.na(df1))), "个缺失值。"))
Result: [1] "该数据集还有0个缺失值。"
4.找出Income中的极端值并滤掉对应行的数据
> ggplot(df1, aes(y = Income)) +
+ geom_boxplot(outlier.colour = "red", outlier.shape = 8, outlier.size = 2,fill="dodgerblue") +
+ labs(title = "", y = "Income(ten thousands)") +
+ coord_flip() +
+ scale_y_continuous(breaks = c(0e+00, 1e+05, 2e+05, 3e+05, 4e+05),
+ labels = c("0", "10", "20", "30", "40")) +
+ theme_minimal() +
+ theme(plot.title = element_text(hjust = 0.5))
箱线图
> p_load(pander)
> # 筛选出极端值之外的所有值,并另存为数据集df2
> df2 <- subset(df1, Income <= 100000)
> pander(summary(df2$Income))
----------------------------------------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
------- --------- -------- ------- --------- -------
11522 22928 31945 37878 48850 97958
----------------------------------------------------
5.将income对数化,并画出直方图和density curve
> ggplot(df2, aes(Income)) +
+ geom_histogram(aes(y = ..density..), bins = 40, col = "white", fill = "dodgerblue") +
+ scale_x_log10() +
+ geom_density(size = 1, col = "violetred") +
+ labs(y=NULL) +
+ theme_minimal()
直方图
6.以Income作为因变量,Years_at_ Employer 作为自变量,进行OLS回归,写出回归的方程,并指出自变量系数是否在某一显著性水平上显著。同时,解释你的结果
> fit1 <- lm(Income ~ Years_at_Employer, df2)
>
> # 回归诊断
> p_load(gvlma)
> gvmodel <- gvlma(fit1)
> pander(summary(gvmodel))
Call:
lm(formula = Income ~ Years_at_Employer, data = df2)
Residuals:
Min 1Q Median 3Q Max
-25903 -10876 -1867 6585 48108
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18903.2 1884.4 10.03 <2e-16 ***
Years_at_Employer 2445.5 195.7 12.50 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 14680 on 171 degrees of freedom
Multiple R-squared: 0.4774, Adjusted R-squared: 0.4743
F-statistic: 156.2 on 1 and 171 DF, p-value: < 2.2e-16
ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
Level of Significance = 0.05
Call:
gvlma(x = fit1)
Value p-value Decision
Global Stat 33.09678 1.141e-06 Assumptions NOT satisfied!
Skewness 25.92822 3.544e-07 Assumptions NOT satisfied!
Kurtosis 6.99600 8.169e-03 Assumptions NOT satisfied!
Link Function 0.13181 7.166e-01 Assumptions acceptable.
Heteroscedasticity 0.04074 8.400e-01 Assumptions acceptable.
---------------------------------------------------------------------------
Value p-value Decision
------------------------ --------- ----------- ----------------------------
**Global Stat** 33.1 1.141e-06 Assumptions NOT satisfied!
**Skewness** 25.93 3.544e-07 Assumptions NOT satisfied!
**Kurtosis** 6.996 0.008169 Assumptions NOT satisfied!
**Link Function** 0.1318 0.7166 Assumptions acceptable.
**Heteroscedasticity** 0.04074 0.84 Assumptions acceptable.
---------------------------------------------------------------------------
结论:
(1)自变量系数和截距P值显著,说明拒绝原假设(Income和Years_at_Employer无线性关系),即Income与Years_at_Employer线性相关;
(2)回归方程为:Income = 18903.2 + 2445.5 * Years_at_Employer;
(3)修正后的R2值为0.4743,并不理想。而且回归诊断的Global Stat显示Assumptions NOT satisfied!说明回归模型违反了线性回归的部分假设条件。
7.另一方面,请随机取样20个,同问题7一样,做OLS回归。然后将该过程做100000次。你会得到100000个估计值,然后画出直方图,你发现了什么
> # 随机取样20个,并做OLS回归
> my_sample <- df2[sample(1:nrow(df2), 20, replace = T), c(1, 3)]
> fit2 <- lm(Income ~ Years_at_Employer, my_sample)
> pander(coef(fit2))
---------------------------------
(Intercept) Years_at_Employer
------------- -------------------
18506 2690
---------------------------------
7.1 使用for循环
> timestart <- Sys.time()
> #将上面过程做100000次,提取出截距项和系数的所有值
> intercept <- vector("double", 100000)
> coefficient <- vector("double", 100000)
> for (i in 1:100000) {
+ my_sample <- df2[sample(1:nrow(df2), 20, replace = T), c(1, 3)]
+ fit <- lm(Income ~ Years_at_Employer, data = my_sample)
+ intercept[i] <- coef(fit)[1]
+ coefficient[i] <- coef(fit)[2]
+ }
> #将值按列合并为数据框,并保存为数据集df3
> df3 <- as.data.frame(cbind(intercept, coefficient)); head(df3, n = 3)
Result: intercept coefficient
Result: 1 17693.46 2275.481
Result: 2 20724.79 1535.688
Result: 3 15767.28 2854.777
> timeend <- Sys.time()
> runningtime <- timeend - timestart
> print(runningtime)
Result: Time difference of 2.481035 mins
7.2 使用多线程
> #使用多线程计算for循环
> p_load(parallel,foreach,doParallel)
> timestart <- Sys.time()
>
> #查看系统核心数
> num_cores <- detectCores()
> #创建并注册集群
> clu <- makeCluster(num_cores)
> registerDoParallel(clu)
>
> foreach(i = 1:100000, .combine = rbind) %dopar% {
+ my_sample <- df2[sample(1:nrow(df2), 20, replace = T),c(1, 3)]
+ fit <- lm(Income ~ Years_at_Employer, data = my_sample)
+ values <- coef(fit)
+ intercept[i] <- values[1]
+ coefficient[i] <- values[2]
+ }
> #将值按列合并为数据框,并保存为数据集df3
> df3 <- as.data.frame(cbind(intercept, coefficient)); head(df3, n = 3)
Result: intercept coefficient
Result: 1 17693.46 2275.481
Result: 2 20724.79 1535.688
Result: 3 15767.28 2854.777
> #结束集群
> stopCluster(clu)
>
> timeend <- Sys.time()
> runningtime <- timeend - timestart
> print(runningtime)
Result: Time difference of 2.096281 mins
多线程比for循环快了约0.4分钟,大概24秒钟。
> #截距估计值的直方图,自定义直方图的组距(binwidth = )和分组数量(bins = )
> ggplot(df3, aes(intercept), binwidth = 1) +
+ #密度曲线的纵轴范围为0~1,而直方图的纵轴范围0~7.5e-5。需将直方图中的y值改为..density..
+ geom_histogram(aes(y = ..density..), bins = 100, fill = "dodgerblue") +
+ geom_density(size = 1, col = "violetred") +
+ geom_vline(xintercept = mean(intercept), col = "red", size = 1, linetype = 2) +
+ labs(title = "截距项值的分布情况",y="",x="") +
+ theme_minimal() +
+ theme(plot.title = element_text(hjust = 0.5))
截距项
> #检验是否为正态分布
> smp1 <- sample(intercept, 50, replace = T)
> shapiro.test(smp1)
Result:
Result: Shapiro-Wilk normality test
Result:
Result: data: smp1
Result: W = 0.97225, p-value = 0.2851
> #求平均值和方差
> (intercept_mean <- mean(intercept))
Result: [1] 18695.28
> (intercept_sd <- sd(intercept))
Result: [1] 4881.072
> #系数估计值的直方图
> ggplot(df3, aes(coefficient), binwidth = 1) +
+ #密度曲线的纵轴范围为0~1,而直方图的纵轴范围0~7.5e-5。需将直方图中的y值改为..density..
+ geom_histogram(aes(y = ..density..), bins = 100, fill = "dodgerblue") +
+ geom_density(size = 1, col = "violetred") +
+ geom_vline(xintercept = mean(coefficient), col = "red", size = 1, linetype = 2) +
+ labs(title = "系数值的分布情况",y="",x="") +
+ theme_minimal() +
+ theme(plot.title = element_text(hjust = 0.5))
系数估计值
> #检验是否为正态分布
> smp2 <- sample(coefficient, 50, replace = T)
> shapiro.test(smp2)
Result:
Result: Shapiro-Wilk normality test
Result:
Result: data: smp2
Result: W = 0.96673, p-value = 0.1698
> #求平均值和方差
> (coefficient_mean <- mean(coefficient))
Result: [1] 2466.919
> (coefficient_sd <- sd(coefficient))
Result: [1] 627.0743
结论:
(1)截距项和系数的分布近似为正态分布;
(2)通过shapiro.test()函数检验,随机取样50个数据,intercept的w值接近1,p值大于0.05,所以截距项近似呈正态分布;因为截距项中包括残差和截距,截距为常数,残差呈正态分布,所以总体也呈正态分布;
(3)通过shapiro.test()函数检验,随机取样50个数据,coefficient的w值接近1,p值大于0.05,所以系数也近似呈正态分布;
(4)与回归方程Income = 18903.2 + 2445.5 * Years_at_Employer大体保持一致。
8.Is_Default变量是0,1变量,1代表此人违约,0代表没有。请选择合适的模型进行回归,并解释你的理由。请解释你的模型的结果和发现
> #删除缺失值
> df4 <- na.omit(df)
> #根据是否违约画图
> ggplot(df4, aes(as.factor(Is_Default))) +
+ geom_bar(fill = c(3,2), width = 0.5, stat = "count") +
+ labs(x = "违约情况", y = "", title = "违约情况对比图") +
+ theme_minimal() +
+ theme(plot.title = element_text(hjust = 0.5))
违约情况
> #统计违约和未违约数量,并计算违约率
> ratio <- sum(df4$Is_Default == "1") / nrow(df4); ratio
Result: [1] 0.2393617
结论:
违约人数高达总人数的24%!
> #相关性检测
> p_load(corrplot)
> cor1 <- cor(df4)
> corrplot(cor1, method = "number", type = "upper", addgrid.col = "black",
+ tl.col = "black", tl.pos = "td", tl.srt = 30, tl.offset = 0.5)
相关性
结论:
总体来说违约与否跟其他因素的相关性都不强,是否违约跟Years_at_Employer和Credit_Card_Debt有较弱的负相关性,而Years_at_Employer跟Income有较强的正相关性;Credit_Card_Debt和Income有较强的负相关性。
> #逻辑回归分析
> df4$Is_Default = as.factor(df4$Is_Default); mode(df4$Is_Default)
Result: [1] "numeric"
> glm <- glm(Is_Default ~ .,
+ family = binomial(link = "logit"),
+ data = df4)
>
> #逐步寻优法
> logit_step <- step(glm, direction = c("both"))
Result: Start: AIC=160.56
Result: Is_Default ~ Age + Years_at_Employer + Years_at_Address + Income +
Result: Credit_Card_Debt + Automobile_Debt
Result:
Result: Df Deviance AIC
Result: - Age 1 147.12 159.12
Result: - Years_at_Address 1 148.07 160.07
Result: <none> 146.56 160.56
Result: - Automobile_Debt 1 149.33 161.33
Result: - Income 1 151.74 163.74
Result: - Credit_Card_Debt 1 169.37 181.37
Result: - Years_at_Employer 1 174.58 186.58
Result:
Result: Step: AIC=159.12
Result: Is_Default ~ Years_at_Employer + Years_at_Address + Income +
Result: Credit_Card_Debt + Automobile_Debt
Result:
Result: Df Deviance AIC
Result: - Years_at_Address 1 148.75 158.75
Result: <none> 147.12 159.12
Result: - Automobile_Debt 1 149.85 159.85
Result: + Age 1 146.56 160.56
Result: - Income 1 152.88 162.88
Result: - Credit_Card_Debt 1 169.94 179.94
Result: - Years_at_Employer 1 181.45 191.45
Result:
Result: Step: AIC=158.75
Result: Is_Default ~ Years_at_Employer + Income + Credit_Card_Debt +
Result: Automobile_Debt
Result:
Result: Df Deviance AIC
Result: <none> 148.75 158.75
Result: + Years_at_Address 1 147.12 159.12
Result: - Automobile_Debt 1 151.78 159.78
Result: + Age 1 148.07 160.07
Result: - Income 1 154.25 162.25
Result: - Credit_Card_Debt 1 171.34 179.34
Result: - Years_at_Employer 1 182.77 190.77
> pander(summary(logit_step))
-----------------------------------------------------------------------
Estimate Std. Error z value Pr(>|z|)
----------------------- ------------ ------------ --------- -----------
**(Intercept)** -0.3632 0.3256 -1.116 0.2646
**Years_at_Employer** -0.261 0.05456 -4.784 1.715e-06
**Income** -2.071e-05 8.221e-06 -2.519 0.01178
**Credit_Card_Debt** -0.0004274 0.0001052 -4.064 4.826e-05
**Automobile_Debt** -8.628e-05 4.865e-05 -1.773 0.07616
-----------------------------------------------------------------------
(Dispersion parameter for binomial family taken to be 1 )
-------------------- ---------------------------
Null deviance: 206.9 on 187 degrees of freedom
Residual deviance: 148.7 on 183 degrees of freedom
-------------------- ---------------------------
> #前向选择法
> logit_forward <- step(glm, direction = c("forward"))
> summary(logit_forward)
> #后向选择法
> logit_backward <- step(glm, direction = c("backward"))
> summary(logit_backward)
结论:
从模型结果可知,三种方法的模型结果基本保持一致:Years_at_Employer、Income、Credit_Card_Debt、Automobile_Debt与违约负相关(其中Credit_Card_Debt和Automobile_Debt为负数表示)。
9.此时,一个32岁,Years_at_Employer为11,Years_at_Address为1.5,Income为50000,信用卡债务为5000元,车债为2000元的人前来银行借款,请用你的模型预测他是否会违约
9.1 方法一:使用逻辑回归模型
> #提取出相关列并将数据拆分为训练集和测试集
> require(caTools)
> my_split <- sample.split(df4$Is_Default, SplitRatio = 0.8)
> train <- subset(df4, my_split == TRUE)
> test <- subset(df4, my_split == FALSE)
> #在训练集上构建模型
> glm2 <- glm(as.factor(Is_Default) ~ .,
+ family = binomial(link = "logit"),
+ data = train)
> pre <- predict(glm2, newdata = test[,-7], type='response')
> #绘制ROC曲线
> require(pROC)
> modelroc <- roc(test$Is_Default, pre)
> plot(modelroc, print.auc=TRUE, auc.polygon=TRUE,
+ grid=c(0.1, 0.2),
+ grid.col=c("green", "red"),
+ max.auc.polygon=TRUE, #整个图像填充
+ auc.polygon.col="dodgerblue", print.thres=TRUE,
+ main=" ROC曲线",
+ smooth = T #使ROC曲线变得光滑
+ )
ROC曲线
结论:
该模型的AUC为0.789,说明该模型的准确性不是很理想。
计算准确率:
> pre.lg <- ifelse(pre>0.5,1,0)
> test_logit <- cbind(test=test$Is_Default, pre=pre.lg) %>% as.data.frame()
> #统计预测错误的数量
> error <- test_logit$test - test_logit$pre
> #统计预测正确的数量占比
> accuracy = (nrow(test_logit) - sum(abs(error))) / nrow(test_logit) * 100
> print(paste0("该逻辑回归模型的预测准确率为:", round(accuracy, 1), "%"))
Result: [1] "该逻辑回归模型的预测准确率为:73.7%"
> #将这个人的数据加入数据集test_data
> test_data <- data.frame(Age = 32,
+ Years_at_Employer = 11,
+ Years_at_Address = 1.5,
+ Income = 50000,
+ Credit_Card_Debt = -5000,
+ Automobile_Debt = -2000)
> #在预测数据集中,概率大于0.5,违约;概率小于0.5,不违约
> pre2 <- predict(glm2, test_data, type = "response"); pre2
Result: 1
Result: 0.1321743
结论:
预测概率为0.1321743 < 0.5,所以他不会违约。
9.2 方法二:使用神经网络模型
> #神经网络模型预测
> require(nnet)
> # 隐蔽单元个数(size),初始随机数权值(rang),权值衰减参数(decay),最大迭代次数(maxit)
> mymodel <- nnet(as.factor(Is_Default) ~ ., data = train,size = 5,rang = 0.1,decay = 5e-4,maxit = 200)
Result: # weights: 41
Result: initial value 108.098787
Result: iter 10 value 76.656970
Result: iter 20 value 71.952643
Result: iter 30 value 71.951296
Result: iter 40 value 71.308911
Result: iter 50 value 69.773692
Result: iter 60 value 58.869826
Result: iter 70 value 55.627422
Result: iter 80 value 53.932401
Result: iter 90 value 53.670252
Result: iter 100 value 53.577037
Result: iter 110 value 53.234662
Result: iter 120 value 53.195270
Result: iter 130 value 52.764740
Result: iter 140 value 52.738319
Result: iter 150 value 52.202231
Result: iter 160 value 52.030954
Result: iter 170 value 51.914474
Result: iter 180 value 51.612487
Result: iter 190 value 51.320929
Result: iter 200 value 51.226467
Result: final value 51.226467
Result: stopped after 200 iterations
> pre3 <- predict(mymodel, newdata = test, type = "class"); pre3
Result: [1] "0" "1" "0" "1" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1"
Result: [23] "0" "1" "0" "0" "0" "1" "1" "1" "0" "0" "1" "0" "0" "0" "0" "0"
> p_load(e1071)
> conf <- confusionMatrix(as.factor(pre3),reference = as.factor(test$Is_Default))
> print(paste0("该逻辑回归模型的预测准确率为:", round(conf$overall[[1]] * 100, 1), "%"))
Result: [1] "该逻辑回归模型的预测准确率为:84.2%"
> # 使用模型预测
> result <- predict(mymodel, newdata = test_data, type = "class");result
Result: [1] "0"
结论:
结果为“0”,所以他不会违约。