R,笔记04
2018-12-16 本文已影响0人
按着易得
数据处理
> # 抽取数据去重复
# Return `data` with duplicated rows removed, keeping the first occurrence
# of each distinct row.
#
# data: a data frame; defaults to the built-in `iris` data set, so calling
#       de_dup() with no arguments behaves as before.
# Returns a data frame holding only the rows for which duplicated() is FALSE.
de_dup <- function(data = iris) {
  # Logical indexing is used instead of data[-which(duplicated(data)), ]:
  # when nothing is duplicated, which() yields integer(0), and negative
  # indexing with integer(0) selects zero rows instead of all rows.
  # drop = FALSE keeps a one-column input as a data frame.
  data[!duplicated(data), , drop = FALSE]
}
> head(de_dup())
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
#或者
iris[!duplicated(iris), ]
去掉NA
> head(airquality[complete.cases(airquality), ])
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
7 23 299 8.6 65 5 7
8 19 99 13.8 59 5 8
> # 或者na.omit(airquality)
with() identical() within()函数
> # 用with()函数计算鸢尾花,花萼与花瓣的长度比
> rwith <- with(iris, Sepal.Length / Petal.Length)
> head(rwith)
[1] 3.642857 3.500000 3.615385 3.066667 3.571429 3.176471
>
> # identical()的基本作用是检测两个对象是否完全相同:相同返回TRUE,否则返回FALSE
>
> # within函数与with类似,但主要用于列运算,将运算结果放入新列
> myiris <- iris # 不破坏内建数据集
> myiris <- within(myiris, lenth.ratio <- Sepal.Length / Petal.Length)
> head(myiris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species lenth.ratio
1 5.1 3.5 1.4 0.2 setosa 3.642857
2 4.9 3.0 1.4 0.2 setosa 3.500000
3 4.7 3.2 1.3 0.2 setosa 3.615385
4 4.6 3.1 1.5 0.2 setosa 3.066667
5 5.0 3.6 1.4 0.2 setosa 3.571429
6 5.4 3.9 1.7 0.4 setosa 3.176471
分割数据
> # 分割数据
> # cut()将数据按等宽区间切割(各区间宽度相同,而不是各组数量相同),处理后的数据是factor数据型态
> # 将state.x77对象依人口数做分割,分成5个等宽区间
> popu <- state.x77[, "Population"]
> cutpopu <- cut(popu, 5)
> head(cutpopu)
[1] (344,4.53e+03] (344,4.53e+03] (344,4.53e+03] (344,4.53e+03] (1.7e+04,2.12e+04] (344,4.53e+03]
Levels: (344,4.53e+03] (4.53e+03,8.7e+03] (8.7e+03,1.29e+04] (1.29e+04,1.7e+04] (1.7e+04,2.12e+04]
> #分割时,为各区间指定名称"high" "2nd" "3rd" "4th" "low";注意labels按区间由小到大依次对应,因此这里"high"对应的是人口最少的区间,"low"对应的是人口最多的区间(若想让"high"表示人口最多,应把labels的顺序反过来)
> cut(popu, 5, labels = c ("high", "2nd", "3rd", "4th", "low"))
[1] high high high high low high high high 2nd 2nd high high 3rd 2nd high high high high high high 2nd 3rd high high 2nd high high high high 2nd high low
[33] 2nd high 3rd high high 3rd high high high high 3rd high high 2nd high high 2nd high
Levels: high 2nd 3rd 4th low
>
> #要了解每一人口数分类有多少州
> x.popu <- cut(popu, 5, labels = c ("high", "2nd", "3rd", "4th", "low"))
> table(x.popu)
x.popu
high 2nd 3rd 4th low
34 9 5 0 2
合并数据
准备数据库
> mystates.x77 <- as.data.frame(state.x77)
> mystates.x77$name <- rownames(state.x77) # 给新数据增加一个字段name
> head(mystates.x77)
Population Income Illiteracy Life Exp Murder HS Grad Frost Area name
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708 Alabama
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432 Alaska
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417 Arizona
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945 Arkansas
California 21198 5114 1.1 71.71 10.3 62.6 20 156361 California
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766 Colorado
> row.names(mystates.x77) <- NULL # 删除原来行名
> head(mystates.x77)
Population Income Illiteracy Life Exp Murder HS Grad Frost Area name
1 3615 3624 2.1 69.05 15.1 41.3 20 50708 Alabama
2 365 6315 1.5 69.31 11.3 66.7 152 566432 Alaska
3 2212 4530 1.8 70.55 7.8 58.1 15 113417 Arizona
4 2110 3378 1.9 70.66 10.1 39.9 65 51945 Arkansas
5 21198 5114 1.1 71.71 10.3 62.6 20 156361 California
6 2541 4884 0.7 72.06 6.8 63.9 166 103766 Colorado
> #人口大于500万的选出来(原单位是千人数),同时新对象要有2个字段name 和 population
> mypopu.states <- mystates.x77[mystates.x77$Population > 5000, c("name", "Population")]
> mypopu.states
name Population
5 California 21198
9 Florida 8277
13 Illinois 11197
14 Indiana 5313
21 Massachusetts 5814
22 Michigan 9111
30 New Jersey 7333
32 New York 18076
33 North Carolina 5441
35 Ohio 10735
38 Pennsylvania 11860
43 Texas 12237
> #选出人均收入大于5000美元的(state.x77的Income是人均收入,不是月收入)。同时新对象要有2个字段name 和 Income
> myincomes.states <- mystates.x77[mystates.x77$Income > 5000, c("name", "Income")]
> myincomes.states
name Income
2 Alaska 6315
5 California 5114
7 Connecticut 5348
13 Illinois 5107
20 Maryland 5299
28 Nevada 5149
30 New Jersey 5237
34 North Dakota 5087
merge
> # merge()交集合并。merge(x, y, all = F),默认是交集合并
> # 合并上述两个数据中人口数超500万的州和人均收入超5000美元的州
> merge(mypopu.states, myincomes.states)
name Population Income
1 California 21198 5114
2 Illinois 11197 5107
3 New Jersey 7333 5237
>
> # 取并集
> merge(mypopu.states, myincomes.states, all = T)
name Population Income
1 Alaska NA 6315
2 California 21198 5114
3 Connecticut NA 5348
4 Florida 8277 NA
5 Illinois 11197 5107
6 Indiana 5313 NA
7 Maryland NA 5299
8 Massachusetts 5814 NA
9 Michigan 9111 NA
10 Nevada NA 5149
11 New Jersey 7333 5237
12 New York 18076 NA
13 North Carolina 5441 NA
14 North Dakota NA 5087
15 Ohio 10735 NA
16 Pennsylvania 11860 NA
17 Texas 12237 NA
>
> # merge参数all.x = T, 保证第一个对象的元素在合并中都存在,第二个如没有则NA填充
> merge(mypopu.states, myincomes.states, all.x = T)
name Population Income
1 California 21198 5114
2 Florida 8277 NA
3 Illinois 11197 5107
4 Indiana 5313 NA
5 Massachusetts 5814 NA
6 Michigan 9111 NA
7 New Jersey 7333 5237
8 New York 18076 NA
9 North Carolina 5441 NA
10 Ohio 10735 NA
11 Pennsylvania 11860 NA
12 Texas 12237 NA
match
> # match()类似于取两个对象交集,即第一对象x的某行数据若在第二个对象y中找到符合条件的数据,则返回第二个对象中
> # 相应数据的位置,否则返回NA。所以match后会返回一个与第一个对象长度相同的向量。
>
> # 找出符合人口数多于500万,同时人均收入超5000美元的行数据,在对象myincomes.states中的位置,返回的向量数值即是要的结果。
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index
[1] 2 NA 4 NA NA NA 7 NA NA NA NA NA
>
> # 提取出myincomes.states中人口数多于500万,同时人均收入超5000美元的州的数据。
> myincomes.states[na.omit(my.index), ]
name Income
5 California 5114
13 Illinois 5107
30 New Jersey 5237
>
> # %in%将返回与第一个对象长度相同的逻辑向量,在向量中为TRUE的元素是我们要的数据
> my.index2 <- mypopu.states$name %in% myincomes.states$name
> my.index2
[1] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
>
> #抽出mypopu.states中人口多于500万,同时人均收入超过5000美元的州数据
> mypopu.states[my.index2, ]
name Population
5 California 21198
13 Illinois 11197
30 New Jersey 7333
>
> # 换种做法
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index3 <- !is.na(my.index) #my.index中不是NA的赋值给my.index3
> my.index3
[1] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
> mypopu.states[my.index3, ]
name Population
5 California 21198
13 Illinois 11197
30 New Jersey 7333
排序
> # 排序sort/order
> # 数据框的排序,对state.info数据框依据Income字段执行升序排列。
> mystate.info <- data.frame(Region = state.region, state.x77)
> mystate.info
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
Connecticut Northeast 3100 5348 1.1 72.48 3.1 56.0 139 4862
Delaware South 579 4809 0.9 70.06 6.2 54.6 103 1982
Florida South 8277 4815 1.3 70.66 10.7 52.6 11 54090
Georgia South 4931 4091 2.0 68.54 13.9 40.6 60 58073
Hawaii West 868 4963 1.9 73.60 6.2 61.9 0 6425
Idaho West 813 4119 0.6 71.87 5.3 59.5 126 82677
Illinois North Central 11197 5107 0.9 70.14 10.3 52.6 127 55748
Indiana North Central 5313 4458 0.7 70.88 7.1 52.9 122 36097
Iowa North Central 2861 4628 0.5 72.56 2.3 59.0 140 55941
Kansas North Central 2280 4669 0.6 72.58 4.5 59.9 114 81787
Kentucky South 3387 3712 1.6 70.10 10.6 38.5 95 39650
Louisiana South 3806 3545 2.8 68.76 13.2 42.2 12 44930
Maine Northeast 1058 3694 0.7 70.39 2.7 54.7 161 30920
Maryland South 4122 5299 0.9 70.22 8.5 52.3 101 9891
Massachusetts Northeast 5814 4755 1.1 71.83 3.3 58.5 103 7826
Michigan North Central 9111 4751 0.9 70.63 11.1 52.8 125 56817
Minnesota North Central 3921 4675 0.6 72.96 2.3 57.6 160 79289
Mississippi South 2341 3098 2.4 68.09 12.5 41.0 50 47296
Missouri North Central 4767 4254 0.8 70.69 9.3 48.8 108 68995
Montana West 746 4347 0.6 70.56 5.0 59.2 155 145587
Nebraska North Central 1544 4508 0.6 72.60 2.9 59.3 139 76483
Nevada West 590 5149 0.5 69.03 11.5 65.2 188 109889
New Hampshire Northeast 812 4281 0.7 71.23 3.3 57.6 174 9027
New Jersey Northeast 7333 5237 1.1 70.93 5.2 52.5 115 7521
New Mexico West 1144 3601 2.2 70.32 9.7 55.2 120 121412
New York Northeast 18076 4903 1.4 70.55 10.9 52.7 82 47831
North Carolina South 5441 3875 1.8 69.21 11.1 38.5 80 48798
North Dakota North Central 637 5087 0.8 72.78 1.4 50.3 186 69273
Ohio North Central 10735 4561 0.8 70.82 7.4 53.2 124 40975
Oklahoma South 2715 3983 1.1 71.42 6.4 51.6 82 68782
Oregon West 2284 4660 0.6 72.13 4.2 60.0 44 96184
Pennsylvania Northeast 11860 4449 1.0 70.43 6.1 50.2 126 44966
Rhode Island Northeast 931 4558 1.3 71.90 2.4 46.4 127 1049
South Carolina South 2816 3635 2.3 67.96 11.6 37.8 65 30225
South Dakota North Central 681 4167 0.5 72.08 1.7 53.3 172 75955
Tennessee South 4173 3821 1.7 70.11 11.0 41.8 70 41328
Texas South 12237 4188 2.2 70.90 12.2 47.4 35 262134
Utah West 1203 4022 0.6 72.90 4.5 67.3 137 82096
Vermont Northeast 472 3907 0.6 71.64 5.5 57.1 168 9267
Virginia South 4981 4701 1.4 70.08 9.5 47.8 85 39780
Washington West 3559 4864 0.6 71.72 4.3 63.5 32 66570
West Virginia South 1799 3617 1.4 69.48 6.7 41.6 100 24070
Wisconsin North Central 4589 4468 0.7 72.48 3.0 54.5 149 54464
Wyoming West 376 4566 0.6 70.29 6.9 62.9 173 97203
> head(mystate.info)
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
> state.info <- mystate.info[1:15, ]
> inc.order <- order(state.info$Income) # 默认升序
> state.info[inc.order, ]
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Georgia South 4931 4091 2.0 68.54 13.9 40.6 60 58073
Idaho West 813 4119 0.6 71.87 5.3 59.5 126 82677
Indiana North Central 5313 4458 0.7 70.88 7.1 52.9 122 36097
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Iowa North Central 2861 4628 0.5 72.56 2.3 59.0 140 55941
Delaware South 579 4809 0.9 70.06 6.2 54.6 103 1982
Florida South 8277 4815 1.3 70.66 10.7 52.6 11 54090
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
Hawaii West 868 4963 1.9 73.60 6.2 61.9 0 6425
Illinois North Central 11197 5107 0.9 70.14 10.3 52.6 127 55748
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Connecticut Northeast 3100 5348 1.1 72.48 3.1 56.0 139 4862
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
>
> # 排序时增加次要键值,格式:order(主要键值, 次要键值, ……)
> # 以state.info 数据框为例,将Region作为主要键值,Income作为次要键值,升序排。
> inc.order2 <- order(state.info$Region, state.info$Income)
> state.info[inc.order2, ]
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Connecticut Northeast 3100 5348 1.1 72.48 3.1 56.0 139 4862
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Georgia South 4931 4091 2.0 68.54 13.9 40.6 60 58073
Delaware South 579 4809 0.9 70.06 6.2 54.6 103 1982
Florida South 8277 4815 1.3 70.66 10.7 52.6 11 54090
Indiana North Central 5313 4458 0.7 70.88 7.1 52.9 122 36097
Iowa North Central 2861 4628 0.5 72.56 2.3 59.0 140 55941
Illinois North Central 11197 5107 0.9 70.14 10.3 52.6 127 55748
Idaho West 813 4119 0.6 71.87 5.3 59.5 126 82677
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
Hawaii West 868 4963 1.9 73.60 6.2 61.9 0 6425
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
> # 在排序结果中South排在Northeast和North Central之间,错了吗?没有错:state.region是一个因子(可用class()确认)。
> # 对因子而言,order()按levels的先后顺序(Northeast、South、North Central、West)排序,而不是按字母顺序,所以应该小心。
>
> # 混合排序。部分字段升序排,部分字段降序排,用xtfrm(),可将原向量转为数值向量,当想要对某个字段降序排时,在xtfrm()前加上负号"-"即可
>
> #以state.info为例,将Region作为主要键值升序排,Income作次要键值降序排。
> mix.order <- order(state.info$Region, -xtfrm(state.info$Income))
> state.info[mix.order, ]
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Connecticut Northeast 3100 5348 1.1 72.48 3.1 56.0 139 4862
Florida South 8277 4815 1.3 70.66 10.7 52.6 11 54090
Delaware South 579 4809 0.9 70.06 6.2 54.6 103 1982
Georgia South 4931 4091 2.0 68.54 13.9 40.6 60 58073
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
Illinois North Central 11197 5107 0.9 70.14 10.3 52.6 127 55748
Iowa North Central 2861 4628 0.5 72.56 2.3 59.0 140 55941
Indiana North Central 5313 4458 0.7 70.88 7.1 52.9 122 36097
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Hawaii West 868 4963 1.9 73.60 6.2 61.9 0 6425
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Idaho West 813 4119 0.6 71.87 5.3 59.5 126 82677
公式符号等
> # 公式符号,指的是统计学符号,基本的如下
> # y ~ a y是a的函数
> # y ~ a + b y是a和b的函数
> # y ~ a - b y是a的函数但排除b
>
> # 认识长格式数据(Long Format)与宽格式数据(Wide Format)
> # reshape2扩展包的melt()函数/dcast()函数