R,笔记04

2018-12-16  本文已影响0人  按着易得

数据处理

> # 抽取数据去重复
> de_dup <- function()
+ {
+   i <- which(duplicated(iris))
+   x <- iris[-i, ]
+   #print(x)
+ }
> head(de_dup())
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
#或者
iris[!duplicated(iris), ]

去掉NA

> head(airquality[complete.cases(airquality), ])
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
7    23     299  8.6   65     5   7
8    19      99 13.8   59     5   8
> # 或者na.omit(airquality)

with() identical() within()函数

> # 用with()函数计算鸢尾花,花萼与花瓣的长度比
> rwith <- with(iris, Sepal.Length / Petal.Length)
> head(rwith)
[1] 3.642857 3.500000 3.615385 3.066667 3.571429 3.176471
> 
> # identical()基本作用是检测两个对象是否完全相同,相同返回TRUE,否则返回FALSE
> 
> # within函数与with类似,但主要用于列运算,将运算结果放入新列
> myiris <- iris # 不破坏内建数据集
> myiris <- within(myiris, lenth.ratio <- Sepal.Length / Petal.Length)
> head(myiris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species lenth.ratio
1          5.1         3.5          1.4         0.2  setosa    3.642857
2          4.9         3.0          1.4         0.2  setosa    3.500000
3          4.7         3.2          1.3         0.2  setosa    3.615385
4          4.6         3.1          1.5         0.2  setosa    3.066667
5          5.0         3.6          1.4         0.2  setosa    3.571429
6          5.4         3.9          1.7         0.4  setosa    3.176471

分割数据

> # 分割数据
> # cut()按取值范围将数据切成等宽的区间(各区间宽度相同,落入各区间的个数不一定相同),处理后的数据是factor数据型态
> # 将state.x77对象依人口数做分割,分成5等份
> popu <- state.x77[, "Population"]
> cutpopu <- cut(popu, 5)
> head(cutpopu)
[1] (344,4.53e+03]     (344,4.53e+03]     (344,4.53e+03]     (344,4.53e+03]     (1.7e+04,2.12e+04] (344,4.53e+03]    
Levels: (344,4.53e+03] (4.53e+03,8.7e+03] (8.7e+03,1.29e+04] (1.29e+04,1.7e+04] (1.7e+04,2.12e+04]

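> #分割时,给5个区间分别命名"high" "2nd" "3rd" "4th" "low"。注意:labels是按区间由小到大依次对应的,所以下面的写法中"high"实际标注的是人口最少的区间,"low"标注的是人口最多的区间;若要让"high"代表人口最多,应写成labels = c("low", "4th", "3rd", "2nd", "high")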
> cut(popu, 5, labels =  c ("high", "2nd", "3rd", "4th", "low"))
 [1] high high high high low  high high high 2nd  2nd  high high 3rd  2nd  high high high high high high 2nd  3rd  high high 2nd  high high high high 2nd  high low 
[33] 2nd  high 3rd  high high 3rd  high high high high 3rd  high high 2nd  high high 2nd  high
Levels: high 2nd 3rd 4th low
> 
> #要了解每一人口数分类有多少州
> x.popu <- cut(popu, 5, labels =  c ("high", "2nd", "3rd", "4th", "low"))
> table(x.popu)
x.popu
high  2nd  3rd  4th  low 
  34    9    5    0    2 

合并数据

准备数据库
> mystates.x77 <- as.data.frame(state.x77)
> mystates.x77$name <- rownames(state.x77) # 给新数据增加一个字段name
> head(mystates.x77)
           Population Income Illiteracy Life Exp Murder HS Grad Frost   Area       name
Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708    Alabama
Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432     Alaska
Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417    Arizona
Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945   Arkansas
California      21198   5114        1.1    71.71   10.3    62.6    20 156361 California
Colorado         2541   4884        0.7    72.06    6.8    63.9   166 103766   Colorado
> row.names(mystates.x77) <- NULL # 删除原来行名
> head(mystates.x77)
  Population Income Illiteracy Life Exp Murder HS Grad Frost   Area       name
1       3615   3624        2.1    69.05   15.1    41.3    20  50708    Alabama
2        365   6315        1.5    69.31   11.3    66.7   152 566432     Alaska
3       2212   4530        1.8    70.55    7.8    58.1    15 113417    Arizona
4       2110   3378        1.9    70.66   10.1    39.9    65  51945   Arkansas
5      21198   5114        1.1    71.71   10.3    62.6    20 156361 California
6       2541   4884        0.7    72.06    6.8    63.9   166 103766   Colorado

> #人口大于500万的选出来(原单位是千人数),同时新对象要有2个字段name 和 population
> mypopu.states <- mystates.x77[mystates.x77$Population > 5000, c("name", "Population")]
> mypopu.states
             name Population
5      California      21198
9         Florida       8277
13       Illinois      11197
14        Indiana       5313
21  Massachusetts       5814
22       Michigan       9111
30     New Jersey       7333
32       New York      18076
33 North Carolina       5441
35           Ohio      10735
38   Pennsylvania      11860
43          Texas      12237

> #选出人均收入大于5000美元的。同时新对象要有2个字段name 和 Income
> myincomes.states <- mystates.x77[mystates.x77$Income > 5000, c("name", "Income")]
> myincomes.states
           name Income
2        Alaska   6315
5    California   5114
7   Connecticut   5348
13     Illinois   5107
20     Maryland   5299
28       Nevada   5149
30   New Jersey   5237
34 North Dakota   5087

merge

> # merge()交集合并。merge(x, y, all = FALSE),默认是交集合并
> # 合并上述两个数据中人数超500万的州和人均收入超5000美元的州
> merge(mypopu.states, myincomes.states)
        name Population Income
1 California      21198   5114
2   Illinois      11197   5107
3 New Jersey       7333   5237
> 
> # 取并集
> merge(mypopu.states, myincomes.states, all = T)
             name Population Income
1          Alaska         NA   6315
2      California      21198   5114
3     Connecticut         NA   5348
4         Florida       8277     NA
5        Illinois      11197   5107
6         Indiana       5313     NA
7        Maryland         NA   5299
8   Massachusetts       5814     NA
9        Michigan       9111     NA
10         Nevada         NA   5149
11     New Jersey       7333   5237
12       New York      18076     NA
13 North Carolina       5441     NA
14   North Dakota         NA   5087
15           Ohio      10735     NA
16   Pennsylvania      11860     NA
17          Texas      12237     NA
> 
> # merge参数all.x = T, 保证第一个对象的元素在合并中都存在,第二个如没有则NA填充
> merge(mypopu.states, myincomes.states, all.x = T)
             name Population Income
1      California      21198   5114
2         Florida       8277     NA
3        Illinois      11197   5107
4         Indiana       5313     NA
5   Massachusetts       5814     NA
6        Michigan       9111     NA
7      New Jersey       7333   5237
8        New York      18076     NA
9  North Carolina       5441     NA
10           Ohio      10735     NA
11   Pennsylvania      11860     NA
12          Texas      12237     NA

match

> # match()类似于取两个对象交集,即第一对象x的某行数据若在第二个对象y中找到符合条件的数据,则返回第二个对象中
> # 相应数据的位置,否则返回NA。所以match后会返回一个与第一个对象长度相同的向量。
> 
> # 找出符合人口数多于500万,同时人均收入超5000美元的行数据,在对象myincomes.states中的位置,返回向量中的非NA数值即是要的结果。
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index
 [1]  2 NA  4 NA NA NA  7 NA NA NA NA NA
> 
> # 提取出myincomes.states中人口数多于500万,同时人均收入超5000美元的州的数据。
> myincomes.states[na.omit(my.index), ]
         name Income
5  California   5114
13   Illinois   5107
30 New Jersey   5237
> 
> # %in%将返回与第一个对象长度相同的逻辑向量,在向量中为TRUE的元素是我们要的数据
> my.index2 <- mypopu.states$name %in% myincomes.states$name
> my.index2
 [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
> 
> #抽出mypopu.states中人口多于500万,同时人均收入过5000美元的州数据
> mypopu.states[my.index2, ]
         name Population
5  California      21198
13   Illinois      11197
30 New Jersey       7333
> 
> # 换种做法
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index3 <- !is.na(my.index) #my.index中不是NA的赋值给my.index3
> my.index3
 [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
> mypopu.states[my.index3, ]
         name Population
5  California      21198
13   Illinois      11197
30 New Jersey       7333

排序

> # 排序sort/order
> # 数据框的排序,对state.info数据框依据Income字段执行升序排列。
> mystate.info <- data.frame(Region = state.region, state.x77)
> mystate.info
                      Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Alabama                South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Alaska                  West        365   6315        1.5    69.31   11.3    66.7   152 566432
Arizona                 West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Arkansas               South       2110   3378        1.9    70.66   10.1    39.9    65  51945
California              West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Colorado                West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Connecticut        Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Delaware               South        579   4809        0.9    70.06    6.2    54.6   103   1982
Florida                South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Georgia                South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Hawaii                  West        868   4963        1.9    73.60    6.2    61.9     0   6425
Idaho                   West        813   4119        0.6    71.87    5.3    59.5   126  82677
Illinois       North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
Indiana        North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Iowa           North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Kansas         North Central       2280   4669        0.6    72.58    4.5    59.9   114  81787
Kentucky               South       3387   3712        1.6    70.10   10.6    38.5    95  39650
Louisiana              South       3806   3545        2.8    68.76   13.2    42.2    12  44930
Maine              Northeast       1058   3694        0.7    70.39    2.7    54.7   161  30920
Maryland               South       4122   5299        0.9    70.22    8.5    52.3   101   9891
Massachusetts      Northeast       5814   4755        1.1    71.83    3.3    58.5   103   7826
Michigan       North Central       9111   4751        0.9    70.63   11.1    52.8   125  56817
Minnesota      North Central       3921   4675        0.6    72.96    2.3    57.6   160  79289
Mississippi            South       2341   3098        2.4    68.09   12.5    41.0    50  47296
Missouri       North Central       4767   4254        0.8    70.69    9.3    48.8   108  68995
Montana                 West        746   4347        0.6    70.56    5.0    59.2   155 145587
Nebraska       North Central       1544   4508        0.6    72.60    2.9    59.3   139  76483
Nevada                  West        590   5149        0.5    69.03   11.5    65.2   188 109889
New Hampshire      Northeast        812   4281        0.7    71.23    3.3    57.6   174   9027
New Jersey         Northeast       7333   5237        1.1    70.93    5.2    52.5   115   7521
New Mexico              West       1144   3601        2.2    70.32    9.7    55.2   120 121412
New York           Northeast      18076   4903        1.4    70.55   10.9    52.7    82  47831
North Carolina         South       5441   3875        1.8    69.21   11.1    38.5    80  48798
North Dakota   North Central        637   5087        0.8    72.78    1.4    50.3   186  69273
Ohio           North Central      10735   4561        0.8    70.82    7.4    53.2   124  40975
Oklahoma               South       2715   3983        1.1    71.42    6.4    51.6    82  68782
Oregon                  West       2284   4660        0.6    72.13    4.2    60.0    44  96184
Pennsylvania       Northeast      11860   4449        1.0    70.43    6.1    50.2   126  44966
Rhode Island       Northeast        931   4558        1.3    71.90    2.4    46.4   127   1049
South Carolina         South       2816   3635        2.3    67.96   11.6    37.8    65  30225
South Dakota   North Central        681   4167        0.5    72.08    1.7    53.3   172  75955
Tennessee              South       4173   3821        1.7    70.11   11.0    41.8    70  41328
Texas                  South      12237   4188        2.2    70.90   12.2    47.4    35 262134
Utah                    West       1203   4022        0.6    72.90    4.5    67.3   137  82096
Vermont            Northeast        472   3907        0.6    71.64    5.5    57.1   168   9267
Virginia               South       4981   4701        1.4    70.08    9.5    47.8    85  39780
Washington              West       3559   4864        0.6    71.72    4.3    63.5    32  66570
West Virginia          South       1799   3617        1.4    69.48    6.7    41.6   100  24070
Wisconsin      North Central       4589   4468        0.7    72.48    3.0    54.5   149  54464
Wyoming                 West        376   4566        0.6    70.29    6.9    62.9   173  97203
> head(mystate.info)
           Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Alabama     South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Alaska       West        365   6315        1.5    69.31   11.3    66.7   152 566432
Arizona      West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Arkansas    South       2110   3378        1.9    70.66   10.1    39.9    65  51945
California   West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Colorado     West       2541   4884        0.7    72.06    6.8    63.9   166 103766
> state.info <- mystate.info[1:15, ]
> inc.order <- order(state.info$Income) # 默认升序
> state.info[inc.order, ]
                   Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677
Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
> 
> # 排序时增加次要键值,格式:order(主要键值, 次要键值, ……)
> # 以state.info 数据框为例,将Region作为主要键值,Income作为次要键值,升序排。
> inc.order2 <- order(state.info$Region, state.info$Income)
> state.info[inc.order2, ]
                   Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677
Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
> # 在排序结果中south在northeast和north central之间,错了吗?这是由于state.region是一个因子,class()可知。
> # 对因子而言order的排序,相当于是执行levels排序,所以应该小心。
> 
> # 混合排序。部分字段升序排,部分字段降序排,用xtfrm(),可将原向量转为数值向量,当想要某个字段以相反方式(降序)排序时,在xtfrm()前加上负号-即可
> 
> #以state.info为例,将Region作为主要键值升序排,Income作次要键值降序排。
> mix.order <- order(state.info$Region, -xtfrm(state.info$Income))
> state.info[mix.order, ]
                   Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677

公式符号等

> # 公式符号,指的是统计学符号,基本的如下
> # y ~ a y是a的函数
> # y ~ a + b y是a和b的函数
> # y ~ a - b y是a的函数但排除b
> 
> # 认识长格式数据(Long Format)与宽格式数据(Wide Format)
> # reshape2扩展包的melt()函数/dcast()函数

上一篇下一篇

猜你喜欢

热点阅读