R for data science ||使用readr进行数据导入
2019-07-19 本文已影响11人
周运来就是我
使用R包提供的数据是学习数据科学工具的好方法,但是在某个时候,您希望停止学习,开始使用自己的数据。在本章中,您将学习如何将纯文本矩形文件读入R。在这里,我们只讨论数据导入的皮毛,但是其中许多原则同样适用于其他形式的数据。
library(tidyverse)
setwd("D:\\Users\\Administrator\\Desktop\\RStudio\\R-Programming")
heights <- read_csv("heights.csv")
Parsed with column specification:
cols(
earn = col_double(),
height = col_double(),
sex = col_character(),
ed = col_double(),
age = col_double(),
race = col_character()
)
?read_csv()
? read_csv2()
? read_tsv()
? read_delim()
?read_fwf()
?read_log()
直接创建行内csv文件。
read_csv("a,b,c
1,2,3
4,5,6")
# A tibble: 2 x 3
a b c
<dbl> <dbl> <dbl>
1 1 2 3
2 4 5 6
用skip=n来跳过前n行。
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
# A tibble: 1 x 3
x y z
<dbl> <dbl> <dbl>
1 1 2 3
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")
# A tibble: 1 x 3
x y z
<dbl> <dbl> <dbl>
1 1 2 3
无列名
read_csv("1,2,3\n4,5,6", col_names = FALSE)
# A tibble: 2 x 3
X1 X2 X3
<dbl> <dbl> <dbl>
1 1 2 3
2 4 5 6
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
# A tibble: 2 x 3
x y z
<dbl> <dbl> <dbl>
1 1 2 3
2 4 5 6
与R基础包进行比较
- 速度更快
- 可以生成tibble,不会将字符串向量转化为因子,不使用行名称,也不会随意改变列名称。
- 结果更易于复现(reproducible)。
解析向量
str(parse_logical(c("TRUE", "FALSE", "NA")))
#> logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
#> int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
#> Date[1:2], format: "2010-01-01" "1979-10-14"
str(parse_integer(c("1", "2", "a")))
Warning: 1 parsing failure.
row col expected actual
3 -- an integer a
int [1:3] 1 2 NA
- attr(*, "problems")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 1 obs. of 4 variables:
..$ row : int 3
..$ col : int NA
..$ expected: chr "an integer"
..$ actual : chr "a"
数值
parse_double("1.23")
#> [1] 1.23
parse_double("1,23", locale = locale(decimal_mark = ","))
#> [1] 1.23
parse_number("$100")
#> [1] 100
parse_number("20%")
#> [1] 20
parse_number("It cost $123.45")
#> [1] 123
# Used in America
parse_number("$123,456,789")
#> [1] 1.23e+08
# Used in many parts of Europe
parse_number("123.456.789", locale = locale(grouping_mark = "."))
#> [1] 1.23e+08
# Used in Switzerland
parse_number("123'456'789", locale = locale(grouping_mark = "'"))
#> [1] 1.23e+08
字符串
#In R, we can get at the underlying representation of a string using charToRaw():
charToRaw("Hadley")
#> [1] 48 61 64 6c 65 79
x1 <- "El Ni\xf1o was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
x1
#> [1] "El Ni\xf1o was particularly bad this year"
x2
#> [1] "\x82\xb1\x82\xf1\x82ɂ\xbf\x82\xcd"
parse_character(x1, locale = locale(encoding = "Latin1"))
#> [1] "El Niño was particularly bad this year"
parse_character(x2, locale = locale(encoding = "Shift-JIS"))
#> [1] "こんにちは"
查看编码格式
guess_encoding(charToRaw(x1))
#> # A tibble: 2 x 2
#> encoding confidence
#> <chr> <dbl>
#> 1 ISO-8859-1 0.46
#> 2 ISO-8859-9 0.23
guess_encoding(charToRaw(x2))
#> # A tibble: 1 x 2
#> encoding confidence
#> <chr> <dbl>
#> 1 KOI8-R 0.42
因子
fruit <- c("apple", "banana")
parse_factor(c("apple", "banana", "bananana"), levels = fruit)
#> Warning: 1 parsing failure.
#> row col expected actual
#> 3 -- value in level set bananana
#> [1] apple banana <NA>
#> attr(,"problems")
#> # A tibble: 1 x 4
#> row col expected actual
#> <int> <int> <chr> <chr>
#> 1 3 NA value in level set bananana
#> Levels: apple banana
时间
parse_datetime("2010-10-01T2010")
#> [1] "2010-10-01 20:10:00 UTC"
# If time is omitted, it will be set to midnight
parse_datetime("20101010")
#> [1] "2010-10-10 UTC"
parse_date("2010-10-01")
#> [1] "2010-10-01"
library(hms)
parse_time("01:10 am")
#> 01:10:00
parse_time("20:10:01")
#> 20:10:01
parse_date("01/02/15", "%m/%d/%y")
#> [1] "2015-01-02"
parse_date("01/02/15", "%d/%m/%y")
#> [1] "2015-02-01"
parse_date("01/02/15", "%y/%m/%d")
#> [1] "2001-02-15"
parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
#> [1] "2015-01-01"
解析文件
既然您已经了解了如何解析单个向量,现在就回到开始部分,研究readr如何解析文件。在本节中,您将了解两个新内容:
- readr如何自动猜测每个列的类型。
- 如何修改默认值。
启发式
guess_parser("2010-10-01")
#> [1] "date"
guess_parser("15:01")
#> [1] "time"
guess_parser(c("TRUE", "FALSE"))
#> [1] "logical"
guess_parser(c("1", "5", "9"))
#> [1] "double"
guess_parser(c("12,352,561"))
#> [1] "number"
str(parse_guess("2010-10-10"))
#> Date[1:1], format: "2010-10-10"
challenge <- read_csv(readr_example("challenge.csv"))
Parsed with column specification:
cols(
x = col_double(),
y = col_logical()
)
Warning: 1000 parsing failures.
row col expected actual file
1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
.... ... .................. .......... ................................................
See problems(...) for more details.
这里打印了两部分内容:根据前1000行猜测出的列规范,以及前5条解析失败记录。显式地调用problems()把这些问题提取出来总是一个好主意,这样您就可以更深入地研究它们:
problems(challenge)
# A tibble: 1,000 x 5
row col expected actual file
<int> <chr> <chr> <chr> <chr>
1 1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
2 1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
3 1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
4 1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
5 1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
6 1006 y 1/0/T/F/TRUE/FALSE 2016-04-17 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
7 1007 y 1/0/T/F/TRUE/FALSE 2011-05-14 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
8 1008 y 1/0/T/F/TRUE/FALSE 2020-07-18 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
9 1009 y 1/0/T/F/TRUE/FALSE 2011-04-30 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
10 1010 y 1/0/T/F/TRUE/FALSE 2010-05-11 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
# ... with 990 more rows
一个好的策略是逐列工作,直到没有问题为止。例如,如果把x列按整数解析(见下方代码),会出现很多解析问题——整数值后面有尾随字符。这意味着我们需要改用double(双精度)解析器。
challenge <- read_csv(
readr_example("challenge.csv"),
col_types = cols(
x = col_integer(),
y = col_character()
)
)
Warning: 1000 parsing failures.
row col expected actual file
1001 x no trailing characters .23837975086644292 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
1002 x no trailing characters .41167997173033655 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
1003 x no trailing characters .7460716762579978 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
1004 x no trailing characters .723450553836301 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
1005 x no trailing characters .614524137461558 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
.... ... ...................... .................. ................................................
See problems(...) for more details.
challenge <- read_csv(
readr_example("challenge.csv"),
col_types = cols(
x = col_double(),
y = col_character()
)
)
tail(challenge)
# A tibble: 6 x 2
x y
<dbl> <chr>
1 0.805 2019-11-21
2 0.164 2018-03-29
3 0.472 2014-08-04
4 0.718 2015-08-16
5 0.270 2020-02-04
6 0.608 2019-01-06
challenge <- read_csv(
readr_example("challenge.csv"),
col_types = cols(
x = col_double(),
y = col_date()
)
)
tail(challenge)
#> # A tibble: 6 x 2
#> x y
#> <dbl> <date>
#> 1 0.805 2019-11-21
#> 2 0.164 2018-03-29
#> 3 0.472 2014-08-04
#> 4 0.718 2015-08-16
#> 5 0.270 2020-02-04
#> 6 0.608 2019-01-06
challenge2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001)
#> Parsed with column specification:
#> cols(
#> x = col_double(),
#> y = col_date(format = "")
#> )
challenge2
#> # A tibble: 2,000 x 2
#> x y
#> <dbl> <date>
#> 1 404 NA
#> 2 4172 NA
#> 3 3004 NA
#> 4 787 NA
#> 5 37 NA
#> 6 2332 NA
#> # … with 1,994 more rows
challenge2 <- read_csv(readr_example("challenge.csv"),
col_types = cols(.default = col_character())
)
challenge2
# A tibble: 2,000 x 2
x y
<chr> <chr>
1 404 NA
2 4172 NA
3 3004 NA
4 787 NA
5 37 NA
6 2332 NA
7 2489 NA
8 1449 NA
9 3665 NA
10 3863 NA
# ... with 1,990 more rows
df <- tribble(
~x, ~y,
"1", "1.21",
"2", "2.32",
"3", "4.56"
)
df
#> # A tibble: 3 x 2
#> x y
#> <chr> <chr>
#> 1 1 1.21
#> 2 2 2.32
#> 3 3 4.56
# Note the column types
type_convert(df)
#> Parsed with column specification:
#> cols(
#> x = col_double(),
#> y = col_double()
#> )
#> # A tibble: 3 x 2
#> x y
#> <dbl> <dbl>
#> 1 1 1.21
#> 2 2 2.32
#> 3 3 4.56
文件写出
readr还提供了两个将数据写入磁盘的有用函数:write_csv()和write_tsv()。这两个函数都通过以下方式提高输出文件被正确读回的可能性:
- 总是用UTF-8编码字符串。
- 以ISO8601格式保存日期和日期时间,以便在其他地方轻松解析。
write_csv(challenge, "challenge.csv")
challenge
#> # A tibble: 2,000 x 2
#> x y
#> <dbl> <date>
#> 1 404 NA
#> 2 4172 NA
#> 3 3004 NA
#> 4 787 NA
#> 5 37 NA
#> 6 2332 NA
#> # … with 1,994 more rows
write_csv(challenge, "challenge-2.csv")
read_csv("challenge-2.csv")
#> Parsed with column specification:
#> cols(
#> x = col_double(),
#> y = col_logical()
#> )
#> # A tibble: 2,000 x 2
#> x y
#> <dbl> <lgl>
#> 1 404 NA
#> 2 4172 NA
#> 3 3004 NA
#> 4 787 NA
#> 5 37 NA
#> 6 2332 NA
#> # … with 1,994 more rows
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")
#> # A tibble: 2,000 x 2
#> x y
#> <dbl> <date>
#> 1 404 NA
#> 2 4172 NA
#> 3 3004 NA
#> 4 787 NA
#> 5 37 NA
#> 6 2332 NA
#> # … with 1,994 more rows
feather包实现了一种快速的二进制文件格式,可以跨编程语言共享:
library(feather)
write_feather(challenge, "challenge.feather")
read_feather("challenge.feather")
#> # A tibble: 2,000 x 2
#> x y
#> <dbl> <date>
#> 1 404 <NA>
#> 2 4172 <NA>
#> 3 3004 <NA>
#> 4 787 <NA>
#> 5 37 <NA>
#> 6 2332 <NA>
#> # ... with 1,994 more rows