[R语言] stringr包 字符处理《R for Data Science》
《R for Data Science》第十四章 strings 啃书知识点积累
参考链接:R for Data Science涉及部分:
stringr
非正则相关内容 +stringi
正则匹配会单独划分一个专题做笔记
参考书籍
- 《R for data science》
- 《R数据科学》
String basics
"
or '
: There is no difference in behaviour.
- 字符串中有引号的处理
使用转义符\
或者不同的引号组合
double_quote <- "\"" # or '"'
single_quote <- '\'' # or "'"
- 转义用
print
依然会呈现,可以用writeLines
显示转义后的文字效果
x <- "hello\\world"
x
# [1] "hello\\world"
writeLines(x)
# hello\world
# 另一个例子
x <- "hello\tworld"
x
# [1] "hello\tworld"
writeLines(x)
# hello world
- utf-8类型非英文字符的处理
x <- "\u00b5"
x
#> [1] "µ"
- String length
- 关于
stringr
包的函数
These have more intuitive names, and all start with
str_
.
str_length(c("a", "R for data science", NA))
# [1] 1 18 NA
- Combining strings
str_c("x", "y")
#> [1] "xy"
str_c("x", "y", "z")
#> [1] "xyz"
# 符合向量化运算
str_c("prefix-", c("a", "b", "c"), "-suffix")
# [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
# 也可以使用paste0
paste0('x','y')
# [1] "xy"
# sep参数控制连接的分隔符,默认为空
str_c("x", "y", sep = ", ")
#> [1] "x, y"
- 处理带有NA的向量
NA有contagious
的特性
x <- c("abc", NA)
str_c("|-", x, "-|")
# [1] "|-abc-|" NA
# 可以使用str_replace_na()保留NA形式输出
str_c("|-", str_replace_na(x), "-|")
# [1] "|-abc-|" "|-NA-|"
- 在
str_c
中添加条件判断
name <- "Xi Chen"
time_of_day <- "morning"
birthday <- FALSE
str_c(
"Good ", time_of_day, " ", name,
if (birthday) " and HAPPY BIRTHDAY",
"."
)
# [1] "Good morning Xi Chen."
birthday <- TRUE
str_c(
"Good ", time_of_day, " ", name,
if (birthday) " and HAPPY BIRTHDAY",
"."
)
# [1] "Good morning Xi Chen and HAPPY BIRTHDAY."
- 将含多个字符串的向量合并成一个字符串
str_c(c("x", "y", "z"), collapse = ", ")
# [1] "x, y, z"
str_c(c("x", "y", "z"), collapse = "")
# [1] "xyz"
- Subsetting strings
str_sub
: 第一个参数是向量,第二个参数是提取的起始位置,第三个为终末位置
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
#> [1] "App" "Ban" "Pea"
str_sub(x, -3, -1)
#> [1] "ple" "ana" "ear"
- 终止位置超出字符串长度时不会报错,会尽可能多地返回字符
str_sub("a", 1, 5)
# [1] "a"
str_sub(c('xichen','Oct','hi'), 1, 3)
# [1] "xic" "Oct" "hi"
- 结合
str_to_lower
可以将首字母替换为小写
x <- c("APple", "BaNana", "PeaR");x
# [1] "APple" "BaNana" "PeaR"
str_to_lower(x)
# [1] "apple" "banana" "pear"
# 这个赋值很关键,会通过子集替换全局
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1));x
# [1] "aPple" "baNana" "peaR"
- Locales
这个参数我貌似用不上
str_to_lower()
str_to_upper()
str_to_title()
- locale可以指定语言环境
str_to_upper(c("i", "ı"))
#> [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")
#> [1] "İ" "I"
- 关于排序中
locale
的作用
The base R
order()
and sort()
functions sort strings using the current locale.
str_sort
可以依据locale
的语言环境进行排序
x <- c("apple", "eggplant", "banana")
str_sort(x, locale = "en") # English
#> [1] "apple" "banana" "eggplant"
str_sort(x, locale = "haw") # Hawaiian
#> [1] "apple" "eggplant" "banana"
- 其他函数
(1) str_wrap
This is a wrapper around
stringi::stri_wrap()
which implements the Knuth-Plass paragraph wrapping algorithm.
str_wrap(string, width = 80, indent = 0, exdent = 0)
str_wrap
帮助文档中有word
函数,先来学习一下
sentences <- c("Jane saw a cat", "Jane sat down")
word(sentences, 1)
# [1] "Jane" "Jane"
word(sentences, 2)
# [1] "saw" "sat"
word(sentences, -1)
# [1] "cat" "down"
word(sentences, 2, -1)
# [1] "saw a cat" "sat down"
word(sentences[1], 1:3, -1)
# [1] "Jane saw a cat" "saw a cat" "a cat"
word(sentences[1], 1, 1:4)
# [1] "Jane" "Jane saw" "Jane saw a" "Jane saw a cat"
str <- 'abc.def..123.4568.999'
word(str, 6, sep = fixed('.'))
# [1] "999"
word(str, 2, sep = fixed('..'))
# [1] "123.4568.999"
# str_wrap控制文字段落的显示方式
# 有空再研究一下cat函数
library(stringr)
?str_wrap
thanks_path <- file.path(R.home("doc"), "THANKS")
thanks <- str_c(readLines(thanks_path), collapse = "\n")
thanks <- word(thanks, 1,1, fixed("\n\n"))
cat(str_wrap(thanks), "\n")
cat(str_wrap(thanks, width = 40), "\n")
cat(str_wrap(thanks, width = 60, indent = 2), "\n")
cat(str_wrap(thanks, width = 60, exdent = 2), "\n")
cat(str_wrap(thanks, width = 0, exdent = 2), "\n")
(2) str_trim
和str_pad
# str_trim可以去除字符串两边的空白
str_trim(" XiChen ", side = 'both') # 默认是双边去除
# [1] "XiChen"
str_trim(" XiChen ", side = "left")
# [1] "XiChen "
str_trim(" XiChen ", side = "right")
# [1] " XiChen"
# str_pad可以有str_trim相反的功能,且功能更丰富
str_pad('XiChen', 8, 'both') # pad默认是空格,可以指定/side默认是left
# [1] " XiChen "
str_pad('XiChen', 8, 'left', pad = '?')
# [1] "??XiChen"
str_pad("a", 2, pad = c("-", "_", " "))
# [1] "-a" "_a" " a"
- Exercises
(1) paste
paste0
str_c
处理一般向量和含NA向量的异同
paste("Xi", "Chen")
# [1] "Xi Chen"
paste0("Xi", "Chen")
# [1] "XiChen"
# 注意:paste0没有sep参数,sep=','会被当作普通字符串参与拼接,因此只出现在结果末尾
paste("Xi", "Chen",sep=',')
# [1] "Xi,Chen"
paste0("Xi", "Chen",sep=',')
# [1] "XiChen,"
# paste等价于str_c(var,sep=' ')
# paste0等价于str_c(var)
str_c("Xi", "Chen")
# [1] "XiChen"
str_c("Xi", "Chen",sep=' ')
# [1] "Xi Chen"
针对NA的异同
# paste paste0都可以直接将NA拼接入字符串
paste("Xi", "Chen", NA)
# [1] "Xi Chen NA"
paste0("Xi", "Chen", NA)
# [1] "XiChenNA"
str_c("Xi", "Chen", NA)
# NA
str_c("Xi", "Chen", str_replace_na(NA))
# [1] "XiChenNA"
x <- c("Xi", "Chen", NA)
str_c(x,collapse = '')
# NA
str_c(str_replace_na(x), collapse = '')
# [1] "XiChenNA"
(2) str_c
中sep
和collapse
的区别
sep对多个向量的拼合起作用
collapse对单个向量内部拼接起作用
x <- c('Xi','Chen')
str_c(x)
# [1] "Xi" "Chen"
str_c(x, sep = ',')
# [1] "Xi" "Chen"
str_c(x, collapse = ',')
# [1] "Xi,Chen"
str_c('Xi','Chen')
# [1] "XiChen"
str_c('Xi','Chen', sep = ',')
# [1] "Xi,Chen"
str_c('Xi','Chen', collapse = ',')
# [1] "XiChen"
# 这个例子更能说明sep和collapse的作用范围和区别
str_c('x',c('y','z'))
# [1] "xy" "xz"
str_c('x',c('y','z'),sep = ',')
# [1] "x,y" "x,z"
str_c('x',c('y','z'),collapse = ',')
# [1] "xy,xz"
stringi
stringr
is built on top of the stringi
package. stringi
has 244 functions to stringr
’s 49.
- The main difference is the prefix:
str_
vs.stri_
.
- Exercises
(1) stringi
几种特殊用法
library(stringi)
# 计算单词数
x <- c('Xi Chen','Happy birthday to you')
stri_count_words(x)
# [1] 2 4
# 判断查找重复字符串
x <- c('Xi Chen', 'XiChen', 'apple', 'Xi Chen','apple')
stri_duplicated(x)
# [1] FALSE FALSE FALSE TRUE TRUE
stri_duplicated_any(x)
# [1] 4
# 返回第一个重复元素的下标
# 产生随机字符串
stri_rand_strings(4, 3) # 4个含3字符的字符串
# [1] "Ebd" "JDg" "3Kf" "8k6"
# 现有字符串随机重排
stri_rand_shuffle(c('Xi Chen','Happy birthday to you'))
# [1] "nX eChi" "byHur oiyd hap ptoyta"
# 乱数假文
stri_rand_lipsum(4) # 数字表示产生几段假文
(2) stri_sort
涉及语言环境的设置
估计我很少会用到
stri_sort(c("hladny", "chladny"), locale="pl_PL")
# [1] "chladny" "hladny"
stri_sort(c("hladny", "chladny"), locale="sk_SK")
# [1] "hladny" "chladny"