R语言 生信R

整理stringr包-妈妈再也不担心我不会用stringr啦

2019-12-21  本文已影响0人  小梦游仙境

1.str_dup 复制字符串

函数定义:str_dup(string, times)
参数列表:

string:需要重复处理的字符串

times:指定重复的次数

# 1.str_dup
> fruit<-c('apple','pear','banana')
> str_dup(fruit,2)
[1] "appleapple"   "pearpear"     "bananabanana"
> str_dup(fruit,2:4)
[1] "appleapple"               "pearpearpear"            
[3] "bananabananabananabanana"
> #下面是对循环补齐的理解
> str_dup(fruit,2:5)
[1] "appleapple"                "pearpearpear"             
[3] "bananabananabananabanana"  "appleappleappleappleapple"
Warning message:
In stri_dup(string, times) :
  longer object length is not a multiple of shorter object length
> #结合str_c使用(str_c的用法见下文第7节)
> str_c('Fresh ',str_dup(fruit,0:3))
[1] "Fresh "                "Fresh pear"           
[3] "Fresh bananabanana"    "Fresh appleappleapple"
Warning message:
In stri_dup(string, times) :
  longer object length is not a multiple of shorter object length

2.str_pad 补充字符串的长度

函数定义:str_pad(string, width, side = c("left", "right", "both"), pad = " ")
参数列表:
string: 字符串,字符串向量。
width: 字符串填充后的长度
side: 填充方向,both两边都填充,left左边填充,right右边填充
pad: 用于填充的字符,不加pad参数时默认是空格

# 2.str_pad 
> rbind(str_pad("hadley", 10, side = "left"),
+       str_pad("hadley", 10, side = "right"),
+       str_pad("hadley", 10, side = "both"))
     [,1]        
[1,] "    hadley"
[2,] "hadley    "
[3,] "  hadley  "
> #不加side时默认是从左边left增加空格
> str_pad(c("A", "abc", "abcdef"), 8)
[1] "       A" "     abc" "  abcdef"
> str_pad("A", c(3, 6, 10))
[1] "  A"        "     A"     "         A"
> str_pad("A", 10, pad = c("!", "*", " "),side = 'right')
[1] "A!!!!!!!!!" "A*********" "A         "

3.str_trim 去掉字符串的空格和TAB(\t)

函数定义:str_trim(string, side = c("both", "left", "right"))
参数列表:
string: 字符串,字符串向量。
side: 过滤方式,both两边都过滤,left左边过滤,right右边过滤
作用:去掉字符串首尾(或由side指定的一侧)的空白字符,包括空格、TAB(\t)等;字符串中间的空格不受影响。

# 3.str_trim 
> string<- '   I love biotree   '
> str_trim(string)
[1] "I love biotree"
> str_trim(string,side = 'left')
[1] "I love biotree   "
> str_trim(string,side = 'right')
[1] "   I love biotree"
> str_trim(string,side = 'both')
[1] "I love biotree"

4.str_count 字符串计数

函数定义:str_count(string, pattern = "")

参数列表:
string: 字符串,字符串向量。
pattern: 要计数的匹配模式,默认按正则表达式解释(因此"."会匹配任意字符);如需按字面匹配,请用fixed()包裹。

# 4.str_count
> fruit <- c("apple", "banana", "pear", "pineapple")
> str_count(fruit, "a")
[1] 1 3 1 1
> str_count(fruit, c("a", "b", "p", "p"))
[1] 1 1 1 3
> str_count(c("a.", "...", ".a.a"), ".")
[1] 2 3 4
> str_count(c("a.", "...", ".a.a"), fixed("."))
[1] 1 3 2

5.str_length 字符串长度计数

函数定义:str_length(string)
参数列表:
string: 字符串,字符串向量。

str_length(),字符长度函数,该函数类似于nchar()函数;区别在于str_length(NA)返回NA,而nchar(NA)返回2(逻辑型NA会先被转换成字符串"NA"再计数)。

# 5.str_length
> str_length(letters)
 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
> str_length(c("i", "like", "programming", NA))
[1]  1  4 11 NA

6.str_sort 字符串值排序

str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "en", numeric = FALSE, ...)
str_order(x, decreasing = FALSE, na_last = TRUE, locale = "en", numeric = FALSE, ...)

str_order和str_sort的区别在于前者返回排序后的索引(下标),后者返回排序后的实际值
参数列表:
x: 字符串,字符串向量。
decreasing: 排序方向。
na_last:NA值的存放位置,一共3个值,TRUE放到最后,FALSE放到最前,NA过滤处理
locale:按哪种语言习惯排序

# 6.str_sort 
> str_sort(letters)
 [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t"
[21] "u" "v" "w" "x" "y" "z"
> str_sort(c('wo','love','biotree','forever'),locale = "en")
[1] "biotree" "forever" "love"    "wo"     
> str_sort(c('wo','love','biotree','forever'),locale = "en",decreasing = T)
[1] "wo"      "love"    "forever" "biotree"
> str_sort(c('我','爱','生','信','技','能','树'),locale = "zh")
[1] "爱" "技" "能" "生" "树" "我" "信"
> str_sort(c('我','爱','生','信','技','能','树'),locale = "zh",decreasing = T)
[1] "信" "我" "树" "生" "能" "技" "爱"
> #str_order
> str_order(c('wo','love','biotree','forever'),locale = "en")
[1] 3 4 2 1
> str_order(c('wo','love','biotree','forever'),locale = "en",decreasing = T)
[1] 1 2 4 3

7.str_c 字符串连接

==相当于paste/paste0==

函数定义:
str_c(..., sep = "", collapse = NULL)
str_join(..., sep = "", collapse = NULL)
参数列表:
…: 多参数的输入
sep: 各个参数按元素逐一拼接时,插入在参数与参数之间的分隔符,默认为空字符串。
collapse: 若不为NULL,则把拼接得到的结果向量再合并成一个字符串,collapse就是合并时各元素之间的分隔符。

# 7.str_c 
#str_c与paste0类似  
> str_c(letters)
 [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
[18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
> str_c('a','b')
[1] "ab"
> str_c('a','b',sep = '-')
[1] "a-b"
> str_c('a','b',collapse = "-")
[1] "ab"
> str_c('a','b','c')
[1] "abc"
> str_c('a','b','c',sep  = "-")
[1] "a-b-c"
> str_c('a','b','c',collapse = "-")
[1] "abc"
> str_c(c('a','a1'),c('b','b1'),sep='-')
[1] "a-b"   "a1-b1"
> str_c(c('a','a1'),c('b','b1'),collapse='-')
[1] "ab-a1b1"
> #说明:sep是"参数与参数之间"的分隔符,collapse是"结果向量各元素之间"的分隔符。str_c('a','b',collapse = "-")先把两个参数拼成只含一个元素的向量"ab",没有其他元素可合并,所以collapse不起作用;而head(letters)是含6个元素的一个向量,collapse会把这6个元素合并成一个字符串。只传入一个向量时没有第二个参数可拼接,所以sep取任何值结果都一样。
> str_c(head(letters), collapse = "")
[1] "abcdef"
> str_c(head(letters), collapse = " ")
[1] "a b c d e f"
> str_c(head(letters), collapse = '-')
[1] "a-b-c-d-e-f"
> str_c(head(letters), sep = "")
[1] "a" "b" "c" "d" "e" "f"
> str_c(head(letters), sep = " ")
[1] "a" "b" "c" "d" "e" "f"
> str_c(head(letters), sep = '-')
[1] "a" "b" "c" "d" "e" "f"
> #下面是对循环补齐的理解
> str_c("Letter: ", letters[1:10])#与paste0相同,默认分隔符是无
 [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e"
 [6] "Letter: f" "Letter: g" "Letter: h" "Letter: i" "Letter: j"
> str_c("Letter", letters[1:10], sep = ": ")
 [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e"
 [6] "Letter: f" "Letter: g" "Letter: h" "Letter: i" "Letter: j"
> #下面是三个字符串拼接,如果什么参数都没有,那么就是直接无缝连接,因为str_c同paste0相同,同时注意理解循环补齐
> str_c(letters[1:5], " is for", "...")
[1] "a is for..." "b is for..." "c is for..." "d is for..." "e is for..."
> #str_c和paste的不同之处之一
> str_c('a','b') #str_c把多个字符串拼接为一个大的字符串,默认无间隙。
[1] "ab"
> paste('a','b') #paste默认sep是' '
[1] "a b"
> #str_c和paste的不同之处之二
> #str_c要连接的向量如果含NA,对应位置的结果仍然是NA(缺失值会被传递)
> #paste要连接的向量如果含NA,会把NA当作字符串"NA"参与拼接
> str_c(c("a", NA, "b"), "-d")
[1] "a-d" NA    "b-d"
> paste(c("a", NA, "b"), "-d")
[1] "a -d"  "NA -d" "b -d" 
> #加了str_replace_na函数,str_c也可以连接含有NA的向量
> str_c(str_replace_na(c("a", NA, "b")), "-d") #即使空,也可连接
[1] "a-d"  "NA-d" "b-d" 
> str_c('a','b','c',collapse = "-")
[1] "abc"

8.str_sub 截取字符串

==相当于substr/substring==

函数定义:str_sub(string, start = 1L, end = -1L)

参数列表:

str_sub(string, start = 1L, end = -1L) 提取子字符串

str_sub(string, start = 1L, end = -1L) <- value 替换子字符串

# 8.str_sub  
> #str_sub与substr()类似
> #例一 截取字符串
> jns <- 'sheng xin ji neng shu'
> str_sub(jns,1,5)
[1] "sheng"
> str_sub(jns, 6) #默认从坐标位置6开始截取
[1] " xin ji neng shu"
> str_sub(jns, end=6)
[1] "sheng "
> str_sub(jns, -3) # 通过负坐标截取字符串
[1] "shu"
> str_sub(jns, end = -3)
[1] "sheng xin ji neng s"
> str_sub(jns,c(1,4),c(2,6))
[1] "sh"  "ng "
> str_sub(jns,c(1,4),c(2,6,8))
[1] "sh"       "ng "      "sheng xi"
Warning message:
In stri_sub(string, from = start, to = end) :
  longer object length is not a multiple of shorter object length
> str_sub(jns,1,1)<-'S'
> jns
[1] "Sheng xin ji neng shu"
> #例二 截取字符串后更改
> x <- "AAABBBCCC" #对截取的字符串进行赋值。
> str_sub(x, 1, 1) <- 1; x ## 在字符串的1的位置赋值为1
[1] "1AABBBCCC"
> str_sub(x, 2, -2) <- "2345"; x ## 在字符串从2到-2的位置赋值为2345
[1] "12345C"

9.str_replace 字符串替换

==str_replace相当于sub(只替换第一处匹配),str_replace_all相当于gsub(替换全部匹配)==

函数定义:str_replace(string, pattern, replacement)
参数列表:
string: 字符串,字符串向量。
pattern: 匹配字符。
replacement: 用于替换的字符。

# 9.str_replace  
> #str_replace,字符串替换
> fruits<-c('one apple','two pears','three bananas')
> str_replace(fruits,'[aeiou]','-')
[1] "-ne apple"     "tw- pears"     "thr-e bananas"
> str_replace_all(fruits, "[aeiou]", "-")
[1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"
> str_replace_all(fruits, "[aeiou]", toupper)
[1] "OnE ApplE"     "twO pEArs"     "thrEE bAnAnAs"
> str_replace_all(fruits, "b", NA_character_)
[1] "one apple" "two pears" NA         
> str_replace(fruits, "([aeiou])", "")
[1] "ne apple"     "tw pears"     "thre bananas"
> str_replace(fruits, "[aeiou]", c("1", "2", "3"))
[1] "1ne apple"     "tw2 pears"     "thr3e bananas"
> str_replace(fruits, c("a", "e", "i"), "-")
[1] "one -pple"     "two p-ars"     "three bananas"
> # 管道符应用
> fruits %>%
+   str_c(collapse = "---") %>%
+   str_replace_all(c("one" = "1", "two" = "2", "three" = "3"))
[1] "1 apple---2 pears---3 bananas"

10.str_split 字符串分割

==相当于strsplit==

函数定义:

str_split(string, pattern, n = Inf)
str_split_fixed(string, pattern, n)
参数列表:
string: 字符串,字符串向量。
pattern: 匹配的字符。
n: 最多分割成的片段数;达到n个片段后,最后一个片段保留剩余的全部内容,不再继续分割(str_split_fixed固定返回n列,不足的列用""补齐)
对字符串进行分割。

# 10.str_split  
> fruits <- c("apples and oranges and pears and bananas",
+             "pineapples and mangos and guavas")
> str_split(fruits, " and ")
[[1]]
[1] "apples"  "oranges" "pears"   "bananas"

[[2]]
[1] "pineapples" "mangos"     "guavas"    

> str_split(fruits, " and ", simplify = TRUE)
     [,1]         [,2]      [,3]     [,4]     
[1,] "apples"     "oranges" "pears"  "bananas"
[2,] "pineapples" "mangos"  "guavas" ""       
> str_split(fruits, " and ", n = 3)
[[1]]
[1] "apples"            "oranges"           "pears and bananas"

[[2]]
[1] "pineapples" "mangos"     "guavas"    

> str_split(fruits, " and ", n = 2)
[[1]]
[1] "apples"                        "oranges and pears and bananas"

[[2]]
[1] "pineapples"        "mangos and guavas"

> # 用str_split_fixed函数返回一个matrix
> str_split_fixed(fruits, " and ", 3)
     [,1]         [,2]      [,3]               
[1,] "apples"     "oranges" "pears and bananas"
[2,] "pineapples" "mangos"  "guavas"           
> str_split_fixed(fruits, " and ", 4)
     [,1]         [,2]      [,3]     [,4]     
[1,] "apples"     "oranges" "pears"  "bananas"
[2,] "pineapples" "mangos"  "guavas" "" 

11.str_subset 返回匹配的字符串

==str_subset相当于grep(value = TRUE),str_which相当于grep==

函数定义:
str_subset(string, pattern)
参数列表:
string: 字符串,字符串向量。
pattern: 匹配的字符。

# 11.str_subset 返回匹配的字符串
> #str_subset
> fruit <- c("apple", "banana", "pear", "pinapple")
> str_subset(fruit, "a")
[1] "apple"    "banana"   "pear"     "pinapple"
> str_which(fruit, "a")
[1] 1 2 3 4
> str_subset(fruit, "^a")
[1] "apple"
> str_subset(fruit, "a$")
[1] "banana"
> str_subset(fruit, "[aeiou]")
[1] "apple"    "banana"   "pear"     "pinapple"
> str_subset(fruit, "^p", negate = TRUE)
[1] "apple"  "banana"
> # Missings never match
> str_subset(c("a", NA, "b"), ".")
[1] "a" "b"
> str_which(c("a", NA, "b"), ".")
[1] 1 3

12.str_detect 检查匹配字符串的字符

==相当于grepl==

函数定义:str_detect(string, pattern)
参数列表:
string: 字符串,字符串向量。
pattern: 匹配字符。

# 12.str_detect 检查匹配字符串的字符
> fruit <- c("apple", "banana", "pear", "pinapple")
> str_detect(fruit, "a")
[1] TRUE TRUE TRUE TRUE
> str_detect(fruit, "b")
[1] FALSE  TRUE FALSE FALSE
> str_detect(fruit, "^a")
[1]  TRUE FALSE FALSE FALSE
> str_detect(fruit, "a$")
[1] FALSE  TRUE FALSE FALSE
> str_detect("aecfg", letters)
 [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[14] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
> str_detect(fruit, "^p", negate = TRUE)
[1]  TRUE  TRUE FALSE FALSE

13.str_extract 从字符串中提取匹配字符

函数定义:
str_extract(string, pattern)
str_extract_all(string, pattern, simplify = FALSE)
参数列表:
string: 字符串,字符串向量。
pattern: 匹配字符。
simplify: 返回值的形式,TRUE返回字符矩阵(matrix),FALSE(默认)返回列表(list),列表的每个元素是一个字符串向量

# 13.str_extract 从字符串中提取匹配字符
> shopping_list <- c("apples 4x4", "bag of flour", "bag of sugar", "milk x2")
> str_extract(shopping_list, "\\d") # 提取数字 #提取匹配模式的第一个字符串
[1] "4" NA  NA  "2"
> str_extract(shopping_list, "[a-z]+") #提取字母
[1] "apples" "bag"    "bag"    "milk"  
> #str_extract
> shopping_list <- c("apples 4x4", "bag of flour", "bag of sugar", "milk x2")
> str_extract(shopping_list, "\\d") # 提取数字 #提取匹配模式的第一个字符串
[1] "4" NA  NA  "2"
> str_extract(shopping_list, "[a-z]+") #提取字母
[1] "apples" "bag"    "bag"    "milk"  
> str_extract_all(shopping_list, "[a-z]+") # 提取所有匹配模式的字母,结果返回一个列表
[[1]]
[1] "apples" "x"     

[[2]]
[1] "bag"   "of"    "flour"

[[3]]
[1] "bag"   "of"    "sugar"

[[4]]
[1] "milk" "x"   

> str_extract_all(shopping_list, "[a-z]+",simplify = T)
     [,1]     [,2] [,3]   
[1,] "apples" "x"  ""     
[2,] "bag"    "of" "flour"
[3,] "bag"    "of" "sugar"
[4,] "milk"   "x"  ""     
> str_extract_all(shopping_list, "\\d") # 提取所有匹配模式的数字
[[1]]
[1] "4" "4"

[[2]]
character(0)

[[3]]
character(0)

[[4]]
[1] "2"

> # 提取所有匹配模式的字符串,结果返回一个矩阵,通过simplify = TRUE设置
> str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
     [,1]     [,2] [,3]   
[1,] "apples" ""   ""     
[2,] "bag"    "of" "flour"
[3,] "bag"    "of" "sugar"
[4,] "milk"   ""   ""     
> str_extract_all(shopping_list, "\\d", simplify = TRUE)
     [,1] [,2]
[1,] "4"  "4" 
[2,] ""   ""  
[3,] ""   ""  
[4,] "2"  ""  

14.str_locate 找到匹配的字符串的位置

str_locate()和str_locate_all()的区别在于前者只匹配首次,而后者可以匹配所有可能的值

# 14.str_locate 找到匹配的字符串的位置 
> x <- c("abcdef", "ghifjk")
> str_locate(x, "cde")
     start end
[1,]     3   5
[2,]    NA  NA
> str_locate_all(c("abcdefabc", "ghifjkabc"), "abc")
[[1]]
     start end
[1,]     1   3
[2,]     7   9

[[2]]
     start end
[1,]     7   9

15.str_to_upper/str_to_lower字符串大小写转换

# 15.str_to_upper/str_to_lower字符串大小写转换 
> text <- "We love biotree forever"
> str_to_upper(text)
[1] "WE LOVE BIOTREE FOREVER"
> str_to_lower(text)
[1] "we love biotree forever"
> str_to_title(text)
[1] "We Love Biotree Forever"
> str_to_sentence("we love biotree forever")
[1] "We love biotree forever"
上一篇下一篇

猜你喜欢

热点阅读