33-tidytext包学习：tm包与tidytext包共建语料

2020-02-10 本文已影响0人 wonphen

tidy结构要求：a table with one-token-per-document-per-row，这使得我们可以使用流行的一整套工具，比如 dplyr、tidyr 和 ggplot2来探索和可视化文本数据。

格式转换

然而，除了 tidytext 包之外，大多数现有的用于自然语言处理的 R 工具都不兼容这种格式。本文讨论的就是 tm 包生成的语料（DocumentTermMatrix）与 tidy 结构之间的互相转换。

1、使用科学辟谣网数据

# Attach pacman with library() so a missing package fails immediately
# (require() would only return FALSE), then let pacman attach dplyr.
library(pacman)
p_load(dplyr)

# Read the rumor data set and keep the article title and the correction text.
df <- read.csv(
  "./rumors.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  strip.white = TRUE
) %>%
  select(title, correction) %>%
  as_tibble()

df$correction <- df$correction %>%
  # Drop literal "<U+00A0>"-style escape text. The "+" has to be escaped,
  # otherwise it is a quantifier on "U" and the literal text never matches.
  gsub("<U\\+00A[0-9]>", "", .) %>%
  # Remove the content of every pair of parentheses (half- and full-width).
  # A negated character class keeps each match inside one pair; a greedy
  # ".*" would also delete the text between two separate pairs.
  gsub("\\([^)]*\\)", "", .) %>%
  gsub("（[^）]*）", "", .) %>%
  # Pattern is a backslash-escaped newline character (kept as in the original).
  gsub("\\\n", "", .) %>%
  # Remove the "click to download this set of pictures" boilerplate.
  gsub("点击下载本组图片", "", .) %>%
  textclean::replace_html()

2、构建DTM矩阵

p_load(tm, jiebaR, purrr)

# Chinese word segmentation
# jiebaR worker configured with the stop-word file below.
wk <- worker(stop_word = "./dict/characters-master/stop_words")

# Keep the first row of every title *before* segmenting, so duplicated
# articles are not segmented only to be thrown away afterwards. The rows that
# survive (and their order) are the same as when de-duplicating last.
corpus <- df %>%
  distinct(title, .keep_all = TRUE) %>%
  mutate(words = map(correction, segment, wk)) %>%
  select(title, words)

# Join each document's tokens into one space-separated string. map_chr()
# returns a plain character vector (one element per document) instead of a
# list of length-one character vectors.
txt <- map_chr(corpus$words, paste, collapse = " ")

# Build the corpus from the space-joined documents
d.corpus <- Corpus(VectorSource(txt))

# Collapse redundant whitespace
d.corpus <- tm_map(d.corpus, stripWhitespace)

## Warning in tm_map.SimpleCorpus(d.corpus,
## stripWhitespace): transformation drops documents

# Remove punctuation
d.corpus <- tm_map(d.corpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(d.corpus,
## removePunctuation): transformation drops documents

# Remove digits
d.corpus <- tm_map(d.corpus, removeNumbers)

## Warning in tm_map.SimpleCorpus(d.corpus, removeNumbers):
## transformation drops documents

# Keep terms of any length. The control option is spelled `wordLengths`
# (capital L); list names are case-sensitive, so the lower-case spelling
# `wordlengths` is not picked up and tm falls back to its default bounds.
ctrl <- list(wordLengths = c(1, Inf))
dtm <- DocumentTermMatrix(d.corpus, control = ctrl)

print(dtm)

## <<DocumentTermMatrix (documents: 1763, terms: 35923)>>
## Non-/sparse entries: 202385/63129864
## Sparsity           : 100%
## Maximal term length: 80
## Weighting          : term frequency (tf)

3、转换为tidy结构，转换后其他的操作跟tidytext一致

p_load(tidytext)
# Reshape the DTM into a data frame with one token per document per row
dtm.td <- dtm %>% tidy()

print(dtm.td)

## # A tibble: 202,385 x 3
##    document term           count
##    <chr>    <chr>          <dbl>
##  1 1        一是               1
##  2 1        专家               2
##  3 1        个人行为           1
##  4 1        中                 2
##  5 1        中华预防医学会     1
##  6 1        中国               1
##  7 1        中国科学院         1
##  8 1        中心               1
##  9 1        主任               2
## 10 1        二是               1
## # ... with 202,375 more rows

# Replace the numeric document ids in the tidy table with article titles.
# The corpus was built from `corpus` (de-duplicated by title), so document i
# is row i of `corpus`. The lookup table therefore has to come from `corpus`;
# numbering the rows of `df` instead shifts the ids as soon as `df` contains a
# duplicated title and attaches wrong titles to the documents.
name <- corpus %>%
  select(title) %>%
  mutate(id = as.character(seq_along(title)))

# Both tables live in memory, so no `copy` argument is needed for the join.
dtm.td.name <- dtm.td %>%
  left_join(name, by = c("document" = "id")) %>%
  select(document = title, term, count)

4、tidy结构转换为DTM矩阵

# Cast the tidy (document, term, count) rows back into a DocumentTermMatrix
dtm.matrix <- cast_dtm(dtm.td.name, document, term, count)
print(dtm.matrix)

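注：转换后文档数量变少（1763 → 1668），可能有两个原因：其一，tidy() 只保留计数非零的条目，清洗后不含任何词项的文档不会出现在 tidy 数据框中，转换回 DTM 时自然也就消失了；其二，如果多个文档编号被映射到同一个标题，cast_dtm() 会把它们合并成同一个文档。另外，只有当“编号 → 标题”的对照表与构建语料库时所用的（已按 title 去重的）corpus 行顺序一致时，文档名称才能与原有文章一一对应。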

## <<DocumentTermMatrix (documents: 1668, terms: 35923)>>
## Non-/sparse entries: 202228/59717336
## Sparsity           : 100%
## Maximal term length: 80
## Weighting          : term frequency (tf)

# Show documents 5-10 and terms 10-20 of the rebuilt matrix
dtm.matrix[5:10, 10:20] %>% inspect()

## <<DocumentTermMatrix (documents: 6, terms: 11)>>
## Non-/sparse entries: 10/56
## Sparsity           : 85%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##                             Terms
## Docs                         出品人 传播 促使 低 二是
##   爱因斯坦的数学很烂              1    0    0  0    0
##   家养猪笼草可以有效灭蚊          1    0    0  0    0
##   利用磁铁和钢丝可制作永动机      1    0    0  0    0
##   月经期间洗头会致癌              1    0    0  1    0
##   孕妈肚脐凸起生的就是儿子        1    0    0  0    0
##   最聪明的孩子都吃素              1    0    0  1    0
##                             Terms
## Docs                         公用 几分钟 免疫学 体液
##   爱因斯坦的数学很烂            0      0      0    0
##   家养猪笼草可以有效灭蚊        0      0      0    0
##   利用磁铁和钢丝可制作永动机    0      0      0    0
##   月经期间洗头会致癌            0      0      0    0
##   孕妈肚脐凸起生的就是儿子      0      0      0    0
##   最聪明的孩子都吃素            0      0      0    3
##                             Terms
## Docs                         信息
##   爱因斯坦的数学很烂            0
##   家养猪笼草可以有效灭蚊        0
##   利用磁铁和钢丝可制作永动机    0
##   月经期间洗头会致癌            0
##   孕妈肚脐凸起生的就是儿子      0
##   最聪明的孩子都吃素            1

5、使用tidy结构文档快速创建DTM矩阵

# Read the signature data and keep the document id and its text.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
test <- read.csv(
  "./signature.jieba.csv",
  header = TRUE,
  stringsAsFactors = FALSE
) %>%
  dplyr::select(id, content)

# Tokenise `content` into one word per row, count each word per document,
# then cast the counts straight into a DocumentTermMatrix.
signature.dtm <- test %>%
  unnest_tokens(word, content) %>%
  dplyr::count(id, word) %>%
  cast_dtm(id, word, n)

print(signature.dtm)

## <<DocumentTermMatrix (documents: 391, terms: 1412)>>
## Non-/sparse entries: 2684/549408
## Sparsity           : 100%
## Maximal term length: 4
## Weighting          : term frequency (tf)

6、移除稀疏元素

# Remove sparse terms with a sparsity threshold of 0.5, then show the result
dtm.matrix %>%
  removeSparseTerms(0.5) %>%
  inspect()

## <<DocumentTermMatrix (documents: 1668, terms: 1)>>
## Non-/sparse entries: 900/768
## Sparsity           : 46%
## Maximal term length: 1
## Weighting          : term frequency (tf)
## Sample             :
##                                     Terms
## Docs                                 中
##   “竹炭食物”排毒养颜？               28
##   2017十大“科学”流言揭晓             19
##   保温杯里泡枸杞”是万能养生方式      16
##   吃螃蟹的禁忌                       12
##   吃早餐危险、薯条治脱发             13
##   发烧吃点消炎药                     21
##   藿香正气水和头孢一起服用会产生剧毒 12
##   诺贝尔奖得主说，牛肉、牛奶都致癌   12
##   食品标签会提供消费者需要的所有信息 17
##   与“水”有关的流言                   13

7、查找至少出现450次的词语

# Terms that occur at least 450 times across the whole matrix
findFreqTerms(dtm.matrix, lowfreq = 450)

##  [1] "专家"   "中"     "中国"   "发现"   "含量"  
##  [6] "研究"   "科学"   "里"     "食物"   "作用"  
## [11] "健康"   "发生"   "导致"   "年"     "影响"  
## [16] "情况"   "效果"   "更"     "月"     "说"    
## [21] "说法"   "身体"   "一种"   "医院"   "吃"    
## [26] "时"     "营养"   "治疗"   "皮肤"   "真的"  
## [31] "疾病"   "维生素" "食品"   "高"     "患者"  
## [36] "谣言"   "食用"   "人体"   "含有"   "请"    
## [41] "来源"   "药物"   "五线谱" "作者"   "授权"  
## [46] "网站"   "蝌蚪"   "转载"

8、查找与“冠状病毒”有0.8以上相关性的词

# Terms whose correlation with "冠状病毒" is at least 0.8
findAssocs(dtm.matrix, terms = "冠状病毒", corlimit = 0.8)

## $冠状病毒
##     ncov 念念不忘     非典 
##     0.91     0.91     0.91