32-tidytext包学习：n-grams和词语相关性可视化

2020-02-09 本文已影响0人 wonphen

1、读取数据

library(pacman)
p_load(dplyr,tidytext)

# 使用已经分好词的微信签名数据
txt <- read.csv("signature.jieba.csv",header = T,stringsAsFactors = F) %>%
  select(id,content) %>% filter(content != "")

2、n-grams增加文字信息量

如果是1-ngrams，有一句话：you need many money
分割成：

    terms
1: you 
2: need 
3: many 
4: money

那么2-ngrams呢？

        terms 
1: you need
2: need many
3: many money
4:  need many

txt.grams <- txt %>% unnest_tokens(words,content,token = "ngrams",n=2);txt.grams

##       id         words
## 1   V102     修身 齐家
## 2   V102       齐家 兼
## 3   V102         兼 济
## 4   V102       济 天下
##  [ reached 'max' / getOption("max.print") -- omitted 2099 rows ]

3、n-grams的计数和过滤

# 计数。分词是对的，但是计数时显示分词不准确，可能是tidytext对中文支持不好
txt.n <- txt.grams %>% dplyr::count(words,sort = T);txt.n

## # A tibble: 2,340 x 2
##    words       n
##    <chr>   <int>
##  1 <NA>       20
##  2 才 是       7
##  3 的 人       6
##  4 每 一天     5
##  5 我 的       5
##  6 爱 自己     4
##  7 的 时候     4
##  8 都 是       4
##  9 而 不       4
## 10 命 里       4
## # ... with 2,330 more rows

# 将每一行按空格切分
txt.separate <- txt.grams %>% tidyr::separate(words,c("wd1","wd2"),sep=" ")
txt.separate

##       id      wd1    wd2
## 1   V102     修身   齐家
## 2   V102     齐家     兼
## 3   V102       兼     济
## 4   V102       济   天下

##  [ reached 'max' / getOption("max.print") -- omitted 2559 rows ]

# 加载停止词
stop.words <- read.csv("./dict/characters-master/stop_words",
                       col.names = "words",stringsAsFactors = F)

# 去除包含停止词的行
txt.filter <- txt.separate %>%
  filter(!wd1 %in% stop.words$words) %>%
  filter(!wd2 %in% stop.words$words) ;txt.filter

# 重新计数
txt.count <- txt.filter %>% dplyr::count(wd1,wd2,sort = T);txt.count

## # A tibble: 2,330 x 3
##    wd1   wd2       n
##    <chr> <chr> <int>
##  1 <NA>  <NA>     20
##  2 才    是        7
##  3 的    人        6
##  4 每    一天      5
##  5 我    的        5
##  6 爱    自己      4
##  7 的    时候      4
##  8 都    是        4
##  9 而    不        4
## 10 命    里        4
## # ... with 2,320 more rows

# 再次组合，从而清除不包含停止词的词语组合
txt.united <- txt.filter %>%
  tidyr::unite(words,wd1,wd2,sep = " ");txt.united

##       id         words
## 1   V102     修身 齐家
## 2   V102       齐家 兼
## 3   V102         兼 济
## 4   V102       济 天下

##  [ reached 'max' / getOption("max.print") -- omitted 2539 rows ]

4、TF-IDF矩阵

检查词语组合的 tf-idf 相比检查单个词语的有利也有弊。词语组合可能捕捉到单词计数时没有出现的结构，并且可能提供使单词更容易理解的上下文(例如“格外清晰”比“清晰”的信息量更大)。然而“格外清晰”的数量组合也很少：词语组合对比它的任何一个组成词都要稀少。因此，当您有一个非常大的文本数据集时，bigrams 可能特别有用。

txt.tf.idf <- txt.united %>%
  dplyr::count(words,id,sort = T) %>%
  bind_tf_idf(words,id,n) %>%
  arrange(desc(tf_idf));head(txt.tf.idf)

## # A tibble: 6 x 6
##   words     id        n    tf   idf tf_idf
##   <chr>     <chr> <int> <dbl> <dbl>  <dbl>
## 1 啦 啦     V518      2     1  5.97   5.97
## 2 爱 家人   V498      1     1  5.97   5.97
## 3 宝藏 女孩 V627      1     1  5.97   5.97
## 4 点点 妈妈 V93       1     1  5.97   5.97
## 5 殿 武     V425      1     1  5.97   5.97
## 6 独自 旅行 V60       1     1  5.97   5.97

5、使用n-grams语料进行情绪分析

在前面的情感分析中（30-tidytext包学习：文本整理与情绪分析：https://www.jianshu.com/p/5620b63f15f5）仅仅是根据一个词汇来计算积极或消极词汇的出现次数。这种方法的一个问题是，一个词的上下文也很重要。例如，“高兴”和“喜欢”这两个词会被视为积极的，即使是在“我不高兴，我不喜欢它!”这样的句子中。通过对 bigram 数据进行情绪分析，我们可以检查与情绪相关的词前面是否存在“不”或其他否定词。利用这一点来忽略甚至逆转他们对情绪得分的贡献。

negation.words <- c("不","不是","从不","没有")
txt.negation <- txt.separate %>%
  filter(wd1 %in% negation.words) %>%
  count(wd1,wd2,id,sort = TRUE);txt.negation

## # A tibble: 53 x 4
##    wd1   wd2   id        n
##    <chr> <chr> <chr> <int>
##  1 不    可以  V401      2
##  2 不    辩    V326      1
##  3 不    读书  V401      1
##  4 不    多    V159      1
##  5 不    多    V76       1
##  6 不    付出  V149      1
##  7 不    辜负  V420      1
##  8 不    归    V516      1
##  9 不    害怕  V571      1
## 10 不    好奇  V546      1
## # ... with 43 more rows

另一方面，在给情感词赋权程度分值的时候，为了减少该词在“错误”的方向上的贡献，可以根据其前面出现否定词的频率适当降低分值。

6、bigram的网络图

p_load(igraph)
txt.separate %>%
  filter(wd1 %in% negation.words) %>%
  count(wd1,wd2,sort = TRUE) %>%
  graph_from_data_frame()

## IGRAPH 2546b8d DN-- 54 51 -- 
## + attr: name (v/c), n (e/n)
## + edges from 2546b8d (vertex names):
##  [1] 不  ->多   不  ->可以 不  ->困于 不  ->辩  
##  [5] 不  ->读书 不  ->付出 不  ->辜负 不  ->归  
##  [9] 不  ->害怕 不  ->好奇 不  ->后悔 不  ->坚强
## [13] 不  ->将   不  ->经   不  ->惊   不  ->卡  
## [17] 不  ->开始 不  ->哭   不  ->离   不  ->恋  
## [21] 不  ->怒   不  ->弃   不  ->轻言 不  ->轻易
## [25] 不  ->入   不  ->上学 不  ->是   不  ->谈  
## [29] 不  ->唐   不  ->完美 不  ->忘   不  ->写  
## + ... omitted several edges

p_load(ggraph)
txt.separate %>%
  filter(wd1 %in% negation.words) %>%
  count(wd1,wd2,sort = TRUE) %>%
  ggraph(layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label=name),vjust=1,hjust=1)

网络图

6、马尔可夫链（Markov chain）

a <- grid::arrow(type = "closed",length = unit(.15,"inches"))

txt.separate %>%
  filter(wd1=="好") %>%
  count(wd1,wd2,sort = TRUE) %>%
  ggraph(layout = "fr") +
      geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

马尔可夫链

7、找出签名数据中常见的词语组合

p_load(widyr)

word.pairs <- txt %>%
  unnest_tokens(content,content) %>%
  filter(!content %in% stop.words) %>%
  pairwise_count(content,id,sort=T);word.pairs

## # A tibble: 20,482 x 3
##    item1 item2     n
##    <chr> <chr> <dbl>
##  1 是    的       26
##  2 的    是       26
##  3 你    的       14
##  4 的    你       14
##  5 的    有       10
##  6 的    我       10
##  7 有    的       10
##  8 我    的       10
##  9 人    的       10
## 10 的    人       10
## # ... with 20,472 more rows

# 找出第二个词为“的”的词语组合
word.pairs %>% filter(item2=="的")

## # A tibble: 460 x 3
##    item1 item2     n
##    <chr> <chr> <dbl>
##  1 是    的       26
##  2 你    的       14
##  3 有    的       10
##  4 我    的       10
##  5 人    的       10
##  6 了    的        8
##  7 都    的        8
##  8 自己  的        8
##  9 在    的        7
## 10 和    的        7
## # ... with 450 more rows

8、词语组合之间的相关性

word.cors <- txt %>%
  unnest_tokens(content,content) %>%
  dplyr::count(content,id,sort = T) %>%
  group_by(id) %>%
  filter(n>3) %>%
  pairwise_cor(content,id,sort=T);word.cors

## # A tibble: 20 x 3
##    item1 item2 correlation
##    <chr> <chr>       <dbl>
##  1 可以  不          1    
##  2 不    可以        1    
##  3 决定  不         -0.333
##  4 再    不         -0.333
##  5 珍惜  不         -0.333
##  6 不    决定       -0.333
##  7 可以  决定       -0.333
##  8 再    决定       -0.333
##  9 珍惜  决定       -0.333
## 10 决定  可以       -0.333
## 11 再    可以       -0.333
## 12 珍惜  可以       -0.333
## 13 不    再         -0.333
## 14 决定  再         -0.333
## 15 可以  再         -0.333
## 16 珍惜  再         -0.333
## 17 不    珍惜       -0.333
## 18 决定  珍惜       -0.333
## 19 可以  珍惜       -0.333
## 20 再    珍惜       -0.333

# 找出与“珍惜”最相关的词语
word.cors %>% filter(item1=="珍惜")

## # A tibble: 4 x 3
##   item1 item2 correlation
##   <chr> <chr>       <dbl>
## 1 珍惜  不         -0.333
## 2 珍惜  决定       -0.333
## 3 珍惜  可以       -0.333
## 4 珍惜  再         -0.333

# 连线的颜色深浅代表关系的强弱
word.cors %>%
  filter(correlation < 0) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

词语相关性