31-tidytext包学习：计算TF和TF-IDF

2020-02-08 本文已影响0人 wonphen

1、读取并整理数据

library(pacman)
p_load(tidytext,tidyverse,stringr)

# Read the whole novel as a character vector, one element per line.
# n = -1L reads to the end of the file; skipNul = TRUE drops embedded NUL
# bytes instead of truncating the line at them.
# NOTE(review): encoding = "unknown" leaves the strings in the native
# encoding -- confirm this matches how the text file was saved.
txt <- readLines("./data_set/xslm/血色浪漫.txt", encoding = "unknown", n = -1L,
                 ok = TRUE, warn = TRUE, skipNul = TRUE)

血色浪漫

# Remove the publisher/author banner and the "[上一篇]" navigation marker
# wherever they occur, then trim surrounding whitespace from every line.
txt <- str_trim(
  gsub("出版社：长江文艺出版社作者：都梁|\\[上一篇\\]", "", txt)
)

# Glue all lines into a single string (space separated) and cut it at every
# "[下一篇]" marker. str_split() returns a list with one character vector of
# pieces per input string, so txt2 is a list of length one.
txt2 <- str_split(paste(txt, collapse = " "), "\\[下一篇\\]")

# Build a data frame with one row per "[下一篇]"-delimited piece.
# Each piece is split on its section marker (the literal "引子" or a
# parenthesised number such as "(12)"): the text before the marker becomes the
# chapter label and the text right after it becomes the section content.
# The vectors are sized from the data instead of a hard-coded 244, so a
# different number of pieces no longer leaves blank rows behind.
temp <- str_split(unlist(txt2), "引子|\\([0-9]+\\)")

# NOTE(review): "[《血色浪漫上》]" is a character class, so it deletes each of
# these characters individually rather than the literal title; kept unchanged
# because the chapter labels matched later depend on the current result.
chapter <- vapply(temp, function(parts) parts[1], character(1)) %>%
  gsub("[《血色浪漫上》]", "", .) %>%
  str_trim(side = "both")

# Only the fragment directly after the first marker is kept; a piece without
# any marker yields NA here.
content <- vapply(temp, function(parts) parts[2], character(1)) %>%
  str_trim(side = "both")

df <- tibble(chapter = chapter, content = content)

# The first piece has no chapter label left after the split: name it "引子".
df$chapter[1] <- "引子"

# Merge the sections of every chapter into one string per chapter.
# collapse = " " is forwarded to paste() so each group becomes a single
# string. Without it aggregate() returns a list column, and the gsub() below
# would coerce that list to deparsed text of the form c("...", "...").
txt3 <- aggregate(content ~ chapter, df, paste, collapse = " ")

# Strip percent signs, lower-case Latin letters, backslashes, double quotes
# and parentheses from the chapter text.
txt3$content <- gsub("[%a-z\\\\\\\"()]", "", txt3$content)

# Chapter labels in reading order; used as the factor levels so that sorting
# follows the book instead of the alphabet. Labels that are not listed here
# become NA when the factor is built.
chapter.levels <- c(
  "引子",
  "第一章", "第二章", "第三章", "第四章", "第五章",
  "第六章", "第七章", "第八章", "第九章", "第十章",
  "第十一章", "第十二章", "第十三章", "第十四章", "第十五章",
  "第十六章", "第十七章", "第十八章", "第十九章", "第二十章",
  "第二十一章", "第二十二章", "第二十三章", "第二十四章", "第二十五章",
  "尾声"
)
txt3$chapter <- factor(txt3$chapter, levels = chapter.levels)

# Reorder the rows by chapter (factor level order).
txt3 <- arrange(txt3, chapter)

# 写入文件
# write.csv(txt3,"./txt3.csv")

整理后的文档

2、中文分词

使用dplyr::count()函数计算词频一直有问题，最后选择使用text2vec包。

p_load(jiebaR,plyr,text2vec)
# txt3 <- read.csv("./txt3.csv",header = T,stringsAsFactors = F)


# jieba segmentation worker; the user dictionary file "xslm" holds the names
# of the novel's main characters.
wk <- worker(user = "./dict/characters-master/xslm")

# Tokenizer handed to itoken(): segments every input string with the worker
# above and returns a list with one character vector of tokens per string.
tok.fun <- function(strings) {
  lapply(strings, segment, wk)
}

# Token iterator over the chapter texts, using the chapter labels as ids.
it <- itoken(
  txt3$content,
  preprocessor = identity,
  tokenizer = tok.fun,
  ids = txt3$chapter,
  progressbar = interactive()
)

# Corpus-level vocabulary built from the iterator.
vocab <- create_vocabulary(it)

3、计算TF和TF-IDF

3.1 计算TF

# Per-chapter word counts.
# The vocabulary returned by create_vocabulary() is corpus-level: term_count
# is the total count over all chapters and doc_count is the number of chapters
# containing the term, so it cannot serve as a chapter id. Count the tokens of
# every chapter directly instead.
# `chapter` is the row index of txt3, kept as an integer so that numeric
# filters on it keep working.
tokens <- tok.fun(txt3$content)
book.words <- tibble(chapter = rep(seq_along(tokens), lengths(tokens)),
                     word = as.character(unlist(tokens))) %>%
  dplyr::count(chapter, word, name = "n")  # dplyr:: guards against plyr::count

# Highest word count within each chapter.
max.word <- aggregate(n ~ chapter, book.words, max)

# tf = occurrences of the word in the chapter /
#      occurrences of the most frequent word in that chapter
book <- book.words %>%
  left_join(max.word, by = "chapter") %>%
  dplyr::rename(n = n.x, max = n.y) %>%
  mutate(tf = n / max)
print(book)

# A tibble: 24,572 x 5
   chapter word         n   max     tf
     <int> <chr>    <int> <int>  <dbl>
 1       1 得太早       1    35 0.0286
 2       1 招待         1    35 0.0286
 3       1 全副武装     1    35 0.0286
 4       1 演义         1    35 0.0286
 5       1 低么         1    35 0.0286
 6       1 黄皮书       1    35 0.0286
 7       1 养人         1    35 0.0286
 8       1 付些         1    35 0.0286
 9       1 解           1    35 0.0286
10       1 破裂         1    35 0.0286
# ... with 24,562 more rows

3.2 查看前6章节TF的分布情况

# Histogram of tf for the rows whose chapter value is below 7, one panel per
# chapter value. fill is mapped to factor(chapter): a numeric column does not
# define fill groups for the histogram, a discrete one does.
book %>%
  filter(chapter < 7) %>%
  ggplot(aes(tf, fill = factor(chapter))) +
  geom_histogram(show.legend = FALSE, bins = 40, na.rm = TRUE, col = "white") +
  #  xlim(NA, 1) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~chapter, ncol = 2, scales = "free_y")

词频长尾分布

3.3 齐普夫定律（Zipf’s law）

在一个自然语言的语料库中，一个词的出现频数和这个词在这个语料中的排名（这个排名是基于出现次数的）成反比。Zipf定律是文献计量学的重要定律之一，它和罗特卡定律、布拉德福定律一起被并称为文献计量学的三大定律。

# Zipf plot: total frequency of every word against its frequency rank.
# The counts are summed per word first, then ranked from most to least
# frequent (rank 1 = most frequent); the row position is not a frequency rank.
# Both axes are logarithmic because an inverse-proportional relation shows up
# as a straight line only on a log-log scale.
book %>%
  dplyr::count(word, wt = n, name = "n") %>%
  dplyr::mutate(rank = dplyr::row_number(dplyr::desc(n))) %>%
  ggplot(aes(rank, n)) +
  geom_point(col = "red") +
  scale_x_log10() +
  scale_y_log10()

齐普夫定律

3.4 TF-IDF矩阵

# Attach tf, idf and tf_idf for every (word, chapter) pair, treating each
# chapter value as one document, and list the highest tf_idf rows first.
book.tfidf <- book %>%
  select(-max) %>%
  bind_tf_idf(word, chapter, n) %>%
  arrange(-tf_idf)

# Bar chart of the 13 highest tf_idf words for each chapter value below 3.
# The word factor levels are reversed so that, after coord_flip(), the bars
# follow the tf_idf ordering of book.tfidf from top to bottom.
# top_n() keeps ties, so a panel may show more than 13 bars.
book.tfidf %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  filter(chapter < 3) %>%
  group_by(chapter) %>%
  top_n(13, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(x = word, y = tf_idf, fill = chapter)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~chapter, ncol = 2, scales = "free") +
  coord_flip() +
  labs(x = NULL, y = "tf_idf")

TF-IDF