
30 - Learning the tidytext Package: Text Tidying and Sentiment Analysis

2020-02-07  wonphen

1. Load the data

require(pacman)
# Note: plyr is loaded after dplyr (via tidyverse) and masks several of its
# verbs, which is why dplyr::count() is written with an explicit prefix below
p_load(tidyverse, tidytext, text2vec, jiebaR, plyr)
df <- read.csv("./signature.csv", stringsAsFactors = F, header = T) %>% 
  select("id", signature = "Signature")

# Convert traditional Chinese characters to simplified
p_load(ropencc)
df$signature <- run_convert(converter(TW2S), df$signature)
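A quick way to confirm the conversion works is to run it on a short traditional-Chinese string (the sample string below is illustrative, not from the data):

# Should print the simplified form "开放中文转换"
run_convert(converter(TW2S), "開放中文轉換")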

2. Chinese word segmentation

wk <- worker(stop_word = "./dict/characters-master/stop_words", lines = TRUE)

text <- df %>%
  mutate(words = map(signature, segment, jieba = wk)) %>%
  select(id, words) %>%
  as_tibble()  # tbl_df() is deprecated; as_tibble() is its replacement

# Paste each signature's tokens back into one space-separated string;
# map_chr() returns a character column rather than a list column
text$words <- map_chr(text$words, paste, collapse = " ")
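To see what the segmenter produces for a single string, it can be called directly (the example sentence is made up; the exact split depends on the dictionary and the stop word list):

# Segment one example string with the worker defined above
segment("热爱生活，努力做事", wk)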

3. Tidy into one-token-per-row form with unnest_tokens()

# Tokenize the words column; the output column is also named words
text.df <- text %>% unnest_tokens(words, words)

# For a book-length text, the same function can split by chapter, e.g.:
# austen_chapters <- austen_books() %>% unnest_tokens(chapter, text, token = "regex", pattern = "Chapter|CHAPTER [\\dIVXLC]")
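unnest_tokens() lowercases tokens and strips punctuation by default, and copies the remaining columns (here id) onto every token row. A minimal illustration with one made-up row (note that token boundaries ultimately come from stringi's ICU rules, so the jieba spacing is generally, though not always, preserved):

# One made-up signature, already space-separated by jieba
tibble(id = 1, words = "人生 努力 生活") %>%
  unnest_tokens(words, words)
# -> three rows, one token each, with id repeated on every row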

4. Remove stop words

# Second pass at stop word removal: anti_join against a custom list
stop.words <- read.table("./dict/停用词表.txt", header = F, sep = "\n", quote = "",
                         fileEncoding = "UTF-8", col.names = "words")
# Trim whitespace around each stop word
stop.words$words <- str_trim(stop.words$words)

# To add extra words to the stop word list:
# stop.words <- bind_rows(tibble(words = c("于", "有")), stop.words)

text.df <- text.df %>% anti_join(stop.words,by="words")
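anti_join() keeps only the rows of text.df whose words value does not appear in stop.words. A toy example of the semantics:

# "的" is dropped because it is in the toy stop list; "人生" survives
tibble(words = c("人生", "的")) %>%
  anti_join(tibble(words = "的"), by = "words")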

5. Find the most frequent words with dplyr::count()

text.df %>% dplyr::count(words,sort=TRUE)
## # A tibble: 1,267 x 2
##    words     n
##    <chr> <int>
##  1 人生     22
##  2 心       22
##  3 不       17
##  4 爱       16
##  5 中       15
##  6 生活     14
##  7 努力     13
##  8 有       12
##  9 无       11
## 10 做       11
## # ... with 1,257 more rows
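dplyr::count(words, sort = TRUE) is shorthand for a group-count-arrange pipeline; spelled out, it is equivalent to the following (the dplyr:: prefixes matter here because plyr masks these verbs):

text.df %>%
  group_by(words) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::arrange(desc(n))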

6. Bar chart of word frequencies

text.df %>% dplyr::count(words, sort = TRUE) %>%
  filter(n > 11) %>%
  ggplot(aes(reorder(words, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "")
Word frequency bar chart
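The filter(n > 11) cutoff is tied to this particular dataset; slice_max() (dplyr >= 1.0.0) is an alternative that asks for a fixed number of top words instead:

text.df %>% dplyr::count(words, sort = TRUE) %>%
  slice_max(n, n = 10) %>%   # top 10 words rather than a hard-coded threshold
  ggplot(aes(reorder(words, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "")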

7. Compute term frequency (TF)

# Raw count of each word within each signature
tf <- text.df %>% dplyr::count(id, words)
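Strictly speaking, this is a raw count rather than a normalized term frequency. tidytext's bind_tf_idf() can add the normalized tf along with idf and tf-idf columns (tf.idf is just a name chosen here):

# tf = n / total words in that signature; idf and tf_idf columns are added too
tf.idf <- tf %>% bind_tf_idf(words, id, n)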

8. Load the HowNet sentiment lexicons

positive <- read.table("./dict/情感及修饰词/正面情感词语(中文).txt", header = F,
    stringsAsFactors = F, strip.white = T, skip = 1, col.names = "words")
# Trim surrounding whitespace
positive$words <- str_trim(positive$words)

negative <- read.table("./dict/情感及修饰词/负面情感词语(中文).txt", header = F,
    stringsAsFactors = F, strip.white = T, skip = 1, col.names = "words")

negative$words <- str_trim(negative$words)
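Lexicon files sometimes contain duplicate or empty entries, and any duplicate would multiply rows in the inner_join below; a defensive (optional) cleanup:

positive <- positive %>% distinct(words) %>% filter(words != "")
negative <- negative %>% distinct(words) %>% filter(words != "")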

9. Match against the lexicons and visualize

# Count how many positive words each signature contains;
# dplyr::count(id) replaces the count-then-aggregate two-step
df.positive <- text.df %>%
  inner_join(positive, by = "words") %>%
  dplyr::count(id)

# Count how many negative words each signature contains
df.negative <- text.df %>%
  inner_join(negative, by = "words") %>%
  dplyr::count(id)

df.sentiment <- df.positive %>% full_join(df.negative, by = "id")

# Signatures matched by only one lexicon get NA from the full join; treat as 0
df.sentiment[is.na(df.sentiment)] <- 0

# Net positive counts as positive, net zero as neutral, net negative as negative
df.sentiment <- df.sentiment %>% 
  mutate(sentiment = case_when(n.x - n.y > 0 ~ "positive",
                               n.x - n.y == 0 ~ "neutral",
                               n.x - n.y < 0 ~ "negative")) %>%
  select(id, sentiment)
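To make the classification rule concrete, here it is applied to three made-up count pairs:

demo <- tibble(id = 1:3, n.x = c(2, 1, 0), n.y = c(1, 1, 2)) %>%
  mutate(sentiment = case_when(n.x - n.y > 0 ~ "positive",
                               n.x - n.y == 0 ~ "neutral",
                               n.x - n.y < 0 ~ "negative"))
# id 1 -> positive, id 2 -> neutral, id 3 -> negative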

table(df.sentiment$sentiment) %>% as.data.frame() %>%
  ggplot(aes(Var1, Freq)) +   # Var1 is already a factor after as.data.frame()
  geom_col(show.legend = F) +
  labs(x = "", y = "")
Share of each sentiment class

10. Word cloud

p_load(wordcloud2)
temp <- text.df %>% dplyr::count(words) %>% filter(n >= 2)
wordcloud2(temp, size = 1, color = "random-dark", backgroundColor = "gray",
           minRotation = -pi/4, maxRotation = -pi/4, shape = "circle",
           fontFamily = "苹方")
Word cloud
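wordcloud2() returns an htmlwidget rather than a static plot; to keep a standalone copy, htmlwidgets::saveWidget() works (the file name here is arbitrary):

wc <- wordcloud2(temp, size = 1)
htmlwidgets::saveWidget(wc, "wordcloud.html", selfcontained = TRUE)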

11. Comparison cloud of the top positive and negative words

p_load(reshape2)
# Tally positive sentiment words
posi <- text.df %>% select(words) %>% dplyr::count(words) %>%
  inner_join(positive, by = "words") %>%
  filter(n >= 2) %>% arrange(-n) %>%
  mutate(sentiment = "positive")

# Tally negative sentiment words
nega <- text.df %>% select(words) %>%
  dplyr::count(words) %>% 
  inner_join(negative, by = "words") %>%
  arrange(-n) %>%
  mutate(sentiment = "negative")

posi %>% rbind(nega) %>% 
  acast(words ~ sentiment, value.var = "n", fill = 0) %>%
  wordcloud::comparison.cloud(scale = c(3.3, .3), colors = c("gray80", "gray20"),
                              match.colors = T,
                              rot.per = 0.1, title.size = 2.5,
                              title.bg.colors = c("green", "red"),
                              title.colors = "gray20")
Comparison cloud of positive vs. negative words
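comparison.cloud() expects a term matrix with one row per word and one column per sentiment, which is exactly what acast() builds above. With made-up counts:

toy <- tibble(words = c("爱", "烦"), sentiment = c("positive", "negative"), n = c(3, 2))
acast(toy, words ~ sentiment, value.var = "n", fill = 0)
# -> a 2 x 2 matrix: rows are the words, columns negative/positive, zeros filled in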