【文本挖掘】class 2
2019-03-07 本文已影响7人
caokai001
词云_R
课程链接
http://xiajingbo.weebly.com/uploads/1/3/3/0/13306375/2._r_package_and_word_cloud.leaflet_8pages_.pdf
### tm requires R 3.5.2 to be installed; installing the package also pulls in several dependency packages
library(NLP)
library(tm)
# Folder that holds the raw text files for this exercise.
cname <- "C:/Users/16926/Desktop/研究生/【研究生】/研究生课程/文本挖掘/class2"

# Build the corpus from the files found in that folder.
docs <- Corpus(x = DirSource(directory = cname))

# Show an overview of what was loaded.
summary(docs)
# ---- Text clean-up ----
# Replace every match of `pattern` with a single space. Wrapping the function
# in content_transformer() lets tm_map() apply it to the text of each document
# instead of overwriting corpus elements by hand.
to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Turn "/", "@" and "|" into spaces first: removePunctuation deletes these
# characters outright, so running it earlier would fuse the words on either
# side (e.g. "and/or" -> "andor") and leave nothing for this step to match.
docs <- tm_map(docs, to_space, "[/@|]")

# Removing punctuation:
docs <- tm_map(docs, removePunctuation)

# Removing numbers:
docs <- tm_map(docs, removeNumbers)

# Converting to lowercase. tolower() is a plain base function rather than a tm
# transformation, so it is wrapped in content_transformer() as well.
docs <- tm_map(docs, content_transformer(tolower))

# Removing "stopwords" (common words) that usually have no analytic value.
docs <- tm_map(docs, removeWords, stopwords("english"))

# Removing particular words that are boilerplate in this corpus.
docs <- tm_map(
  docs,
  removeWords,
  c(
    "department", "email", "doi", "center", "sciences", "pubmed", "nature",
    "university", "pmid", "author", "school", "research"
  )
)

# No conversion to PlainTextDocument is needed at this point: every step above
# goes through tm_map() with a proper transformation, so the corpus elements
# are still text documents and can be passed straight to DocumentTermMatrix().
# ---- Term matrices ----
# Document-term matrix: one row per document, one column per term.
dtm <- DocumentTermMatrix(docs)

# The transposed layout (terms in rows, documents in columns).
tdm <- TermDocumentMatrix(docs)

# Dense copy of the document-term matrix; used for the column totals below
# and for the CSV export at the end of this section.
m <- as.matrix(dtm)

# Total count of every term across all documents.
freq <- colSums(m)
freq
names(freq)

# Index permutation that sorts the terms from least to most frequent.
ord <- order(freq, decreasing = FALSE)

# Export the matrix so it can be opened in Excel.
write.csv(x = m, file = "dtm.csv")
# ---- Word frequency ----
# There are lots of terms, so only look at the two ends of the ranking:
# the 10 least frequent and the 50 most frequent terms.
head(freq[ord], 10)
tail(freq[ord], 50)

# Long-format table of term / count pairs for plotting.
wf <- data.frame(word = names(freq), freq = freq)
head(wf)

# Bar chart of the terms that occur more than 50 times.
library(ggplot2)
frequent_terms <- wf[which(wf$freq > 50), , drop = FALSE]
p <- ggplot(frequent_terms, aes(x = word, y = freq)) +
  geom_bar(stat = "identity") +
  # Tilt the term labels so they do not overlap on the x axis.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
# Figure: word-frequency distribution (词频率分布)
# ---- Word cloud ----
# Fix the random seed so the cloud layout is reproducible between runs.
set.seed(142)
library(wordcloud)

# Draw every term that occurs at least 25 times.
wordcloud(words = names(freq), freq = freq, min.freq = 25)
# Figure: word cloud (词云)