# R文本分析(三)
# 主题模型训练
############################################
library(lda)
# Build the lda-format corpus (per-document word indices plus vocabulary)
# from `sample.words`, lower-casing every token.
# NOTE(review): `sample.words` is defined outside this section; presumably a
# character vector with one pre-segmented document per element -- confirm.
corpus <- lexicalize(sample.words, lower = TRUE)
num.topics <- 4  # number of topics to fit
num.docs <- length(corpus$documents)

## Initialize the params
# Random +/-1 starting regression coefficients, one per topic.
params <- sample(c(-1, 1), num.topics, replace = TRUE)
# Randomly generated +/-100 document ratings used as the sLDA response.
# slda.em() expects one annotation per document, so the count is taken from
# the corpus instead of being hard-coded.
poliblog.ratings <- sample(c(-100, 100), num.docs, replace = TRUE)

# Fit supervised LDA. The ratings are rescaled to +/-1, and every argument is
# passed by name so the call does not depend on positional matching.
result <- slda.em(
  documents = corpus$documents,
  K = num.topics,
  vocab = corpus$vocab,
  num.e.iterations = 30,
  num.m.iterations = 12,
  alpha = 1.0,
  eta = 0.1,
  annotations = poliblog.ratings / 100,
  params = params,
  variance = 0.25,
  lambda = 1.0,
  logistic = FALSE,
  method = "sLDA"
)
## Make a pretty picture.
# Label every topic with its index followed by its 8 highest-scoring words.
top_words <- top.topic.words(result$topics, 8, by.score = TRUE)
Topics <- apply(top_words, 2, paste, collapse = " ")
topic_labels <- paste0(seq_along(Topics), Topics)

# Score of a topic = sum of its row in result$document_sums.
topic_scores <- unname(rowSums(result$document_sums))

# Rank topics by score. The bar order is fixed through the factor levels
# rather than by prefixing labels with sort letters, so it works for any
# number of topics and the axis labels stay clean.
score_order <- order(topic_scores, decreasing = TRUE)
ranked_labels <- topic_labels[score_order]
p1 <- data.frame(
  a = factor(ranked_labels, levels = ranked_labels),
  主题得分 = topic_scores[score_order]
)

library(ggplot2)
# Horizontal bar chart of the topic scores, filled by score.
ggplot(data = p1, aes(x = a, y = 主题得分, fill = 主题得分)) +
  geom_bar(colour = "black", stat = "identity") +
  labs(x = "主题", y = "得分") +
  ggtitle("文档主题排名顺序") +
  coord_flip()
# Top 20 words of every topic; each column of `Topics` is one topic.
Topics <- top.topic.words(result$topics, 20, by.score = TRUE)

# Flatten the word matrix column by column and tag each word with its topic.
# The topic count comes from the matrix itself: a hard-coded 1:5 loop indexes
# a non-existent fifth column when the model was fitted with 4 topics.
topic_names <- paste0("主题", seq_len(ncol(Topics)))
words <- as.vector(Topics)
word_topic <- rep(topic_names, each = nrow(Topics))

# Word-by-topic count matrix used as the term matrix for the cloud.
term_topic <- as.matrix(table(words, word_topic))

library(wordcloud)
# One colour per topic; brewer.pal() comes from RColorBrewer.
comparison.cloud(
  term_topic,
  scale = c(1, 1.5),
  rot.per = 0.5,
  colors = brewer.pal(ncol(term_topic), "Dark2")
)
# 主题1:金融;主题2:禅道;主题3:军事;主题4:科技