利用R语言进行邮件分类

2019-01-11 本文已影响0人何同尘

学习内容来自机器学习实例分析

数据来源

邮件数据来自SpamAssassin的公开语料库，可以在(http://spamassassin.apache.org/publiccorpus/)免费下载。
数据组成：
垃圾邮件 spam
易识别的正常邮件 easy
不易识别的正常邮件 hard
每种各两份

环境以及包要求

windows10+ Rstudio3.4
ggplot2 用于可视化
tm 用于文本处理（分割）

数据处理

邮件分类是一个二分类问题，在这里使用简单的朴素贝叶斯分类方法。

首先，将文件夹的文本导入到文本语料库中，形成文本矩阵，有多少封邮件就有多少个字符串。
对文本语料库进行分割，使用tm包处理，变成词项-文档矩阵（tdm）。假设邮件N条，特征词M个，就形成MxN的特征词矩阵。

构建模型

利用处理的数据构建贝叶斯分类模型
考虑先验概率和拉普拉斯平滑
检查模型准确度、错判率
推广到其他数据，是否有适用性
可视化

library(tm)
library(ggplot2)
# Locations of the e-mail folders, one variable per folder.
# The paths are relative, so they resolve against the current working
# directory. Every path ends with '/' so that a file name can be appended
# by plain string concatenation.
spam.path <- 'ML_for_Hackers/03-Classification/data/spam/'
spam2.path <- 'ML_for_Hackers/03-Classification/data/spam_2/'
easyham.path <-'ML_for_Hackers/03-Classification/data/easy_ham/'
easyham2.path <- 'ML_for_Hackers/03-Classification/data/easy_ham_2/'
hardham.path <- 'ML_for_Hackers/03-Classification/data/hard_ham/'
hardham2.path <- 'ML_for_Hackers/03-Classification/data/hard_ham_2/'
# Read one raw e-mail file and return its body as a single string.
#
# The body is everything after the first empty line of the file, i.e. the
# line that separates the headers from the message text.
#
# Args:
#   path: path of the e-mail file to read (opened as latin1 text).
#
# Returns:
#   The body lines joined with "\n". An empty string is returned when the
#   file contains no empty line or when nothing follows the first one.
get.msg <- function(path){
  con <- file(path, open = 'rt', encoding = 'latin1')
  # Close the connection on every exit path, including a failing readLines().
  on.exit(close(con), add = TRUE)
  text <- readLines(con)
  blank <- which(text == "")
  # No separator, or the separator is the last line: there is no body.
  # seq(n + 1, n) would count downwards and index past the end of `text`.
  if (length(blank) == 0 || blank[1] >= length(text)) {
    return("")
  }
  paste(text[seq(blank[1] + 1, length(text))], collapse = "\n")
}

# List the spam training files, leave out the entry named 'cmds', and read
# the body of every remaining file. The result is named by file name.
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[spam.docs != 'cmds']
all.spam <- sapply(spam.docs, function(doc) get.msg(paste0(spam.path, doc)))
# Build a term-document matrix from a vector of message bodies.
#
# Args:
#   doc.vec: vector of documents; it is wrapped in VectorSource() and turned
#            into a corpus.
#
# Returns:
#   The object returned by TermDocumentMatrix() for that corpus, built with
#   stop-word, punctuation and number removal switched on.
get.tdm <- function(doc.vec){
  doc.corpus <- Corpus(VectorSource(doc.vec))
  # TRUE is spelled out because T is an ordinary variable that can be
  # reassigned.
  # NOTE(review): minDocFreq is passed through unchanged; confirm that the
  # installed tm version still honours it (newer releases appear to document
  # `bounds` instead).
  control <- list(stopwords = TRUE, removePunctuation = TRUE,
                  removeNumbers = TRUE, minDocFreq = 2)
  TermDocumentMatrix(doc.corpus, control)
}
spam.tdm <- get.tdm(all.spam)

# Build a data frame with one row per term of the spam training set:
#   term       - the term itself
#   frequency  - total count of the term over all documents
#   density    - frequency divided by the sum of all frequencies
#   occurrence - share of documents in which the term appears at least once
spam.matrix <- as.matrix(spam.tdm)
spam.counts <- rowSums(spam.matrix)
# Create the columns with their final types directly instead of pushing the
# counts through a character matrix and converting them back.
spam.df <- data.frame(term = as.character(names(spam.counts)),
                      frequency = as.numeric(spam.counts),
                      stringsAsFactors = FALSE)
# Vectorised per-term document count. Unlike a loop over 1:nrow(), this is
# also valid when the matrix has no rows. unname() keeps the term names from
# leaking into the data frame's row names.
spam.occurrence <- unname(rowSums(spam.matrix > 0)) / ncol(spam.matrix)
spam.density <- spam.df$frequency / sum(spam.df$frequency)
spam.df <- transform(spam.df,
                     density = spam.density,
                     occurrence = spam.occurrence)
# Show the terms that appear in the largest share of documents.
head(spam.df[order(-spam.df$occurrence), ])



# Repeat the training steps for the easy-ham messages.
easyham.docs <- dir(easyham.path)
easyham.docs <- easyham.docs[easyham.docs != "cmds"]
# Use at most as many ham messages as there are spam messages. head() stops
# at the end of the list, whereas indexing with 1:length(spam.docs) yields NA
# file names when fewer ham files are available.
all.easyham <- sapply(head(easyham.docs, length(spam.docs)),
                      function(p) get.msg(file.path(easyham.path, p)))

easyham.tdm <- get.tdm(all.easyham)

# One row per term: term, frequency (total count), density (share of all
# counts) and occurrence (share of documents containing the term).
easyham.matrix <- as.matrix(easyham.tdm)
easyham.counts <- rowSums(easyham.matrix)
# Create the columns with their final types directly instead of pushing the
# counts through a character matrix and converting them back.
easyham.df <- data.frame(term = as.character(names(easyham.counts)),
                         frequency = as.numeric(easyham.counts),
                         stringsAsFactors = FALSE)
# Vectorised per-term document count; valid for a matrix without rows, and
# unnamed so the term names do not become row names.
easyham.occurrence <- unname(rowSums(easyham.matrix > 0)) / ncol(easyham.matrix)
easyham.density <- easyham.df$frequency / sum(easyham.df$frequency)

easyham.df <- transform(easyham.df,
                        density = easyham.density,
                        occurrence = easyham.occurrence)
head(easyham.df)


# Score a single message against the term statistics of one class.
#
# Args:
#   path: file of the message to score.
#   training.df: data frame with a `term` and an `occurrence` column.
#   prior: prior probability of the class.
#   c: probability used for every message term missing from training.df.
#
# Returns:
#   prior * (product of the occurrence values of the shared terms) *
#   c ^ (number of message terms that are not in training.df).
# NOTE(review): this is a plain product of small factors; for long messages
# it may underflow to 0 in double precision - confirm whether that matters
# for the comparison done by the callers.
classify.email <- function(path,training.df,prior = 0.2,c = 1e-6){
  term.counts <- rowSums(as.matrix(get.tdm(get.msg(path))))
  # Terms of the message that also occur in the training data.
  shared <- intersect(names(term.counts), training.df$term)
  if (length(shared) >= 1) {
    shared.probs <- training.df$occurrence[match(shared, training.df$term)]
    return(prior * prod(shared.probs) * c^(length(term.counts) - length(shared)))
  }
  # Nothing in common: every message term gets the fallback probability.
  prior * c^(length(term.counts))
}

# Score the hard-ham messages against both the spam and the easy-ham
# statistics.
hardham.docs <- dir(hardham.path)
# Drop the entry named 'cmds' so it is not scored as a message. The result
# must be stored back into hardham.docs, the variable the calls below use.
hardham.docs <- hardham.docs[hardham.docs != 'cmds']

hardham.spamtest <- sapply(hardham.docs,
                           function(p) classify.email(file.path(hardham.path, p),
                                                      training.df = spam.df))

hardham.hamtest <- sapply(hardham.docs,
                          function(p) classify.email(file.path(hardham.path, p),
                                                     training.df = easyham.df))

# TRUE where the spam score is strictly greater than the ham score.
hardham.res <- hardham.spamtest > hardham.hamtest
summary(hardham.res)



# Test the classifier against every type of e-mail.

# Score one message against both sets of training statistics.
#
# Args:
#   path: file of the message to classify.
#
# Returns:
#   c(spam score, ham score, flag); the flag is 1 when the spam score is
#   strictly greater than the ham score and 0 otherwise.
spam.classifier <- function(path){
  scores <- c(classify.email(path, spam.df), classify.email(path, easyham.df))
  is.spam <- scores[1] > scores[2]
  c(scores, as.numeric(is.spam))
}

# Collect the file names of the three remaining folders, leaving out the
# entry named "cmds" in each.
easyham2.docs <- dir(easyham2.path)
easyham2.docs <- easyham2.docs[easyham2.docs != "cmds"]

hardham2.docs <- dir(hardham2.path)
hardham2.docs <- hardham2.docs[hardham2.docs != "cmds"]

spam2.docs <- dir(spam2.path)
spam2.docs <- spam2.docs[spam2.docs != "cmds"]

# Classify every message. Each list element is the vector returned by
# spam.classifier(); warnings raised along the way are suppressed.
easyham2.class <- suppressWarnings(
  lapply(easyham2.docs,
         function(doc) spam.classifier(file.path(easyham2.path, doc)))
)
hardham2.class <- suppressWarnings(
  lapply(hardham2.docs,
         function(doc) spam.classifier(file.path(hardham2.path, doc)))
)
spam2.class <- suppressWarnings(
  lapply(spam2.docs,
         function(doc) spam.classifier(file.path(spam2.path, doc)))
)

# Stack the per-message result vectors into one matrix per e-mail type.
easyham2.matrix <- do.call(rbind, easyham2.class)
hardham2.matrix <- do.call(rbind, hardham2.class)
spam2.matrix <- do.call(rbind, spam2.class)

# Append the type label as a fourth column; this makes the matrices
# character matrices.
easyham2.final <- cbind(easyham2.matrix, "EASYHAM")
hardham2.final <- cbind(hardham2.matrix, "HARDHAM")
spam2.final <- cbind(spam2.matrix, "SPAM")

# Combine everything into one data frame and restore the column types.
class.matrix <- rbind(easyham2.final, hardham2.final, spam2.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
colnames(class.df) <- c("Pr.SPAM", "Pr.HAM", "Class", "Type")
class.df[["Pr.SPAM"]] <- as.numeric(class.df[["Pr.SPAM"]])
class.df[["Pr.HAM"]] <- as.numeric(class.df[["Pr.HAM"]])
# The flag was stored as "1"/"0"; turn it into TRUE/FALSE.
class.df[["Class"]] <- as.logical(as.numeric(class.df[["Class"]]))
class.df[["Type"]] <- as.factor(class.df[["Type"]])

# Scatter plot of the log scores: ham score on x, spam score on y, one point
# shape per e-mail type.
class.plot <- ggplot(class.df, aes(x = log(Pr.HAM), y = log(Pr.SPAM))) +
  geom_point(aes(shape = Type, alpha = 0.5)) +
  # Reference line y = x, where both log scores are equal.
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(name = "Email Type",
                     values = c("EASYHAM" = 1, "HARDHAM" = 2, "SPAM" = 3)) +
  scale_alpha(guide = "none") +
  labs(x = "log[Pr(HAM)]", y = "log[Pr(SPAM)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())

class.plot

# Shares of FALSE and TRUE entries in a logical vector.
#
# Args:
#   bool.vector: vector of TRUE/FALSE flags.
#
# Returns:
#   c(share of FALSE, share of TRUE). Both shares are relative to the full
#   length of the input, so NA entries count towards the denominator only.
get.results <- function(bool.vector)
{
  total <- length(bool.vector)
  n.false <- sum(bool.vector == FALSE, na.rm = TRUE)
  n.true <- sum(bool.vector == TRUE, na.rm = TRUE)
  c(n.false, n.true) / total
}

# Share of messages flagged not-spam / spam for each e-mail type. The table
# has one row per type and one column per outcome, i.e. it is 3 x 2.
easyham2.col <- get.results(class.df$Class[which(class.df$Type == "EASYHAM")])
hardham2.col <- get.results(class.df$Class[which(class.df$Type == "HARDHAM")])
spam2.col <- get.results(class.df$Class[which(class.df$Type == "SPAM")])

class.res <- rbind(easyham2.col, hardham2.col, spam2.col)
colnames(class.res) <- c("NOT SPAM", "SPAM")
print(class.res)

利用R语言进行邮件分类

数据来源

环境以及包要求

数据处理

构建模型

猜你喜欢

热点阅读