番外.1
2021-03-10 本文已影响0人
半夜一更
昨天有个以前的同学提了个需求,大概是这样的:
image.png

计算单词的P评论数。
昨天忙不过来,到今天才把脚本写掉,具体思路是这样的:
# Setup: load the Excel reader, move into the data folder and read the first
# worksheet of the workbook into `data_raw`.
# install.packages("openxlsx")  # one-time install of the Excel reader package
library(openxlsx)
getwd()  # show the current working directory
setwd("XXX")  # change the working directory ("XXX" is presumably a placeholder)
rm(list = ls())  # remove every object from the current environment
# Read sheet 1; the first column is kept as data rather than used as row names.
data_raw <- read.xlsx(
  xlsxFile = "副本数据分析.xlsx",
  sheet = 1,
  rowNames = FALSE
)
# Per-word review totals.
#
# Every title in data_raw$Title is split into words, and each occurrence of a
# word is credited with that title's review count (data_raw$Num). `result`
# ends up as a data frame with one row per distinct word (the word is the row
# name, rows are in order of first appearance) and a single column "Num"
# holding the summed review count.
stopifnot(all(c("Title", "Num") %in% names(data_raw)))

# Split one title into words: the separators ( ) , [ ] / | become spaces, the
# text is cut on single spaces, and the empty pieces left behind by adjacent
# separators are dropped so that "" is never counted as a word. A missing
# title yields no words.
# NOTE(review): the original replaced "," twice; if a full-width comma was
# meant as the second one, add it to the character class below.
split_title_words <- function(title) {
  if (is.na(title)) {
    return(character(0))
  }
  cleaned <- gsub("[][(),/|]", " ", title)
  pieces <- strsplit(cleaned, " ", fixed = TRUE)[[1]]
  pieces[nzchar(pieces)]
}

words_per_title <- lapply(as.character(data_raw$Title), split_title_words)
# To count a word at most once per title, additionally run:
# words_per_title <- lapply(words_per_title, unique)

# The original passed Num to rep(), which drops any fractional part; truncate
# here so the totals stay the same.
review_counts <- trunc(as.numeric(data_raw$Num))
no_count <- is.na(review_counts)
if (any(no_count)) {
  warning(
    sum(no_count), " title(s) without a usable review count were skipped",
    call. = FALSE
  )
  review_counts[no_count] <- 0
}
if (any(review_counts < 0)) {
  stop("negative review count in column Num", call. = FALSE)
}

# One entry per word occurrence, paired with the review count of its title.
# Summing these weights replaces physically repeating each word Num times.
all_words <- as.character(unlist(words_per_title, use.names = FALSE))
word_weights <- rep(review_counts, times = lengths(words_per_title))

# Words that only occur in titles with zero reviews never appeared in the
# original output, so leave them out here as well.
counted <- word_weights > 0
all_words <- all_words[counted]
word_weights <- word_weights[counted]

# Sum the weights per distinct word, keeping first-appearance order.
word <- unique(all_words)
totals <- vapply(
  split(word_weights, factor(all_words, levels = word)),
  sum,
  numeric(1)
)
# Store whole-number totals as integers so the CSV shows e.g. 100000 rather
# than 1e+05.
if (all(totals <= .Machine$integer.max)) {
  totals <- as.integer(totals)
}
result <- data.frame(Num = unname(totals), row.names = word)
# Save the per-word totals to a CSV file using write.csv's default settings.
write.csv(x = result, file = "result_20210310.csv")
先这样吧,以后如果他还有需要再优化吧。