R语言与统计分析 | R语言作业 | 数据科学与R语言

36-text2vec包学习:利用多核机器的优势处理文件

2020-02-14  本文已影响0人  wonphen

1、酒店评论数据

library(pacman)
p_load(dplyr)
# Load the hotel reviews, keep the id / date / text / rating columns under
# shorter names, and derive a binary sentiment label from the average rating
# (rate < 3 -> 0, otherwise 1). The helper `rate` column is dropped once the
# label exists.
# `as_tibble()` replaces the deprecated `tbl_df()`.
df <- read.csv(
  "./data_set/review.csv",
  header = TRUE,
  stringsAsFactors = FALSE
) %>%
  as_tibble() %>%
  select(
    id = ReviewID,
    date = RatingDate,
    review = ReviewText,
    rate = Obs_Avg_Rating
  ) %>%
  mutate(sentiment = ifelse(rate < 3, 0, 1)) %>%
  select(-rate)

# Convert the rating date to the Date class.
df$date <- as.Date(df$date)
nrow(df)
## [1] 11005
# Split the reviews into 5 files
N_files <- 5
# Largest number of rows any single file receives (rounded up so that a row
# count that is not a multiple of N_files is still fully covered)
chunk_len <- ceiling(nrow(df) / N_files)

# One temporary file path per chunk
files <- vapply(seq_len(N_files), function(i) tempfile(), character(1))

# Assign every row to a chunk. Chunks are contiguous and their sizes differ by
# at most one row, so no recycling of the grouping vector can occur. Using a
# factor with all N_files levels guarantees exactly N_files chunks, even when
# there are fewer rows than files (the surplus chunks are simply empty).
chunk_id <- sort(rep_len(seq_len(N_files), nrow(df)))
chunks <- split(df, factor(chunk_id, levels = seq_len(N_files)))

# Write each chunk to its own "|"-separated file, header included
for (i in seq_len(N_files)) {
  write.table(
    chunks[[i]],
    files[[i]],
    quote = TRUE,
    row.names = FALSE,
    col.names = TRUE,
    sep = "|"
  )
}

str(df, strict.width = "cut")
## Classes 'tbl_df', 'tbl' and 'data.frame':    11005 obs. of  4 variables:
##  $ id       : int  116455519 116885145 117395588 126717496 132233722..
##  $ date     : Date, format: "2011-08-08" ...
##  $ review   : chr  "I gave this place 2 because the staff was very "..
##  $ sentiment: num  0 0 0 0 0 1 0 1 1 1 ...

2、构造text2vec读取文件函数

p_load(data.table,text2vec)

#' Read one "|"-separated chunk file and return its reviews.
#'
#' @param x Path to a chunk file whose header row contains at least the
#'   `id` and `review` columns.
#' @param ... Unused.
#' @return The `review` column of the file, with each element named by the
#'   corresponding `id` value.
reader <- function(x, ...) {
  # Parse the file; the first line holds the column names
  chunk <- data.table::fread(x, header = TRUE, sep = "|")
  # Keep only the review text (exact column-name match)
  reviews <- chunk[["review"]]
  # Attach the document ids as names
  names(reviews) <- chunk[["id"]]
  reviews
}

# Iterator over the chunk files; each file is parsed with `reader`
it_files <- ifiles(files, reader = reader)

# Token iterator: lower-case every document, then split it into words
it_tokens <- itoken(
  it_files,
  preprocessor = tolower,
  tokenizer = word_tokenizer,
  progressbar = FALSE
)

# Build the vocabulary from the token stream
vocab <- create_vocabulary(it_tokens)

3、使用构造的函数创建DTM

请注意,DTM的行名就是文档id:它们继承自我们在reader函数中通过names()赋给每条评论的名称。在处理文件时,这是分配文档id的一种方便方法。

# Document-term matrix built with a vectorizer over the vocabulary
dtm <- create_dtm(
  it = it_tokens,
  vectorizer = vocab_vectorizer(vocab)
)
# Show only the first 5 components of the structure
str(dtm, list.len = 5)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:734787] 9012 558 10744 2736 1950 10921 9345 983 1874 758 ...
##   ..@ p       : int [1:26935] 0 1 2 3 4 5 6 7 8 9 ...
##   ..@ Dim     : int [1:2] 11005 26934
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:11005] "116455519" "116885145" "117395588" "126717496" ...
##   .. ..$ : chr [1:26934] "contar" "reacting" "veriy" "cranny's" ...
##   ..@ x       : num [1:734787] 1 1 1 1 1 1 1 1 1 1 ...
##   .. [list output truncated]

4、使用多核读取内存中的数据

library(doParallel)

# Number of CPU cores reported for this machine (detectCores() may return NA)
N_WORKERS <- detectCores()

# Create the cluster: leave one core free, but always start at least one
# worker so that a single-core machine or an NA core count does not make
# makeCluster() fail
cl <- makeCluster(max(1L, N_WORKERS - 1L, na.rm = TRUE))

# Register the cluster as the parallel backend
registerDoParallel(cl)

it_token_par <- itoken_parallel(
  df$review,
  preprocessor = tolower,
  tokenizer = word_tokenizer,
  ids = df$id,
  # n_chunks can be tuned by the caller
  n_chunks = 8
)
vocab <- create_vocabulary(it_token_par)
v_vectorizer <- vocab_vectorizer(vocab)
dtm <- create_dtm(it_token_par, v_vectorizer)

# Shut the cluster down
stopCluster(cl)

str(dtm)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:734787] 558 1146 1481 2583 2588 2651 2741 2796 2991 3457 ...
##   ..@ p       : int [1:26935] 0 1 53 55 56 58 59 60 68 69 ...
##   ..@ Dim     : int [1:2] 11005 26934
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:11005] "116455519" "116885145" "117395588" "126717496" ...
##   .. ..$ : chr [1:26934] "reacting" "downside" "impacted" "janitorial" ...
##   ..@ x       : num [1:734787] 1 1 1 1 1 1 1 1 1 1 ...
##   ..@ factors : list()

5、使用多核读取磁盘中的数据

library(doParallel)

# Number of CPU cores reported for this machine (detectCores() may return NA)
N_WORKERS <- detectCores()

# Create the cluster: leave one core free, but always start at least one
# worker so that a single-core machine or an NA core count does not make
# makeCluster() fail
cl <- makeCluster(max(1L, N_WORKERS - 1L, na.rm = TRUE))

# Register the cluster as the parallel backend
registerDoParallel(cl)

# Parse every chunk file with `reader` so that only the review text is
# tokenized and the documents keep the ids from the `id` column.
# NOTE(review): the console output recorded further down was produced without
# `reader`. Each raw line of every file (header line included) was treated as
# a document, which is why it reports 11010 rows, ids such as
# "filec082b5c5c_1" and id-like tokens such as "193827036". With `reader` the
# matrices should have 11005 rows; re-run to refresh the recorded output.
it_files_par <- ifiles_parallel(file_paths = files, reader = reader)

it_token_par <- itoken_parallel(
  it_files_par,
  preprocessor = tolower,
  tokenizer = word_tokenizer
)
vocab <- create_vocabulary(it_token_par)

# Vocabulary-based DTM
v_vectorizer <- vocab_vectorizer(vocab)
dtm_v <- create_dtm(it_token_par, v_vectorizer)
str(dtm_v)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:788756] 559 3900 5157 2473 2738 935 1951 984 2086 6899 ...
##   ..@ p       : int [1:37943] 0 1 3 4 5 6 7 8 10 11 ...
##   ..@ Dim     : int [1:2] 11010 37942
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:11010] "filec082b5c5c_1" "filec082b5c5c_2" "filec082b5c5c_3" "filec082b5c5c_4" ...
##   .. ..$ : chr [1:37942] "reacting" "infact" "193827036" "cranny's" ...
##   ..@ x       : num [1:788756] 1 1 1 1 1 1 1 1 1 1 ...
##   ..@ factors : list()
# DTM built with the hashing vectorizer (default settings)
h_vectorizer <- hash_vectorizer()
dtm_h <- create_dtm(
  it = it_token_par,
  vectorizer = h_vectorizer
)
str(dtm_h)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:788604] 6402 1093 8357 559 630 815 894 957 1347 2872 ...
##   ..@ p       : int [1:262145] 0 0 0 0 0 0 0 0 0 0 ...
##   ..@ Dim     : int [1:2] 11010 262144
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:11010] "filec089266d1a_1" "filec089266d1a_2" "filec089266d1a_3" "filec089266d1a_4" ...
##   .. ..$ : NULL
##   ..@ x       : num [1:788604] 1 1 1 1 1 1 1 1 1 1 ...
##   ..@ factors : list()
# Term co-occurrence matrix over the vocabulary, using a skip-gram window of 5
tcm <- create_tcm(
  it = it_token_par,
  vectorizer = v_vectorizer,
  skip_grams_window = 5
)
str(tcm)
## Formal class 'dgTMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:1107832] 216 285 193 292 494 422 235 180 416 361 ...
##   ..@ j       : int [1:1107832] 216 349 406 416 494 526 538 585 603 614 ...
##   ..@ Dim     : int [1:2] 37942 37942
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:37942] "reacting" "infact" "193827036" "cranny's" ...
##   .. ..$ : chr [1:37942] "reacting" "infact" "193827036" "cranny's" ...
##   ..@ x       : num [1:1107832] 0.2 1 0.25 0.25 0.5 ...
##   ..@ factors : list()
# Shut the cluster down
parallel::stopCluster(cl)
上一篇 下一篇

猜你喜欢

热点阅读