R 新手练习项目:爬取stackoverflow问题和答案

2020-12-20  本文已影响0人  Summer_Dargon

R 新手练习项目:爬取 stackoverflow 问题和答案

爬取 Stack Overflow 的 Frequent Questions 页面上所有问题及其回答的详细内容。

load library

library(rvest)

library(stringr)

function to get question links

# Collect the question URLs listed on one Stack Overflow question-list page.
#
# Args:
#   url: URL of the listing page to read.
#
# Returns:
#   Character vector of absolute question URLs, or character(0) when the
#   page contains no matching links.
getQuestionLinks <- function(url) {
  page <- read_html(url)

  hrefs <- page %>%
    html_nodes(".summary .question-hyperlink") %>%
    html_attr("href")

  # Anchors without an href attribute come back as NA; pasting them would
  # produce "https://stackoverflow.comNA".
  hrefs <- hrefs[!is.na(hrefs)]

  # paste0() treats a zero-length vector as "", so without this guard an
  # empty page would yield the bare site root as a bogus question link.
  if (length(hrefs) == 0) {
    return(character(0))
  }

  paste0("https://stackoverflow.com", hrefs)
}

function to extract question & answer data as a list

# Scrape one Stack Overflow question page.
#
# Args:
#   url: URL of the question page.
#
# Returns:
#   A list with two data frames:
#     question - a single row with columns id, question_vote_count,
#                question_text, question_user, question_time, tags, url.
#     answers  - one row per node matched by '#answers .gsy', with columns
#                question_id, answer_vote_count, accepted_answer,
#                answer_text, answer_users, answer_times.
extractQuestionAndAnswers <- function(url) {
  # Use the second value when several were found, otherwise the first
  # (NA for an empty input).
  # NOTE(review): presumably the first entry is the editor and the second
  # the original author - confirm against the page markup.
  second_or_first <- function(x) {
    if (length(x) > 1) x[2] else x[1]
  }

  page <- read_html(url)

  ### question ###

  # question id
  id <- page %>%
    html_node(".question") %>%
    html_attr("data-questionid")

  # question vote count
  question_vote_count <- page %>%
    html_node("#question .ai-center") %>%
    html_text()

  # question text; collapsed so that zero or several matches still give
  # exactly one value (and therefore exactly one question row)
  question_text <- page %>%
    html_nodes("#question .js-post-body") %>%
    html_text() %>%
    trimws() %>%
    paste(collapse = "\n")

  # user cards used for the question's user name and time
  quser_detail <- page %>% html_nodes(".pt4")

  question_user <- quser_detail %>%
    html_nodes(".user-details a") %>%
    html_text() %>%
    second_or_first()

  question_time <- quser_detail %>%
    html_nodes(".relativetime") %>%
    html_attr("title") %>%
    str_remove("Z") %>%
    second_or_first()

  # question tags; collapsed for the same one-row reason as question_text
  tags <- page %>%
    html_nodes("#question .ps-relative") %>%
    html_text() %>%
    trimws() %>%
    paste(collapse = " ")

  ### answers ###

  auser_detail <- page %>% html_nodes("#answers .gsy")
  n_answers <- length(auser_detail)

  # answer user name, one per matched node
  answer_users <- vapply(seq_len(n_answers), function(i) {
    auser_detail[[i]] %>%
      html_nodes(".user-details a") %>%
      html_text() %>%
      str_remove_all("[\r\n' ']") %>%
      str_remove_all(".*%") %>%
      second_or_first()
  }, character(1))

  # answer time, one per matched node
  answer_times <- vapply(seq_len(n_answers), function(i) {
    auser_detail[[i]] %>%
      html_nodes(".relativetime") %>%
      html_attr("title") %>%
      str_remove("Z") %>%
      second_or_first()
  }, character(1))

  # map every answer to the question id
  question_id <- rep(id, n_answers)

  # answer vote counts
  answer_vote_count <- page %>%
    html_nodes("#answers .fs-title") %>%
    html_text()

  # accepted answer? TRUE / FALSE
  accepted_answer <- page %>%
    html_nodes(".answer") %>%
    html_attr("itemprop") %>%
    str_detect("acceptedAnswer")

  # answer text
  answer_text <- page %>%
    html_nodes("#answers .js-post-body") %>%
    html_text() %>%
    str_remove_all("[\r\n]")

  # The answer columns come from independent selectors. data.frame() would
  # silently recycle shorter ones when the lengths divide evenly, so make a
  # mismatch visible.
  field_lengths <- c(
    length(answer_vote_count),
    length(accepted_answer),
    length(answer_text)
  )
  if (any(field_lengths != n_answers)) {
    warning(
      "Answer fields have differing lengths for ", url,
      "; rows may be misaligned",
      call. = FALSE
    )
  }

  question <- data.frame(
    id = id,
    question_vote_count = question_vote_count,
    question_text = question_text,
    question_user = question_user,
    question_time = question_time,
    tags = tags,
    url = url
  )

  answer <- data.frame(
    question_id = question_id,
    answer_vote_count = answer_vote_count,
    accepted_answer = accepted_answer,
    answer_text = answer_text,
    answer_users = answer_users,
    answer_times = answer_times
  )

  list(question = question, answers = answer)
}

get the question links from the first 2 pages

# Gather the unique question links from the first two listing pages,
# keeping them in the order they are first seen.
links <- character()

for (page in 1:2) {
  url <- paste0("https://stackoverflow.com/questions?tab=Frequent&page=", page)

  # short pause before each request
  Sys.sleep(0.2)

  thisLinks <- getQuestionLinks(url)

  # unique() keeps the first occurrence of each link, which matches
  # appending only the links that have not been seen yet
  links <- unique(c(links, thisLinks))
}

print(links)

define empty data frames for storing the questions and the answers

# Empty accumulators for the scraped questions and answers. Column names
# and types mirror the data frames returned by extractQuestionAndAnswers(),
# so a run that scrapes nothing still writes the same CSV header as a run
# that does.
questionDF <- data.frame(
  id = character(),
  question_vote_count = character(),
  question_text = character(),
  question_user = character(),
  question_time = character(),
  tags = character(),
  url = character(),
  stringsAsFactors = FALSE
)

answerDF <- data.frame(
  question_id = character(),
  answer_vote_count = character(),
  accepted_answer = logical(),
  answer_text = character(),
  answer_users = character(),
  answer_times = character(),
  stringsAsFactors = FALSE
)

for each question link, fetch the question and its answers

# Scrape every collected question page and add the results to questionDF
# and answerDF. `count` ends up as the number of pages scraped successfully.
count <- 0

# Collect the per-page data frames first and bind them once at the end;
# calling rbind() inside the loop copies the whole accumulated frame on
# every iteration.
question_rows <- vector("list", length(links))
answer_rows <- vector("list", length(links))

for (i in seq_along(links)) {
  url <- links[[i]]

  # short pause before each request
  Sys.sleep(0.2)

  print(url)

  # A single page that fails to download or parse should not throw away
  # everything scraped so far: report it and move on.
  results <- tryCatch(
    extractQuestionAndAnswers(url),
    error = function(e) {
      warning("Skipping ", url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(results)) {
    next
  }

  question_rows[[i]] <- results$question
  answer_rows[[i]] <- results$answers
  count <- count + 1
}

# Slots of skipped pages are still NULL; drop them before binding.
question_rows <- Filter(Negate(is.null), question_rows)
answer_rows <- Filter(Negate(is.null), answer_rows)

questionDF <- do.call(rbind, c(list(questionDF), question_rows))
answerDF <- do.call(rbind, c(list(answerDF), answer_rows))

view & write the result data frame

# Save the results before opening any viewer, so a viewer failure cannot
# lose the scraped data.
write.csv(questionDF, "questions.csv")

write.csv(answerDF, "answers.csv")

# The data viewer needs an interactive session; skip it when the script is
# run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(answerDF)

  View(questionDF)
}
stockoverflow1.png
上一篇 下一篇

猜你喜欢

热点阅读