小白生信入门

Data Carpentry Workshop - Day 2

2018-06-23  本文已影响2人  猪猪头看世界
IMG_7436.JPG

Main Contents

1. Install and load tidyverse

# Install the tidyverse only when it is not already available, so that
# re-running these notes does not download the package again each time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
# Attach the tidyverse; the dplyr, tidyr and readr verbs used below come from it.
library("tidyverse")

2. What are dplyr and tidyr packages?

Characteristic of tibble:

3. Functions of dplyr.

3.1 Selecting columns and filtering rows
# select() keeps the named columns.
select(surveys, plot_id, species_id, weight)
# filter() keeps the rows for which the condition is TRUE.
filter(surveys, year == 1995)
3.2 Pipes
Three ways to do select and filter at the same time:
(1) using intermediate steps
# Step 1: keep only the rows whose weight is below 5.
surveys2 <- filter(surveys, weight < 5)
# Step 2: from that intermediate table, keep three columns.
surveys_sml1 <- select(surveys2, species_id, sex, weight)
# Inspect the result in the data viewer (the object is surveys_sml1).
View(surveys_sml1)
(2) nest functions
# One statement: the inner filter() is evaluated first and its result is
# passed as the first argument of select().
surveys_sml2 <- select(
  filter(surveys, weight < 5),
  species_id, sex, weight
)
surveys_sml2
(3) pipes
# Pipe version: the output of each step becomes the first argument of the next.
surveys_sml3 <- surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)
surveys_sml3
Challenge
# Rows recorded before 1995, reduced to the year, sex and weight columns.
surveys_select <- surveys %>%
  filter(year < 1995) %>%
  select(year, sex, weight)
surveys_select
3.3 Split-apply-combine data analysis and the summarize() function
split-apply-combine paradigm: split the data into groups, apply some analysis to each group, and then combine the results.
(1) group_by() & summarize()
# Group by a single column: mean weight per sex.
# na.rm = TRUE makes mean() ignore missing weights.
surveys %>%
  group_by(sex) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))
# Group by multiple columns: mean weight per sex and species_id.
# Missing weights are removed up front, so mean() needs no na.rm here.
surveys_test <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight))
# Show the first 15 rows in the console, then open the table in the viewer.
print(surveys_test, n = 15)
View(surveys_test)
# Several summaries at once: mean and minimum weight per sex and species_id.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(
    mean_weight = mean(weight),
    min_weight = min(weight)
  ) %>%
  print(n = 10)
(2) arrange
# Order the summary rows by min_weight, smallest first.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(
    mean_weight = mean(weight),
    min_weight = min(weight)
  ) %>%
  arrange(min_weight) %>%
  print(n = 10)
# Order the summary rows by mean_weight, largest first (desc()).
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(
    mean_weight = mean(weight),
    min_weight = min(weight)
  ) %>%
  arrange(desc(mean_weight)) %>%
  print(n = 10)
(3) counting
 # count a single factor
surveys %>%
  count(sex)
# Same count, with the most frequent value listed first.
surveys %>%
  count(sex, sort = TRUE)
# Count each combination of two columns.
surveys %>%
  count(sex, species)
# Count the combinations, then order by species and, within a species,
# by descending count.
surveys %>%
  count(sex, species) %>%
  arrange(species, desc(n))
(4) challenge
# Number of rows recorded for each plot_id.
surveys %>%
  count(plot_id)
# Per species_id: mean, minimum and maximum hindfoot_length, plus the
# number of rows (n) that contributed after dropping missing values.
surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  group_by(species_id) %>%
  summarize(
    mean_hindfoot_length = mean(hindfoot_length),
    min_hindfoot_length = min(hindfoot_length),
    max_hindfoot_length = max(hindfoot_length),
    n = n()
  )
# For every year, keep the row(s) whose weight equals that year's maximum,
# then show year, genus, species_id and weight ordered by year.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(year) %>%
  filter(weight == max(weight)) %>%
  select(year, genus, species_id, weight) %>%
  arrange(year)

4. tidyr functions: reshaping with spread and gather

4.1 spreading
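spread() takes three principal arguments: (1) the data, (2) the key column, whose values become the new column names, and (3) the value column, whose values fill the new columns.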
spread code
# Step 1: prepare the data -- mean weight of each genus in each plot
# (long format: one row per genus/plot_id combination).
surveys_gw <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(genus, plot_id) %>%
  summarize(mean_weight = mean(weight))
str(surveys_gw)

# Step 2: spread -- every distinct genus becomes its own column holding
# the corresponding mean_weight.
surveys_spread <- surveys_gw %>%
  spread(key = genus, value = mean_weight)
str(surveys_spread)

# Step 3: fill -- spread again with fill = 0 so that missing genus/plot
# combinations get 0 instead of NA. This must start from the long table
# surveys_gw: surveys_spread no longer has genus or mean_weight columns.
surveys_gw %>%
  spread(genus, mean_weight, fill = 0) %>%
  head()
spread
4.2 Gathering
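gather() takes four principal arguments: (1) the data, (2) the name of the new key column to create, (3) the name of the new value column to create, and (4) the columns to gather (or, prefixed with `-`, the columns to leave out).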
gather code
# Back to long format: every column except plot_id is gathered into a
# genus (key) / mean_weight (value) pair.
surveys_gather <- surveys_spread %>%
  gather(key = genus, value = mean_weight, -plot_id)
str(surveys_gather)
gather
challenge

5. Export Data

(1)filter
# Keep only the rows where weight, hindfoot_length and sex are all present.
surveys_complete <- surveys %>%
  filter(
    !is.na(weight),           # remove missing weight
    !is.na(hindfoot_length),  # remove missing hindfoot_length
    !is.na(sex)               # remove missing sex
  )
(2) extract
# Count the rows per species_id and keep the species seen at least 50 times.
species_counts <- surveys_complete %>%
  count(species_id) %>%
  filter(n >= 50)
(3) Only keep the most common species
# Restrict surveys_complete to the species listed in species_counts,
# then report the resulting number of rows and columns.
surveys_complete <- surveys_complete %>%
  filter(species_id %in% species_counts$species_id)
dim(surveys_complete)
(4) save
# Destination of the cleaned data set; stored once so that the write and
# the read-back below cannot drift apart.
output_file <- "C:/Users/home/Desktop/Rcourse/DataCarpentry33/Outputs/surveys_complete-data.csv"
# The path is passed positionally: newer readr names this argument `file`
# and deprecates `path =`, and the positional form works with both.
write_csv(surveys_complete, output_file)
# Read the file back to confirm that the export succeeded.
read_csv(output_file)

下期预告

Data visualization with ggplot2

上一篇下一篇

猜你喜欢

热点阅读