数据预处理:dplyr package sample

2016-08-22  本文已影响63人  MC1229

library(dplyr)

# Widen the console so wide data-frame printouts are not wrapped.
options(width = 105)

# Load the data set from an RDS file (path is relative to the working
# directory).
chicago <- readRDS("chicago.rds")

# Quick look at the data: dimensions, structure and column names.
dim(chicago)
str(chicago)
names(chicago)

# select(): keep every column from `city` through `dptp`.
# The column range is an argument of select(), so it goes inside its
# parentheses.
head(select(chicago, city:dptp))

# select(): keep every column except those from `city` through `dptp`.
head(select(chicago, -(city:dptp)))

# Base R equivalent of the negative selection, without dplyr:
# i <- match("city", names(chicago))
# j <- match("dptp", names(chicago))
# head(chicago[, -(i:j)])

# filter(): keep only the rows that satisfy a condition.

# Rows whose pm25tmean2 is above 30.
chic.f <- chicago %>%
  filter(pm25tmean2 > 30)

# Rows whose pm25tmean2 is above 30 and whose tmpd is above 80;
# this replaces the previous value of chic.f.
chic.f <- chicago %>%
  filter(pm25tmean2 > 30 & tmpd > 80)

head(chic.f)

# arrange(): reorder the rows by a column.

# Sort by date from smallest to largest.
chicago <- chicago %>%
  arrange(date)

# Sort by date from largest to smallest; this ordering is the one that
# remains in `chicago` afterwards.
chicago <- chicago %>%
  arrange(desc(date))

# Inspect both ends of the sorted data.
head(chicago)
tail(chicago)

# rename(): give columns shorter names, written as new_name = old_name.
# The source column is `pm25tmean2` (the name the filter() calls on this
# data use); `pm25mean2` does not match it and would make rename() fail.
chicago <- rename(chicago, pm25 = pm25tmean2, dewpoint = dptp)

# mutate(): create a new variable.
# pm25detrend is pm25 minus its overall mean; missing values are ignored
# when the mean is computed.
chicago <- mutate(chicago, pm25detrend = pm25 - mean(pm25, na.rm = TRUE))

# group_by(): check whether the mean / maximum / median of the air
# pollution measures differ between hot and cold conditions.

# tempcat labels each row "hot" when tmpd > 80 and "cold" otherwise.
# 1 * (tmpd > 80) turns the logical into 0/1, so the labels map to the
# levels in that order. The column name must match the one passed to
# group_by() below.
chicago <- mutate(
  chicago,
  tempcat = factor(1 * (tmpd > 80), labels = c("cold", "hot"))
)

hotcold <- group_by(chicago, tempcat)

# Without na.rm, the pm25 mean is NA for any group that contains missing
# values.
summarize(
  hotcold,
  pm25 = mean(pm25),
  o3 = max(o3tmean2),
  no2 = median(no2tmean2)
)

# Same summary, ignoring the missing values in pm25.
summarize(
  hotcold,
  pm25 = mean(pm25, na.rm = TRUE),
  o3 = max(o3tmean2),
  no2 = median(no2tmean2)
)

# summarize(): check whether the measures differ from year to year.

# Derive the calendar year from `date`. The `year` field of a POSIXlt value
# counts years since 1900, hence the + 1900.
chicago <- chicago %>%
  mutate(year = as.POSIXlt(date)$year + 1900)

# Group by the new column and compute one summary row per year.
years <- chicago %>%
  group_by(year)

years %>%
  summarize(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2),
    no2 = median(no2tmean2)
  )

# Pipeline: chain the steps with %>% instead of storing intermediates.
# Adds a month column (the POSIXlt `mon` field is 0-based, hence the + 1),
# groups by it, and summarizes each month.
chicago %>%
  mutate(month = as.POSIXlt(date)$mon + 1) %>%
  group_by(month) %>%
  summarize(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2),
    no2 = median(no2tmean2)
  )

上一篇 下一篇

猜你喜欢

热点阅读