[Python/R语言] 用R和python解决数据分析120题

2020-04-17  本文已影响0人  半为花间酒

转载请注明:陈熹 chenx6542@foxmail.com (简书号:半为花间酒)
若公众号内转载请联系公众号:早起Python
题源:
早起python 《Pandas进阶修炼120题》

数据:
https://pan.baidu.com/s/1DBWTFKrAeX3s9Nhmb7YvnA
提取码:wr6e

数据分析120题系列:

为什么出这个专题:

R语言和pandas都是数据处理的重要工具
而二者的高下争论时有存在
我相信对于数据而言没有绝对的孰优孰劣
需要做的应该是在必要时权衡最合适的办法

感谢 公众号早起python 提供数据分析120题
这些题目是一个契机
帮助我比较了两种语言处理不同问题的共性
当然也发现了各自的灵活和缺陷

它们覆盖多数数据分析初期可能遇到的问题
无论是对R语言还是对python技能的提升
相信都有很大帮助

(陈熹 2020年4月)

import pandas as pd
import numpy as np
print(np.__version__)
# 1.16.5
print(pd.__version__)
# 0.25.1
packageVersion("tidyverse")
# [1] ‘1.3.0’
packageVersion("dplyr")
# [1] ‘0.8.99.9002’

tem = np.random.randint(1,100,20)
df1 = pd.DataFrame(tem)
df1 <- sapply(20,function(n) {
  replicate(n,sample(1:100,1))
}) %>% 
  as.data.frame(.) %>% 
  dplyr::rename(`0` = V1)
pandas / R
tem = np.arange(0,100,5)
df2 = pd.DataFrame(tem)
df2 <- as.data.frame(seq(0,99,5)) %>% 
  dplyr::rename(`0` = "seq(0, 99, 5)")
pandas / R
tem = np.random.normal(0, 1, 20)
df3 = pd.DataFrame(tem)
df3 <- as.data.frame(rnorm(20,0,1)) %>% 
  dplyr::rename(`0` = "rnorm(20, 0, 1)")
pandas / R
df = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
df <- rbind(df1,df2,df3)
pandas / R
df = pd.concat([df1,df2,df3],axis=1,ignore_index=True)
df <- cbind(df1,df2,df3)
names(df) <- c(0,1,2)
pandas / R
np.percentile(df, q=[0, 25, 50, 75, 100])
summary(unlist(df))

因为df1和df3都是随机的,所以结果不同

pandas / R
df.columns = ['col1','col2','col3']
df <- df %>% 
  dplyr::rename(col1 = 1,
                col2 = 2,
                col3 = 3)
# 或者用类似pandas的方法
names(df) <- c('col1','col2','col3')
pandas / R
df['col1'][~df['col1'].isin(df['col2'])]
df[!(df$col1 %in% df$col2),1]
pandas / R
pandas / R
temp = df['col1'].append(df['col2'])
temp.value_counts()[:3]
count(unlist(c(df$col1,df$col2))) %>% 
  arrange(desc(freq)) %>% 
  filter(row_number() <= 3) 

因为col1是随机产生的,所以结果不同

pandas / R
np.argwhere(df['col1'] % 5==0)
which(df['col1'] %% 5==0)
pandas / R
df['col1'].diff().tolist()
df %>% 
  summarise(col1 - lag(col1)) %>% 
  na.omit(.) # 不去NA也可以,pandas没有去除
pandas / R
df.iloc[:, ::-1]
df %>% 
  select(col3,col2,everything())
pandas / R
df['col1'].take([1,10,15])
# 等价于
df.iloc[[1,10,15],0]
df[c(1,10,15) + 1,1]
pandas / R
res = np.diff(np.sign(np.diff(df['col1'])))
np.where(res== -2)[0] + 1
# array([ 2,  4,  7,  9, 12, 15], dtype=int64)
res1 <- which((df$col1 - lag(df$col1) > 0))
res2 <- which((df$col1 - lead(df$col1) > 0))

intersect(res1,res2)
# [1]  3  5  7 12 14 17 19

# 另一种方法,类似pandas的用符号判断

res <- sign(df$col1 - lag(df$col1))

which(res - lag(res) == -2) - 1
# # [1]  3  5  7 12 14 17 19

df[['col1','col2','col3']].mean(axis=1)
rowMeans(df)
pandas / R
np.convolve(df['col2'], np.ones(3)/3, mode='valid')
library(RcppRoll)

df %>% 
  summarise(avg_3 = roll_mean(col2, n=3))
pandas / R
df.sort_values("col3",inplace=True)
df <- df %>% 
  arrange(col3)
pandas / R
df.col1[df['col1'] > 50] = '高'
df[df$col1 > 50,1] <- '高'
pandas / R

Euclidean distance 欧几里得(欧氏)距离: 两变量差值平方和的平方根

np.linalg.norm(df['col1']-df['col2'])
# 194.29873905921264
# 可以利用概念计算
res <- (df$col1 - df$col2) ^ 2
sqrt(sum(res))
# [1] 197.0102

# 也可以利用dist函数,但需要形成两个不同的观测
dist(rbind(df$col1,df$col2))
#          1
# 2 197.0102

df1 = pd.read_csv(r'C:\Users\chenx\Documents\Data Analysis\数据1.csv',encoding='gbk', usecols=['positionName', 'salary'],nrows = 10)
res <- read.csv('数据1.csv',encoding = 'GBK',nrows = 3)
classes <- sapply(res, class)
classes[-match(c('positionName','salary'),names(classes))] <- 
  rep('NULL', length(classes) - 2)

df <- read.csv('数据1.csv',encoding = 'GBK',nrows = 10,
               colClasses = classes)
pandas / R
df2 = pd.read_csv(r'C:\Users\chenx\Documents\Data Analysis\数据2.csv',
                  converters={'薪资水平': lambda x: '高' if float(x) > 10000 else '低'} )
library(readr)

df2 <- read_csv('数据2.csv') %>% 
  mutate('学历要求',
         '薪资水平' = ifelse(
           薪资水平 > 10000,'高','低'))
pandas / R
df2.iloc[::20, :][['薪资水平']]
df2[seq(1,dim(df2)[1],20),]
pandas / R
df = pd.DataFrame(np.random.random(10)**10, columns=['data'])
df.round(3)
df <- tibble(data = runif(10)^10)
round(df,3)
pandas / R
df.style.format({'data': '{0:.2%}'.format})
tibble(data = str_glue('{round(df$data * 100,2)}%'))
pandas / R
df['data'].argsort()[len(df)-3]
df %>% 
  mutate(nrow = rownames(.)) %>% 
  arrange(desc(data)) %>% 
  filter(row_number() == 3) %>% 
  select(nrow)

df.iloc[::-1, :]
df %>% 
  arrange(desc(rownames(.)))
pandas / R
df1= pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})

df2= pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})

pd.merge(df1, df2, on=['key1', 'key2'])
df1 <- data.frame(
  "key1" = c("K0","K0","K1","K2"),
  "key2" = c("K0","K1","K0","K1"),
  "A" = paste0('A',0:3),
  "B" = paste0('B',0:3)
)

df2 <- data.frame(
  "key1" = c("K0","K1","K1","K2"),
  "key2" = paste0('K',rep(0,4)),
  "C" = paste0('C',0:3),
  "D" = paste0('D',0:3)
)

full_join(df1,df2,by = c('key1','key2')) %>% 
  na.omit(.)
pandas / R
pd.merge(df1, df2, how='left', on=['key1', 'key2'])
left_join(df1,df2,by = c('key1','key2'))
pandas / R
df = pd.read_csv(r'C:\Users\chenx\Documents\Data Analysis\数据1.csv',encoding='gbk')
pd.set_option("display.max.columns", None)
df <- read_csv('数据1.csv', locale = locale(encoding = "GBK")) %>% 
  print(width = Inf)

np.where(df.secondType == df.thirdType)
df %>% 
  mutate(nrow = rownames(.)) %>% 
  filter(secondType == thirdType) %>% 
  select(nrow) %>% 
  unlist()

(pandas的行位置从0开始,R则从1开始)

pandas / R

返回数据的位置

np.argwhere(df['salary'] > df['salary'].mean())[2]
# array([5], dtype=int64)
df %>% 
  mutate(nrow = rownames(.)) %>% 
  filter(salary > mean(salary)) %>% 
  select(nrow) %>% 
  filter(row_number() == 3)
# # A tibble: 1 x 1
#    nrow 
#    <chr>
#    1 6 

df[['salary']].apply(np.sqrt)
df %>% 
  summarise(salary_sqrt = sqrt(salary))
pandas / R
df['split'] = df['linestaion'].str.split('_')
df <- df %>% 
  mutate(split = str_split(linestaion,'_'))
pandas / R
df.shape[1]
# 54
length(df)
# [1] 54

df[df['industryField'].str.startswith('数据')]
df[grep("^数据", df$industryField),]

以salary score 和 positionID制作数据透视

pd.pivot_table(df,values=["salary","score"],index="positionId")
df <- df %>% 
  group_by(positionId) %>% 
  dplyr::summarise(salary = mean(salary),
            score = mean(score)) %>% 
  as.data.frame(.) 
rownames(df) <- NULL
tibble::column_to_rownames(df,var='positionId')
pandas / R
df[["salary","score"]].agg([np.sum,np.mean,np.min])
res <- df %>% 
  select(salary,score) %>% 
  pivot_longer(c(salary,score),names_to = 'type',values_to = 'value') %>% 
  group_by(type) %>% 
  summarise(sum = sum(value),mean = mean(value),min = min(value))

rownames(res) <- NULL

res %>% 
  column_to_rownames('type') %>% 
  t(.)
pandas / R
df.agg({"salary":np.sum,"score":np.mean})
df %>% 
  summarise(salary_sum = sum(salary),
            score_mean = mean(score))
pandas / R
df[['district','salary']].groupby(by='district').mean().sort_values(
    'salary',ascending=False).head(1)
df %>% 
  group_by(district) %>% 
  summarise(avg = mean(salary)) %>% 
  arrange(desc(avg)) %>% 
  filter(row_number() == 1)
pandas / R

以上就是R语言和python共同挑战数据分析120题的全部内容

上一篇下一篇

猜你喜欢

热点阅读