TidyTuesday 可视化学习之柱状图与气泡图完美组图

2020-04-28 本文已影响0人热衷组培的二货潜

链接：

此图讲解

加载包

rm(list = ls())
library(tidyverse)
library(patchwork)
library(ggtext)
library(showtext)
# install.packages("nord")
library(nord)

全局主题设置

theme_set(theme_minimal(base_family = "Arial"))
theme_update(axis.text.x = element_text(size = 11, color = "grey20"),
             axis.text.y = element_text(size = 13, color = "black", face = "bold"),
             axis.ticks.x = element_line(color = "grey45"),
             axis.ticks.y = element_blank(),
             axis.ticks.length.x = unit(.4, "lines"),
             panel.grid = element_blank(),
             plot.background = element_rect(fill = "grey60", color = "grey60"))

数据清洗部分函数

涉及相关函数

str_split()
unnest()
mutate() 与 if_else、ifelse
fct_relevel

str_split()：

按照指定字符分割数据，至于这里为什么要加这么多分割字符，这就是数据的特殊性了，我们自己的数据要分割什么肯定清楚。

> str_split(c("Grandmaster Flash & The Furious Five"), " & | ft. | feat. | feat | and ")
[[1]]
[1] "Grandmaster Flash" "The Furious Five" 

> str_split(c("Kanye West ft. Rihanna & Kid Cudi"), " & | ft. | feat. | feat | and ")
[[1]]
[1] "Kanye West" "Rihanna"    "Kid Cudi"

unnest()：

> df_ranks %>% 
  mutate(
    # str_split 对数据进行分割
    artists = str_split(artist, " & | ft. | feat. | feat | and ")
  ) %>%
  select(ID, artists) %>%
  slice(5)

# A tibble: 1 x 2
     ID artists  
  <dbl> <list>   
1     5 <chr [2]>

Rsudio 中查看是这样的

Rstudio 中显示是这样

使用 unnest() 后：

df_ranks %>% 
  mutate(
    # str_split 对数据进行分割
    artists = str_split(artist, " & | ft. | feat. | feat | and ")
  ) %>%
  select(ID, artists) %>%
  slice(5) %>%
  unnest(artists)

# A tibble: 2 x 2
     ID artists         
  <dbl> <chr>           
1     5 Dr Dre          
2     5 Snoop Doggy Dogg

分组函数：mutate() 与 if_else()（同 ifelse)

当我们分组在两组时候，用 if_else 或者 ifelse 比较方便，当超过两个时候选择用 mutate() 更加方便。

mutate(
    era = case_when(
      year >= 1973 & year < 1985 ~ "Old-school DJ Era",
      year >= 1985 & year < 1997 ~ "Golden Age",
      year >= 1997 & year < 2009 ~ "Bling-Bling Era",
      year >= 2009 ~ "Internet Era",
      TRUE ~ "other"
    ),
    artists = if_else(artists == "Snoop Doggy Dogg", "Snoop Dogg", artists),
    artists = if_else(artists == "JAY-Z", "Jay-Z", artists),
    artists = if_else(artists == "Outkast", "OutKast", artists)
  )

n_distinct：

uniq ID 数目，在 tidyverse 包中常与 group_by() 连用

> x <- sample(1:10, 1e5, rep = TRUE)
> length(unique(x))
[1] 10

> n_distinct(x)
[1] 10

fct_relevel：

重新定义因子水平，更多内容可以参考《R for data science》这本书 forcats 章节

> f <- factor(c("a", "b", "c", "d"), levels = c("b", "c", "d", "a"))
> fct_relevel(f)
[1] a b c d
Levels: b c d a

将 a 优先级放置最前面
> fct_relevel(f, "a")
[1] a b c d
Levels: a b c d

将 b 和 a 优先级放置最前面
> fct_relevel(f, "b", "a")
[1] a b c d
Levels: b a c d

因子排序
> fct_relevel(f, sort)
[1] a b c d
Levels: a b c d

逆向因子
> fct_relevel(f, rev)
[1] a b c d
Levels: a d c b

更多内容直接在 R 中 ?fct_relevel() 查看就好，

本例中是按照 sum_points 像升序，然后按照 best 进行降序，最后再取 unique 的 artists 类型作为因子：
fct_relevel(factor(artists, 
                   levels = unique( artists[order(sum_points, -best)]) )
            )

# 数据链接：https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-04-14/rankings.csv
df_ranks <- readr::read_csv('2020-04-14rankings.csv')
df_ranks_era <-
  df_ranks %>% 
  mutate(
    # str_split 对数据进行分割
    artists = str_split(artist, " & | ft. | feat. | feat | and ")
  ) %>% 
  unnest(artists) %>% 
  mutate(
    era = case_when(
      year >= 1973 & year < 1985 ~ "Old-school DJ Era",
      year >= 1985 & year < 1997 ~ "Golden Age",
      year >= 1997 & year < 2009 ~ "Bling-Bling Era",
      year >= 2009 ~ "Internet Era",
      TRUE ~ "other"
    ),
    artists = if_else(artists == "Snoop Doggy Dogg", "Snoop Dogg", artists),
    artists = if_else(artists == "JAY-Z", "Jay-Z", artists),
    artists = if_else(artists == "Outkast", "OutKast", artists)
  ) %>% 
  group_by(artists) %>% 
  mutate(
    n_songs = n_distinct(ID),
    sum_points = sum(points),
    ID = as.numeric(ID),
    best = min(ID)
  ) %>% 
  filter(
    best <= 75, 
    !artists %in% c("Dido", "Rihanna")
  ) %>% 
  ungroup() %>% 
  arrange(ID) %>% 
  mutate(
    artists = fct_relevel(factor(artists, levels = unique(artists[order(sum_points, -best)]))),
    ID = as.numeric(as.factor(ID))
  ) %>% 
  arrange(artists)

绘图部分函数

geom_col()
geom_curve()
annotate("text")
coord_flip()
nord::scale_fill_nord()
scale_x_discrete() 与 scale_y_continuous()
theme()
scale_size()
patchwork::plot_layout()

nord

一个配色包，本文用的到配色主题

https://cran.r-project.org/web/packages/nord/readme/README.html

修改 xy 轴 label 位置

这里由于之前进行了 coord_flip() ，将 xy 轴互换了。

scale_x_discrete(position = "top") + # label 至于最上，这里相当于将互换后 y 轴放置到最右边
  scale_y_continuous(expand = c(.02, .02),
                     limits = c(-200, 0),
                     breaks = seq(-175, 0, by = 25),
                     labels = rev(c(seq(0, 150, by = 25), "175 points")),
                     position = "right" # 转置后，x 放置于最上面
                     )

theme

theme 中有一个参数 plot.margin 是用来控制与边的距离，之前提到过，

plot.margin：控制上下左右边距（上，左，下，右），拼图要善用此函数

theme(axis.text.y.right = element_text(hjust = .5), # 表示 y 轴 label 剧中对齐
        plot.margin = margin(5, 0, 5, 5)) 

axis.ticks.x = element_blank() 去除 x 轴刻度尺
axis.ticks.y = element_blank() 去除 y 轴刻度尺

scale_size

重新标准化 ggplot2 中 size 映射的大小

scale_size(range = c(2, 5.5), guide = F)

patchwork

拼图神器，这里用函数 plot_layout 函数中的参数 widths 来控制两个图的宽度比。

bars + dots + plot_layout(widths = c(1, .35))

左侧的 barplot

cols <- c("grey60", "#ffc205", "#cecece", "#4e8863")



bars <- 
  df_ranks_era %>% 
  ggplot(aes(artists, -points)) +
  geom_col(aes(fill = ID),
           color = "white",
           size = .5,
           width = 1.02) +
  geom_curve(aes(x = 51.2, xend = 47, 
                 y = -148, yend = -166),
             curvature = -.4) +
  annotate("text", x = 47, y = -185, 
           label = "Each rectangle represents\none song included in the\nBBC ranking, its length\n the total points and the\ncolor indicates the rank",
           family = "Arial",
           size = 3.8,
           lineheight = .9) +
  annotate("text", x = 21.5, y = -120, 
           label = 'The Top Artists featured in the BBC´s\n"Greatest Hip-Hop Songs of All Time"',
           family = "Arial",
           fontface = "bold",
           size = 12,
           lineheight = .9) +
  annotate("text", x = 17, y = -120,
           label = 'In Autumn 2019, 108 hip-hop and music experts ranked their 5 favorites out of\n311 nominated songs in an online survey by the BBC. The graphic shows points\nscored in total and per song for the top ranked artists and broken down by era.',
           family = "Arial",
           fontface = "bold",
           color = "grey30",
           size = 5.5,
           lineheight = .9) +
  coord_flip() +
  scale_x_discrete(position = "top") +
  scale_y_continuous(expand = c(.02, .02),
                     limits = c(-200, 0),
                     breaks = seq(-175, 0, by = 25),
                     labels = rev(c(seq(0, 150, by = 25), "175 points")),
                     position = "right") +
  nord::scale_fill_nord(palette = "halifax_harbor", 
                        discrete = F, 
                        reverse = F, 
                        guide = F) +
  theme(axis.text.y.right = element_text(hjust = .5),
        plot.margin = margin(5, 0, 5, 5)) +
  labs(x = NULL, y = NULL)

右侧的气泡图

dots <-
  df_ranks_era %>% 
  group_by(artists, era) %>% 
  summarize(
    n_songs = n_distinct(ID),
    best = min(ID)
  ) %>% 
  ungroup() %>% 
  mutate(
    era = factor(era, levels = c("Old-school DJ Era", "Golden Age", "Bling-Bling Era", "Internet Era")),
    era_num = as.numeric(era)
  ) %>%
  ggplot(aes(artists, era_num, group = artists)) +
  geom_point(aes(artists, 1), color = "grey75", size = 2) +
  geom_point(aes(artists, 2), color = "grey75", size = 2) +
  geom_point(aes(artists, 3), color = "grey75", size = 2) +
  geom_point(aes(artists, 4), color = "grey75", size = 2) +
  geom_segment(aes(x = artists, xend = artists, 
                   y = 1, yend = 4), 
               color = "grey75",
               size = .3) +
  geom_line(color = "black",
            size = .9) +
  geom_point(aes(fill = best, size = n_songs), 
             shape = 21, 
             color = "black", 
             stroke = 1.2) +
  geom_curve(aes(x = 47, xend = 51, 
                 y = 6.1, yend = 4.3),
             curvature = .4) +
  annotate("text", x = 45.1, y = 6.1, 
           label = "The dot size indicates\nthe number of songs,\nthe dot color the best\nrank in each era",
           family = "Chivo",
           size = 3.8, 
           lineheight = .9) +
  coord_flip() +
  scale_y_continuous(limits = c(.5, 7.3),
                     breaks = 1:4,
                     labels = c("Old-School Era ('73-'84)", 
                                "Golden Age ('85-'96)", 
                                "Bling-Bling Era ('97-'09)", 
                                "Internet Era ('09-'19)"), 
                     position = "right") +
  scale_size(range = c(2, 5.5), guide = F) +
  nord::scale_fill_nord(palette = "halifax_harbor", 
                        discrete = F, 
                        reverse = F, 
                        guide = F, 
                        limits = c(min(df_ranks_era$ID), max(df_ranks_era$ID))) +
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_text(size = 11, face = "bold", 
                                   hjust = .1, vjust = 0, angle = 20),
        axis.text.y = element_blank(),
        plot.margin = margin(5, 5, 5, 0),
        plot.caption = element_text(face = "bold", color = "grey30", 
                                    size = 10, margin = margin(t = 15))) +
  labs(x = NULL, y = NULL,
       caption = "Visualization by Cédric Scherer  •  Data by BBC Music")

patchwork 拼接

bars + dots + plot_layout(widths = c(1, .35))

copy 跑完，实在太强了，出神入化。