Week1: swirl教程 2: Grouping and Chaining with dplyr

2021-10-15  本文已影响0人  Chamberzero

group_by()

# Group the cran table by the package column; printing the result shows the
# "Groups:" header in the tibble output.
by_package <- group_by(cran, package)
by_package
# A tibble: 225,468 × 11
# Groups:   package [6,023]
       X date       time        size r_version r_arch r_os      package      version country ip_id
   <int> <chr>      <chr>      <int> <chr>     <chr>  <chr>     <chr>        <chr>   <chr>   <int>
 1     1 2014-07-08 00:54:41   80589 3.1.0     x86_64 mingw32   htmltools    0.2.4   US          1
 2     2 2014-07-08 00:59:53  321767 3.1.0     x86_64 mingw32   tseries      0.10-32 US          2
 3     3 2014-07-08 00:47:13  748063 3.1.0     x86_64 linux-gnu party        1.0-15  US          3
 4     4 2014-07-08 00:48:05  606104 3.1.0     x86_64 linux-gnu Hmisc        3.14-4  US          3
 5     5 2014-07-08 00:46:50   79825 3.0.2     x86_64 linux-gnu digest       0.6.4   CA          4
 6     6 2014-07-08 00:48:04   77681 3.1.0     x86_64 linux-gnu randomForest 4.6-7   US          3
 7     7 2014-07-08 00:48:35  393754 3.1.0     x86_64 linux-gnu plyr         1.8.1   US          3
 8     8 2014-07-08 00:47:30   28216 3.0.2     x86_64 linux-gnu whisker      0.3-2   US          5
 9     9 2014-07-08 00:54:58    5928 NA        NA     NA        Rcpp         0.10.4  CN          6
10    10 2014-07-08 00:15:35 2206029 3.0.2     x86_64 linux-gnu hflights     0.1     US          7
# … with 225,458 more rows

# On a grouped table, summarize() returns one row per group: here the mean of
# size for each package.
summarize(by_package, mean(size))
# A tibble: 6,023 × 2
   package     `mean(size)`
   <chr>              <dbl>
 1 A3                62195.
 2 abc             4826665 
 3 abcdeFBA         455980.
 4 ABCExtremes       22904.
 5 ABCoptim          17807.
 6 ABCp2             30473.
 7 abctools        2589394 
 8 abd              453631.
 9 abf2              35693.
10 abind             32939.
# … with 6,013 more rows

# Per-package summary of the grouped table:
#   count     - number of rows in the group
#   unique    - number of distinct ip_id values
#   countries - number of distinct country values
#   avg_bytes - mean of size
pack_sum <- summarize(
  by_package,
  count = n(),
  unique = n_distinct(ip_id),
  countries = n_distinct(country),
  avg_bytes = mean(size)
)
pack_sum

# A tibble: 6,023 × 5
   package     count unique countries avg_bytes
   <chr>       <int>  <int>     <int>     <dbl>
 1 A3             25     24        10    62195.
 2 abc            29     25        16  4826665 
 3 abcdeFBA       15     15         9   455980.
 4 ABCExtremes    18     17         9    22904.
 5 ABCoptim       16     15         9    17807.
 6 ABCp2          18     17        10    30473.
 7 abctools       19     19        11  2589394 
 8 abd            17     16        10   453631.
 9 abf2           13     13         9    35693.
10 abind         396    365        50    32939.
# … with 6,013 more rows

quantile()

# Find the 99th percentile of per-package row counts and keep only the
# packages above it. Using the computed cutoff instead of a hard-coded number
# keeps the filter correct if the underlying data changes.
count_cutoff <- quantile(pack_sum$count, probs = 0.99)
count_cutoff
top_counts <- filter(pack_sum, count > count_cutoff)
# Print the filtered table (the tibble output that follows).
top_counts
# A tibble: 61 × 5
   package    count unique countries avg_bytes
   <chr>      <int>  <int>     <int>     <dbl>
 1 bitops      1549   1408        76    28715.
 2 car         1008    837        64  1229122.
 3 caTools      812    699        64   176589.
 4 colorspace  1683   1433        80   357411.
 5 data.table   680    564        59  1252721.
 6 DBI         2599    492        48   206933.
 7 devtools     769    560        55   212933.
 8 dichromat   1486   1257        74   134732.
 9 digest      2210   1894        83   120549.
10 doSNOW       740     75        24     8364.
# … with 51 more rows

# Order the top packages from highest to lowest count.
top_counts_sorted <- arrange(top_counts, desc(count))

pipe (%>%)

Ctrl+Shift+M (RStudio shortcut that inserts the pipe operator %>%)
GSFA : group_by -> summarize -> filter -> arrange

# GSFA chain written with the pipe:
#   1. group the rows of cran by package
#   2. summarize each group (row count, distinct ip_id, distinct country,
#      mean size)
#   3. keep the packages with more than 60 distinct countries
#   4. sort by countries (descending), breaking ties by avg_bytes (ascending)
result3 <- cran %>%
  group_by(package) %>%
  summarize(
    count = n(),
    unique = n_distinct(ip_id),
    countries = n_distinct(country),
    avg_bytes = mean(size)
  ) %>%
  filter(countries > 60) %>%
  arrange(desc(countries), avg_bytes)
上一篇下一篇

猜你喜欢

热点阅读