数据仓库Hive

HIVE:常用分析函数

2021-07-19  本文已影响0人  惊不意外

1、lag() over()

lag(pay_succ_time, 1, '1990-01-01 00:00:00') over(partition by user_pin order by pay_succ_time)
注:取用户的上一笔交易时间,若无上一笔交易(即本单为用户首单),则令上一笔交易时间=1990-01-01 00:00:00(第三个参数不写时,默认为 null)

拓展:年新用户判定

select user_pin, ---用户PIN
    pay_succ_time, --完成时间
    to_date(pay_succ_time) as dt
from
(
    select
    pay_succ_time, --完成时间
    user_pin, ---用户PIN
    datediff(pay_succ_time,lag(pay_succ_time, 1, '1990-01-01 00:00:00') over(partition by user_pin order by pay_succ_time)) as dis_date
    from
    (
        select user_pin,pay_succ_time, --完成时间
            row_number() over(partition by trade_no order by pay_succ_time asc) as rn
        from db.table_nm
        where dt between date_sub('$TX_DATE', 730) and '$TX_DATE'
        and to_date(pay_succ_time) between date_sub('$TX_DATE', 730) and '$TX_DATE'
        and trim(nvl(user_pin,''))<>''
    )x
    where rn = 1
)t
where dis_date > 365

2、sum() over()、count() over()

-- 汇总每个用户的交易额
sum(tx_amt) over(partition by user_pin))

-- 汇总每个用户的订单量
count(distinct ordr_num) over(partition by user_pin)

3、rank() over,dense_rank() over,row_number() over

rank() over:1 2 2 4
查出指定条件后的进行排名。特点是,加入是对学生排名,使用这个函数,成绩相同的两名是并列,下一位同学空出所占的名次。

select 
    name,subject,score
    ,rank() over(partition by subject order by score desc) rank
from student_score;
# output:1 2 2 4

dense_rank() over:1 2 2 3
与ran() over的區别是,两名学生的成绩并列以后,下一位同学并不空出所占的名次。

select 
    name,subject,score
    ,dense_rank() over(partition by subject order by score desc) rank
from student_score;
# output:1 2 2 3

row_number() over:1 2 3 4
这个函数不需要考虑是否并列,哪怕根据条件查询出来的数值相同也会进行连续排名

select 
    name,subject,score
    ,row_number() over(partition by subject order by score desc) rank
from student_score;
# output:1 2 3 4

使用rank() over的时候,空值是最大的,如果排序字段为null,可能造成null字段排在最前面,影响排序结果。
可以这样:rank() over(partition by course order by score desc nulls last)
来规避这个问题。

select 
    name,subject,score
    ,rank() over(partition by subject order by score desc nulls last) rank
from student_score;

拓展:求连续最大天数

-- step1
use dev;
drop table dev.fin_user_fig_continue_hold;
create table dev.fin_user_fig_continue_hold as
select user_pin,count(flag) as continue_hold_cnt
from
(
    select
        user_pin
        ,(row_number() over(partition by user_pin order by dt)) - datediff(dt,start_date) as flag
    from dev.fin_user_fig_03
)t1
group by user_pin

-- step2
select
    count(case when max_continue_hold>=30 then user_pin end) as one_mth_pin_cnt
    ,count(case when max_continue_hold>=90 then user_pin end) as three_mth_pin_cnt
    ,count(case when max_continue_hold>=180 then user_pin end) as nine_mth_pin_cnt
    ,count(case when max_continue_hold>=360 then user_pin end) as one_year_pin_cnt
    ,(case when max_continue_hold>=30 then user_pin end)/count(user_pin) as one_mth_pin_rate
    ,count(case when max_continue_hold>=90 then user_pin end)/count(user_pin) as three_mth_pin_rate
    ,count(case when max_continue_hold>=180 then user_pin end)/count(user_pin) as nine_mth_pin_rate
    ,count(case when max_continue_hold>=360 then user_pin end)/count(user_pin) as one_year_pin_rate
from
(
    select user_pin,max(continue_hold_cnt) as max_continue_hold
    from dev.fin_user_fig_continue_hold
    group by user_pin
)t

拓展:求用户首单便捷方法——利用named_struct

select
    pin as jd_pin
    ,struct1.orderid as jd_order_id
from
(
   select
        pin
        ,min(named_struct('consumerdate',consumerdate,'orderid',orderid)) as struct1
    from db.table_nm
    where dt='{TX_DATE}'
    group by pin
) m
group by pin,struct1.orderid
上一篇下一篇

猜你喜欢

热点阅读