HIVE:常用分析函数
2021-07-19 本文已影响0人
惊不意外
1、lag() over()
lag(pay_succ_time, 1, '1990-01-01 00:00:00') over(partition by user_pin order by pay_succ_time)
注:取用户的上一笔交易时间,若无上一笔交易(即本单为用户首单),则令上一笔交易时间=1990-01-01 00:00:00(第三个参数不写时,默认为 null)
拓展:年新用户判定
select user_pin, ---用户PIN
pay_succ_time, --完成时间
to_date(pay_succ_time) as dt
from
(
select
pay_succ_time, --完成时间
user_pin, ---用户PIN
datediff(pay_succ_time,lag(pay_succ_time, 1, '1990-01-01 00:00:00') over(partition by user_pin order by pay_succ_time)) as dis_date
from
(
select user_pin,pay_succ_time, --完成时间
row_number() over(partition by trade_no order by pay_succ_time asc) as rn
from db.table_nm
where dt between date_sub('$TX_DATE', 730) and '$TX_DATE'
and to_date(pay_succ_time) between date_sub('$TX_DATE', 730) and '$TX_DATE'
and trim(nvl(user_pin,''))<>''
)x
where rn = 1
)t
where dis_date > 365
2、sum() over()、count() over()
-- 汇总每个用户的交易额
sum(tx_amt) over(partition by user_pin))
-- 汇总每个用户的订单量
count(distinct ordr_num) over(partition by user_pin)
3、rank() over,dense_rank() over,row_number() over
rank() over:1 2 2 4
查出指定条件后的进行排名。特点是,加入是对学生排名,使用这个函数,成绩相同的两名是并列,下一位同学空出所占的名次。
select
name,subject,score
,rank() over(partition by subject order by score desc) rank
from student_score;
# output:1 2 2 4
dense_rank() over:1 2 2 3
与ran() over的區别是,两名学生的成绩并列以后,下一位同学并不空出所占的名次。
select
name,subject,score
,dense_rank() over(partition by subject order by score desc) rank
from student_score;
# output:1 2 2 3
row_number() over:1 2 3 4
这个函数不需要考虑是否并列,哪怕根据条件查询出来的数值相同也会进行连续排名
select
name,subject,score
,row_number() over(partition by subject order by score desc) rank
from student_score;
# output:1 2 3 4
使用rank() over的时候,空值是最大的,如果排序字段为null,可能造成null字段排在最前面,影响排序结果。
可以这样:rank() over(partition by course order by score desc nulls last)
来规避这个问题。
select
name,subject,score
,rank() over(partition by subject order by score desc nulls last) rank
from student_score;
拓展:求连续最大天数
-- step1
use dev;
drop table dev.fin_user_fig_continue_hold;
create table dev.fin_user_fig_continue_hold as
select user_pin,count(flag) as continue_hold_cnt
from
(
select
user_pin
,(row_number() over(partition by user_pin order by dt)) - datediff(dt,start_date) as flag
from dev.fin_user_fig_03
)t1
group by user_pin
-- step2
select
count(case when max_continue_hold>=30 then user_pin end) as one_mth_pin_cnt
,count(case when max_continue_hold>=90 then user_pin end) as three_mth_pin_cnt
,count(case when max_continue_hold>=180 then user_pin end) as nine_mth_pin_cnt
,count(case when max_continue_hold>=360 then user_pin end) as one_year_pin_cnt
,(case when max_continue_hold>=30 then user_pin end)/count(user_pin) as one_mth_pin_rate
,count(case when max_continue_hold>=90 then user_pin end)/count(user_pin) as three_mth_pin_rate
,count(case when max_continue_hold>=180 then user_pin end)/count(user_pin) as nine_mth_pin_rate
,count(case when max_continue_hold>=360 then user_pin end)/count(user_pin) as one_year_pin_rate
from
(
select user_pin,max(continue_hold_cnt) as max_continue_hold
from dev.fin_user_fig_continue_hold
group by user_pin
)t
拓展:求用户首单便捷方法——利用named_struct
select
pin as jd_pin
,struct1.orderid as jd_order_id
from
(
select
pin
,min(named_struct('consumerdate',consumerdate,'orderid',orderid)) as struct1
from db.table_nm
where dt='{TX_DATE}'
group by pin
) m
group by pin,struct1.orderid