hive学习(五)——面试题(活跃用户及红包用户)
2019-12-05 本文已影响0人
Gaafung峰
前言:
本次文章主要以hivesql进行查询,涉及同一场景但较为复杂。
来源于:面试题八
前期准备:
image.png1.建表导入数据
2.1用户活跃模型表
create table tmp_liujg_dau_based(
imp_date string comment '日期',
qimei string comment '用户唯一标识',
is_new string comment '新用户表示,1表示新用户,0表示老用户',
;
ALTER TABLE tmp_liujg_dau_based COMMENT '用户活跃模型表';
2.2红包参与领取模型表
create table tmp_liujg_packed_based
(
imp_date string comment '日期',
report_time string comment '领取时间戳',
qimei string not null comment '用户唯一标识',
add_money string not null comment '领取金额,单位为分');
ALTER TABLE tmp_liujg_packed_based COMMENT '红包参与领取模型表';
2.3用户活跃模型表数据导入
insert into tmp_liujg_dau_based values('20190301','001','0')
;insert into tmp_liujg_dau_based values('20190301','002','0')
;insert into tmp_liujg_dau_based values('20190301','003','1')
;insert into tmp_liujg_dau_based values('20190301','004','1')
;insert into tmp_liujg_dau_based values('20190301','005','1')
;insert into tmp_liujg_dau_based values('20190301','006','1')
;insert into tmp_liujg_dau_based values('20190302','001','0')
;insert into tmp_liujg_dau_based values('20190302','002','0')
;insert into tmp_liujg_dau_based values('20190302','003','0')
;insert into tmp_liujg_dau_based values('20190302','005','0')
;insert into tmp_liujg_dau_based values('20190302','006','0')
;insert into tmp_liujg_dau_based values('20190302','007','1')
;insert into tmp_liujg_dau_based values('20190303','005','0')
;insert into tmp_liujg_dau_based values('20190303','006','0')
;insert into tmp_liujg_dau_based values('20190303','007','0')
;insert into tmp_liujg_dau_based values('20190303','008','1')
;insert into tmp_liujg_dau_based values('20190303','009','1')
;insert into tmp_liujg_dau_based values('20190303','010','1')
;insert into tmp_liujg_dau_based values('20190401','008','0')
;insert into tmp_liujg_dau_based values('20190401','009','0')
;insert into tmp_liujg_dau_based values('20190401','010','0')
;insert into tmp_liujg_dau_based values('20190401','011','1')
;insert into tmp_liujg_dau_based values('20190401','012','1')
;insert into tmp_liujg_dau_based values('20190402','009','0')
;insert into tmp_liujg_dau_based values('20190402','010','0')
;insert into tmp_liujg_dau_based values('20190402','011','0')
;insert into tmp_liujg_dau_based values('20190402','012','0')
;insert into tmp_liujg_dau_based values('20190402','013','1')
;insert into tmp_liujg_dau_based values('20190402','014','1')
;insert into tmp_liujg_dau_based values('20190501','001','0')
;insert into tmp_liujg_dau_based values('20190501','002','0')
;insert into tmp_liujg_dau_based values('20190501','008','0')
;insert into tmp_liujg_dau_based values('20190501','007','0')
;insert into tmp_liujg_dau_based values('20190501','015','1')
;insert into tmp_liujg_dau_based values('20190501','016','1')
;insert into tmp_liujg_dau_based values('20190501','017','1')
;insert into tmp_liujg_dau_based values('20190501','018','1')
;insert into tmp_liujg_dau_based values('20190601','008','0')
;insert into tmp_liujg_dau_based values('20190601','017','0')
;insert into tmp_liujg_dau_based values('20190601','018','0')
;insert into tmp_liujg_dau_based values('20190601','019','1')
;insert into tmp_liujg_dau_based values('20190601','020','1')
;insert into tmp_liujg_dau_based values('20190601','021','1')
;insert into tmp_liujg_dau_based values('20190601','022','1')
;insert into tmp_liujg_dau_based values('20190603','021','0')
;insert into tmp_liujg_dau_based values('20190603','022','0')
;insert into tmp_liujg_dau_based values('20190603','011','0')
;insert into tmp_liujg_dau_based values('20190603','012','0')
;insert into tmp_liujg_dau_based values('20190603','023','1')
;insert into tmp_liujg_dau_based values('20190701','023','0')
;insert into tmp_liujg_dau_based values('20190701','008','0')
;insert into tmp_liujg_dau_based values('20190701','011','0')
;insert into tmp_liujg_dau_based values('20190701','022','0')
;insert into tmp_liujg_dau_based values('20190701','012','0')
;insert into tmp_liujg_dau_based values('20190701','024','1')
;insert into tmp_liujg_dau_based values('20190701','025','1')
;insert into tmp_liujg_dau_based values('20190701','026','1')
;insert into tmp_liujg_dau_based values('20190701','027','1')
;insert into tmp_liujg_dau_based values('20190705','026','0')
;insert into tmp_liujg_dau_based values('20190705','027','0')
;insert into tmp_liujg_dau_based values('20190705','009','0')
;insert into tmp_liujg_dau_based values('20190705','010','0')
;insert into tmp_liujg_dau_based values('20190705','028','1')
;insert into tmp_liujg_dau_based values('20190705','029','1')
;
2.2红包参与领取模型表数据导入
insert into tmp_liujg_packed_based values('20190301','2019/03/01 10:15:01','001','1.05')
;insert into tmp_liujg_packed_based values('20190301','2019/03/01 13:15:01','001','2.30')
;insert into tmp_liujg_packed_based values('20190301','2019/03/01 11:15:01','002','0.80')
;insert into tmp_liujg_packed_based values('20190301','2019/03/01 17:15:01','002','0.89')
;insert into tmp_liujg_packed_based values('20190301','2019/03/01 14:15:01','003','2.12')
;insert into tmp_liujg_packed_based values('20190301','2019/03/01 18:15:01','003','1.12')
;insert into tmp_liujg_packed_based values('20190301','2019/03/01 17:15:01','005','1.12')
;insert into tmp_liujg_packed_based values('20190301','2019/03/01 19:15:01','005','0.12')
;insert into tmp_liujg_packed_based values('20190301','2019/03/01 09:15:01','006','0.98')
;insert into tmp_liujg_packed_based values('20190301','2019/03/01 11:15:01','006','1.45')
;insert into tmp_liujg_packed_based values('20190302','2019/03/02 09:30:01','001','0.78')
;insert into tmp_liujg_packed_based values('20190302','2019/03/02 19:30:01','001','0.88')
;insert into tmp_liujg_packed_based values('20190302','2019/03/02 10:30:01','003','0.68')
;insert into tmp_liujg_packed_based values('20190302','2019/03/02 09:30:01','005','1.01')
;insert into tmp_liujg_packed_based values('20190302','2019/03/02 14:30:01','005','1.88')
;insert into tmp_liujg_packed_based values('20190302','2019/03/02 14:30:01','006','1.88')
;insert into tmp_liujg_packed_based values('20190302','2019/03/02 10:30:01','006','0.68')
;insert into tmp_liujg_packed_based values('20190302','2019/03/02 15:30:01','007','0.68')
;insert into tmp_liujg_packed_based values('20190302','2019/03/02 16:30:01','007','1.78')
;insert into tmp_liujg_packed_based values('20190303','2019/03/03 16:30:01','005','0.68')
;insert into tmp_liujg_packed_based values('20190303','2019/03/03 08:50:01','006','0.32')
;insert into tmp_liujg_packed_based values('20190303','2019/03/03 10:50:01','006','1.78')
;insert into tmp_liujg_packed_based values('20190303','2019/03/03 10:50:01','007','0.32')
;insert into tmp_liujg_packed_based values('20190303','2019/03/03 15:50:01','008','1.01')
;insert into tmp_liujg_packed_based values('20190303','2019/03/03 17:50:01','008','1.68')
;insert into tmp_liujg_packed_based values('20190303','2019/03/03 10:30:01','010','1.88')
;insert into tmp_liujg_packed_based values('20190303','2019/03/03 15:50:01','010','0.32')
;insert into tmp_liujg_packed_based values('20190401','2019/04/01 09:50:00','008','0.18')
;insert into tmp_liujg_packed_based values('20190401','2019/04/01 11:50:00','009','0.88')
;insert into tmp_liujg_packed_based values('20190401','2019/04/01 12:35:00','009','0.32')
;insert into tmp_liujg_packed_based values('20190401','2019/04/01 17:50:00','010','1.01')
;insert into tmp_liujg_packed_based values('20190401','2019/04/01 12:35:00','011','1.68')
;insert into tmp_liujg_packed_based values('20190401','2019/04/01 12:50:00','011','0.88')
;insert into tmp_liujg_packed_based values('20190401','2019/04/01 15:50:00','012','0.32')
;insert into tmp_liujg_packed_based values('20190401','2019/04/01 17:50:00','012','1.68')
;insert into tmp_liujg_packed_based values('20190401','2019/04/01 12:35:00','012','1.88')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 09:50:00','009','0.18')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 19:50:00','009','1.18')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 17:50:00','010','0.88')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 16:50:00','010','0.32')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 19:50:00','010','0.32')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 16:50:00','013','0.88')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 19:55:00','013','0.88')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 20:55:00','013','1.01')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 19:55:00','014','0.32')
;insert into tmp_liujg_packed_based values('20190402','2019/04/02 20:55:00','014','1.01')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 09:50:00','001','1.18')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 09:55:00','002','0.32')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 10:55:00','002','0.32')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 10:00:00','007','0.88')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 10:55:00','015','0.88')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 10:00:00','015','0.32')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 14:00:00','017','1.01')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 15:00:00','017','1.01')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 15:30:00','018','0.88')
;insert into tmp_liujg_packed_based values('20190501','2019/05/01 16:30:00','018','0.68')
;insert into tmp_liujg_packed_based values('20190601','2019/06/01 09:50:00','008','1.38')
;insert into tmp_liujg_packed_based values('20190601','2019/06/01 14:50:00','017','0.88')
;insert into tmp_liujg_packed_based values('20190601','2019/06/01 11:50:00','019','1.01')
;insert into tmp_liujg_packed_based values('20190601','2019/06/01 13:50:00','019','0.88')
;insert into tmp_liujg_packed_based values('20190601','2019/06/01 14:50:00','019','0.68')
;insert into tmp_liujg_packed_based values('20190602','2019/06/02 09:50:00','021','0.38')
;insert into tmp_liujg_packed_based values('20190602','2019/06/02 13:50:00','012','0.88')
;insert into tmp_liujg_packed_based values('20190602','2019/06/02 15:50:00','012','1.12')
;insert into tmp_liujg_packed_based values('20190602','2019/06/02 13:59:00','023','0.88')
;insert into tmp_liujg_packed_based values('20190602','2019/06/02 15:50:00','023','1.01')
;insert into tmp_liujg_packed_based values('20190701','2019/07/01 09:50:00','023','0.38')
;insert into tmp_liujg_packed_based values('20190701','2019/07/01 13:50:00','023','0.78')
;insert into tmp_liujg_packed_based values('20190701','2019/07/01 14:50:00','008','0.68')
;insert into tmp_liujg_packed_based values('20190701','2019/07/01 14:50:00','024','0.68')
;insert into tmp_liujg_packed_based values('20190701','2019/07/01 15:50:00','024','1.68')
;insert into tmp_liujg_packed_based values('20190701','2019/07/01 14:50:00','026','0.68')
;insert into tmp_liujg_packed_based values('20190701','2019/07/01 18:50:00','026','1.68')
;insert into tmp_liujg_packed_based values('20190701','2019/07/01 14:50:00','027','0.68')
;insert into tmp_liujg_packed_based values('20190701','2019/07/01 19:35:00','027','1.11')
;insert into tmp_liujg_packed_based values('20190702','2019/07/02 09:50:00','026','0.78')
;insert into tmp_liujg_packed_based values('20190702','2019/07/02 11:50:00','026','0.78')
;insert into tmp_liujg_packed_based values('20190702','2019/07/02 13:50:00','028','1.01')
;insert into tmp_liujg_packed_based values('20190702','2019/07/02 14:35:00','028','0.88')
;insert into tmp_liujg_packed_based values('20190702','2019/07/02 15:35:00','028','0.33
解题过程:
1.计算2019年3月,每个月领过红包用户和未领红包用户的数量,平均月活跃天数(即本月平均活跃多少天)
解法一:
Select a.month,packed_user,total-packed_user as no_packed_user,day from ((
Select substring(imp_date,0,6) as month,count(distinct qimei) total,count(distinct imp_date) as day from tmp_liujg_dau_based group by substring(imp_date,0,6) having month>="201903") a
Left join
(Select substring(imp_date,0,6) as month ,count(distinct qimei) packed_user from tmp_liujg_packed_based group by substring(imp_date,0,6) having month>="201903") b on a.month = b.month);
image.png
解法二:
With tmp as (
Select substring(a.imp_date,0,6) as month,a.qimei as total_qimei,b.qimei as packed_qimei,a.imp_date day,(case when b.qimei is not null then "hb" else "no_hb" end) packed_user from tmp_liujg_dau_based a left join tmp_liujg_packed_based b on
Substring(a.imp_date,0,6)=Substring(b.imp_date,0,6) and a.qimei=b.qimei)
Select month,packed_user,count(distinct total_qimei) ,count(distinct day) as day from tmp group by month,packed_user;
image.png
2.计算2019年3月至今,每个月活跃用户的注册日期,2019年3月1日前注册的用户日期填空即可
With tmp as(
Select min(imp_date) as new_date,qimei from tmp_liujg_dau_based where is_new="1" and imp_date>="20190301" group by qimei )
Select substring(imp_date,0,6) month,a.qimei,new_date from tmp_liujg_dau_based a left join tmp b on a.qimei=b.qimei group by substring(imp_date,0,6),a.qimei,new_date having substring(imp_date,0,6)>="201903";
image.png
3.计算2019年3月至今,每日的用户次日留存率,领取红包用户的次日留存,未领取红包用户的次日留存率
每日用户次日留存率
Select imp_date,qimei,LEAD(imp_date,1) OVER(partition by qimei order by imp_date) as second_imp_date,(case when LEAD(imp_date,1) OVER(partition by qimei order by imp_date)-imp_date=1 then 1 else null end ) liucun from tmp_liujg_dau_based where imp_date>="20190301" group by qimei,imp_date;
image.png
With tmp as (Select imp_date,qimei,LEAD(imp_date,1) OVER(partition by qimei order by imp_date) as second_imp_date,(case when LEAD(imp_date,1) OVER(partition by qimei order by imp_date)-imp_date=1 then 1 else null end ) liucun from tmp_liujg_dau_based where imp_date>="20190301" group by qimei,imp_date)
Select imp_date,sum(liucun)/count(qimei) from tmp group by imp_date;
image.png
领取红包用户的次日留存
With tmp as (Select imp_date,qimei,LEAD(imp_date,1) OVER(partition by qimei order by imp_date) as second_imp_date,(case when LEAD(imp_date,1) OVER(partition by qimei order by imp_date)-imp_date=1 then 1 else null end ) liucun from tmp_liujg_packed_based where imp_date>="20190301" group by qimei,imp_date)
Select imp_date,sum(liucun)/count(qimei) from tmp group by imp_date;
image.png
连接查询(存在问题 活跃用户表中的用户不一定存在于红包用户中)
select a.imp_date,
a.qimei,
b.qimei as hb_qimei,
(case when lead(imp_date,1) OVER(PARTITION BY a.qimei ORDER BY imp_date)-imp_date=1 then 1 else null end) as total_rank_times,
(case when lead(imp_date,1) OVER(PARTITION BY b.qimei ORDER BY imp_date)-imp_date=1 then 1 else null end) as hb_rank_times,
(case when b.qimei is null and lead(imp_date,1) OVER(PARTITION BY a.qimei ORDER BY imp_date)-imp_date=1 then 1 else null end) as no_hb_rank_times
from tmp_liujg_dau_based a left join tmp_liujg_packed_based b on a.imp_date=b.imp_date and a.qimei=b.qimei where a.imp_date>="20190301" group by a.imp_date,a.qimei,b.qimei ;
image.png
With tmp as (
select a.imp_date,
a.qimei,
b.qimei as hb_qimei,
(case when lead(imp_date,1) OVER(PARTITION BY a.qimei ORDER BY imp_date)-imp_date=1 then 1 else null end) as total_rank_times,
(case when lead(imp_date,1) OVER(PARTITION BY b.qimei ORDER BY imp_date)-imp_date=1 then 1 else null end) as hb_rank_times,
(case when b.qimei is null and lead(imp_date,1) OVER(PARTITION BY a.qimei ORDER BY imp_date)-imp_date=1 then 1 else null end) as no_hb_rank_times
from tmp_liujg_dau_based a left join tmp_liujg_packed_based b on a.imp_date=b.imp_date and a.qimei=b.qimei where a.imp_date>="20190301" group by a.imp_date,a.qimei,b.qimei )
Select imp_date,count(hb_qimei)/count(qimei) as per,
sum(total_rank_times)/count(qimei) as total_liucun,
sum(hb_rank_times)/count(hb_qimei) as hb_liucun,
sum(no_hb_rank_times)/count(qimei) as no_hb_liucun
from tmp group by imp_date;
image.png
4.计算2019年6月1日至今,每日新用户领取得第一个红包的金额
With tmp as (
Select a.imp_date,report_time,a.qimei,add_money,ROW_Number() over (partition by a.imp_date,a.qimei order by report_time) rank from tmp_liujg_packed_based a left join tmp_liujg_dau_based b on a.qimei=b.qimei and a.imp_date=b.imp_date where is_new="1")
Select imp_date,qimei,add_money from tmp where rank=1;
image.png
5.计算2019年3月1日至今,每个新用户领取的第一个红包和第二个红包的时间差(只计算注册当日有领取红包的用户,注册当日及以后的DAU表中新用户为1的用户)
Select a.imp_date,report_time,a.qimei,row_number() over(partition by a.imp_date,a.qimei order by report_time) rank ,lead(a.report_time,1) over(partition by a.imp_date,a.qimei order by report_time) rank_time from tmp_liujg_packed_based a left join tmp_liujg_dau_based b on a.imp_date=b.imp_date and a.qimei=b.qimei where is_new="1")
image.png
由于无timestampdiff函数计算两者分钟差,只能进行自我换算后的加减
6. image.png
Select qimei,imp_date,row_number() over(partition by qimei order by imp_date) rank from tmp_liujg_packed_based group by imp_date,qimei;
With tmp as (
Select qimei,imp_date,row_number() over(partition by qimei order by imp_date) rank from tmp_liujg_packed_based group by imp_date,qimei)
Select imp_date,count(qimei) from tmp where rank>=30 group by imp_date;
7. image.png
#第一步:对每天登陆进行编号
Select qimei,imp_date,row_number() over(partition by qimei order by imp_date) rank from tmp_liujg_packed_based group by imp_date,qimei;
#第二步:利用date_sub(imp_date,rank),查看是否一致(连续),然后进行分组 求(连续登陆天数)
Select qimei,imp_date,date_sub(imp_date,rank) as login_group,min(imp_date) as star_date1,max(imp_date) as end_date1,count(1) as continuous_days from
(Select qimei,imp_date,row_number() over(partition by qimei order by imp_date) rank from tmp_liujg_packed_based group by imp_date,qimei) a
Group by qimei,date_sub(imp_date,rank);
#第三步:按qimei分组,取max(continuous_days)
Select qimei,star_date1,end_date1,max(continuous_days) from (
Select qimei,imp_date,date_sub(imp_date,rank) as login_group,min(imp_date) as star_date1,max(imp_date) as end_date1,count(1) as continuous_days from
(Select qimei,imp_date,row_number() over(partition by qimei order by imp_date) rank from tmp_liujg_packed_based group by imp_date,qimei) a
) b group by qimei;
8. image.png
#第一步:对每天登陆进行编号
Select qimei,imp_date,row_number() over(partition by qimei order by imp_date) rank from tmp_liujg_packed_based group by imp_date,qimei;
#第二步:利用date_sub(imp_date,rank),查看是否一致(连续),然后进行分组 求(连续登陆天数)
Select qimei,imp_date,date_sub(imp_date,rank) as login_group,min(imp_date) as star_date1,max(imp_date) as end_date1,count(1) as continuous_days from
(Select qimei,imp_date,row_number() over(partition by qimei order by imp_date) rank from tmp_liujg_packed_based group by imp_date,qimei) a
Group by qimei,date_sub(imp_date,rank);
#第三步:求相邻时间差 第一条数据的end_date - 第二条数据的star_date
Select qimei,imp_date,date_sub(imp_date,rank) as login_group,min(imp_date) as star_date1,max(imp_date) as end_date1,count(1) as continuous_days,lead(min(imp_date),1,max(imp_date))-max(imp_date) as un_logintime from
(Select qimei,imp_date,row_number() over(partition by qimei order by imp_date) rank from tmp_liujg_packed_based group by imp_date,qimei) a
Group by qimei,date_sub(imp_date,rank);
#第四步:分组取max(un_logintime)
Select qimei,imp_date,max(un_logintime) from
(Select qimei,imp_date,date_sub(imp_date,rank) as login_group,min(imp_date) as star_date1,max(imp_date) as end_date1,count(1) as continuous_days,lead(min(imp_date),1,max(imp_date))-max(imp_date) as un_logintime from
(Select qimei,imp_date,row_number() over(partition by qimei order by imp_date) rank from tmp_liujg_packed_based group by imp_date,qimei) a
Group by qimei,date_sub(imp_date,rank)b)c group by qimei;