构建clickhouse复杂数据模型
2019-09-19 本文已影响0人
郭彦超
智能路径
- 输入,在数据范围内指定结束事件与窗口大小
- 返回,按用户访问时间由小到大排序后的路径字符串
select
d_i,
arrayStringConcat(
arrayMap(
b - > tupleElement(b, 1),
arraySort(
y - > tupleElement(y, 2),
arrayFilter(
(x, y, z) - > toDateTimeOrZero(z) - toDateTimeOrZero(y) < 1000,
arrayMap(
(x, y) - > (x, y),
groupArray(e_t),
groupArray(time)
),
groupArray(time),
arrayWithConstant(
length(groupArray(time)),
maxIf(time, e_t = 'launch')
)
)
)
),
'->'
) path
from
bw.scene_tracker
where
d_i <> ''
group by
d_i
例子
上述例子窗口大小为1000s,结束事件 “launch”; 亿级数据妙出
- 简版
select path,count(1)cn from(
select uid, maxIf( ts, url = 'https://ark.analysys.cn/browseGoods' ) as maxTime,
arrayFilter(x->maxTime-x.1<60*30*1000 and maxTime>=x.1 , groupArray( (ts,url) )) as window_data,
arraySort(x->x.1,window_data) as sort_data,
arrayStringConcat(sort_data.2,'->') as path
from bw.session group by uid ) where path<>'' group by path order by cn desc limit 11
- 对路径中相邻页面重复的数据进行去重
select path,count(1)cn from(
select uid, maxIf( ts, url = 'https://ark.analysys.cn/browseGoods' ) as maxTime,
arrayFilter(x->maxTime-x.1<60*30*1000 and maxTime>=x.1 , groupArray( (ts,url) )) as window_data,
arraySort(x->x.1,window_data) as sort_data,
arrayFilter((x,y)->x<>sort_data[y-1].2 ,sort_data.2,arrayEnumerate( sort_data )) as sort_data_url, --相邻去重
arrayStringConcat(sort_data_url,'->') as path
from bw.session group by uid ) where path<>'' group by path order by cn desc limit 11
- 线上环境测试版
select data, count(1) cn from (
with maxIf( c_t , cat='page_view'and act='页面_浏览') as max_time, -- 目标事件时间
arraySort(
e -> e.1,
arrayFilter(x->x.1<=toUInt64OrZero(max_time),groupArray((toUInt64OrZero(c_t), (cat,act) )))
) as sorted_array,
-- 按时间排序后的数据
arrayPushFront( sorted_array, sorted_array[1] ) as e_arr,
arrayFilter(
(i, e,z) -> z.1 < toUInt64OrZero(max_time)
and (e > 1800000 or (z.2.1='page_view' and z.2.2='页面_浏览')),
arrayEnumerate(e_arr), arrayDifference( e_arr.1 ),e_arr
) as arr_indx, -- 过滤目标事件、时间差后的数据
arrayReduce('max',arr_indx) as smIndx,
arrayFilter(
(e,i) -> i>=smIndx and e.1<=toUInt64OrZero(max_time) ,
sorted_array, arrayEnumerate(sorted_array)
) as data_
select u_i,
arrayFilter((x,y)-> y<>0 ,data_.2,arrayDifference(arrayEnumerateDense(data_.2))) as data__,
arraySlice(data__,length(data__)-8,8 ) as data,
-- arrayStringConcat(data,'->') as path,
hasAll(data, [ ('page_view','页面_浏览') ]) as has_way_point
from app.scene_tracker where c_p='PC' and length(u_i)>20
group by u_i having length(data)>1
) tab
where has_way_point=1 group by data order by cn desc limit 1000
易观 OLAP Session分析
http://ds.analysys.cn/2019/session.html
1、计算默认session每天的会话次数、人均访问时长、退出率
SELECT
day,
countDistinct(sid) AS scn,
countDistinct(uid) AS ucn,
sum(t2 - t1) / ucn AS dur,
countIf(t1 = t2) / scn AS t_rate
FROM
(
SELECT
day,
uid,
sid,
min(ts2) AS t1,
max(ts2) AS t2
FROM bw.session2
GROUP BY
day,
uid,
sid
)
GROUP BY day
ORDER BY day ASC
2、根据动态session计算每日会话次数
- 第一版
# 4s
select
day,
sum(length(sessions))
from
(
select
day,
arrayFilter(
(y, z) - > dateDiff(
'minute',
toDateTimeOrZero(y),
toDateTimeOrZero(z)
) > 30,
arraySort(x - > x, groupArray(time)),
arrayPushBack(
arrayPopFront(arraySort(x - > x, groupArray(time))),
'2029-09-08 23:21:30'
)
) sessions
from
bw.scene_tracker
where
d_i <> ''
group by
day,
d_i
)
where
length(sessions) > 0
group by
day
- 第二版
# 4s
select day,sumArray(arrayFilter((z,x,y)->dateDiff( 'minute', toDateTimeOrZero(x) ,toDateTimeOrZero(y)) >30,tupleElement(sessions,3),tupleElement(sessions,1), tupleElement(sessions,2)) ) from (
select day, arrayMap((x,y) -> ( x,y,1),
arraySort(groupArray(time)),
arrayPushBack(arrayPopFront(arraySort(x->x, groupArray(time))),'2029-09-08 23:21:30')) sessions
from bw.scene_tracker where d_i<>'' group by day, d_i ) group by day
- 第三版
# 4s
select day,sum(length(arrayFilter((x,y)->dateDiff( 'minute', toDateTimeOrZero(x) ,toDateTimeOrZero(y)) >30, t1, t2)) ) from (
select day,
arraySort(x->x, groupArray(time )) t1 ,
arrayPushBack(arrayPopFront( t1 ),'2029-09-08 23:21:30') t2
from bw.scene_tracker where d_i<>'' group by day, d_i ) group by day limit 1111
- 终结版
# 使用超时时间30分钟+跨天的session切割规则,计算出20190501-20190510,每天的会话次数
select day ,sumArray( sessions ) from (
select day, arrayMap(( y,z) -> if(dateDiff( 'minute', y ,z)>30,1,0) ,
arraySort(groupArray( ts2)) as t1,
arrayPushBack(arrayPopFront(t1) ,addYears(now(),1000))) sessions
from bw.session2 group by day, uid ) where length(sessions)>0 group by day
- 简化版
select day ,sum(length( sessions )) from (
select day, arrayFilter(x -> x>30*60*1000,
arrayDifference(arraySort( groupArray( ts )))) sessions
from bw.session2 group by day, uid ) group by day
- 整合版
select day , sum(arrayUniq(psarray)) pscn, sum(arrayUniq(psarray)-length(intersect)) /sum(length(idxx)) as t_rate from (
select day,
arraySort(x->x.2, groupArray( (url,ts,event_code )))as cur,
arrayFilter( (x,y,z)-> y>30*60*1000 or z='$start_event' , range( length(cur)) , arrayDifference(cur.2) ,cur.3) as idx ,
arrayPushBack(idx ,length(cur)) as idx2,
arrayEnumerate(idx2) as idxx,
flatten(arrayMap((x,y)->arrayResize([concat(toString(x),toString(uid)) ], x-idx2[y-1], concat(toString(x),toString(uid)) ), idx2, idxx ) ) as tkarray ,
arrayFilter((x,y)-> y='https://ark.analysys.cn/browseGoods' , tkarray, cur.1) as psarray ,
arrayFilter((x,y)-> y<>'https://ark.analysys.cn/browseGoods' , tkarray, cur.1) as psother,
arrayIntersect(psarray,psother) intersect
from bw.session2 group by day, uid ) group by day
- 另一种实现 性能一般般
# 结果与上面一致
select day , sumArray(tp.3) pscn, sumArray(tp.2) /sum(length(tp)) as trate from (
select day,
arraySort(x->x.2, groupArray( (url,ts )))as cur,
arrayFilter( (x,y )-> y>30*60*1000 , range( length(cur)) , arrayDifference(cur.2) ) as idx ,
arrayPushBack(idx ,length(cur)) as idx2,
arrayEnumerate(idx2) as idxx,
arrayMap((x,y)-> (arraySlice(cur.1, idx2[y-1]+1, x-idx2[y-1]) as session, if(hasAny(session,['https://ark.analysys.cn/browseGoods']) and arrayUniq(session)=1,1,0) as depth, hasAny(session,['https://ark.analysys.cn/browseGoods']) as pscn), idx2, idxx) tp
from bw.session2 group by day, uid ) group by day
2、根据动态session计算每日着陆页的跳出率
跳出率=访问了一个页面的Session数/总的Session数
- 第一版
select day ,sumArray( sessions.1 )/sumArray( sessions.2 ) from (
select day, arrayMap(( x,y,z) -> (if(dateDiff( 'minute', y.2 ,z.2)>30 and endsWith(y.1,'index'),1,0) ,if(dateDiff( 'minute', y.2 ,z.2)>30,1,0)) ,
arrayMap((x,y)->(x,y),groupArray( url ), groupArray( ts2))as data,
arraySort(x->x.2, data) as t1,
arrayPushBack(arrayPopFront(t1) ,('',addYears(now(),1000)))
) sessions
from bw.session2 group by day, uid ) group by day
3、 使用超时时间30分钟+跨天+指定开始事件,的session切割规则计算出20190501-20190510,每天包含某个页面行为的会话总数,人均访问深度。
- 第一版
--每天包含某个页面行为的会话总数
select day , sum(arrayUniq(cn)) from (
select day,
arraySort(x->x.2,arrayMap((x,y,z)->(x,y,z),groupArray( url ), groupArray( ts2 ), groupArray( event_code )))as cur,
arrayPushBack(arrayPopFront(cur) ,('', addYears(now(),1000) ,'')) as next,
arrayEnumerate(cur) as inx,
arrayFilter( (x,y,z)-> dateDiff( 'minute', y.2 ,z.2)>30 or z.3='$start_event' , inx, cur, next) as idx,
arrayEnumerate(idx) as idxx,
flatten(arrayMap((x,y)->arrayWithConstant(x-idx[y-1],x), idx, idxx) ) as tkarray ,
arrayResize( tkarray, length( cur ), length( cur )) as narray,
arrayFilter((x,y)-> y.1='https://ark.analysys.cn/browseGoods' , narray, cur) cn
from bw.session2 group by day, uid ) group by day
--人均访问深度
select day , sum(arrayUniq(cn)) pcn,sum(length(idx)) scn from (
select day,
arraySort(x->x.2,arrayMap((x,y,z)->(x,y,z),groupArray( url ), groupArray( ts2 ), groupArray( event_code )))as cur,
arrayPushBack(arrayPopFront(cur) ,('', addYears(now(),1000) ,'')) as next,
arrayEnumerate(cur) as inx,
arrayFilter( (x,y,z)-> dateDiff( 'minute', y.2 ,z.2)>30 or z.3='$start_event' , inx, cur, next) as idx,
arrayEnumerate(idx) as idxx,
flatten(arrayMap((x,y)->arrayWithConstant(x-idx[y-1],x), idx, idxx) ) as tkarray ,
arrayResize( tkarray, length( cur ), length( cur )) as narray,
arrayMap((x,y)-> concat(y.1,'$$',toString( x)) , narray, cur) cn
from bw.session2 group by day, uid ) group by day
- 第二版(整合版)
-- 第三题 简版
select day , sum(arrayUniq(psarray)) pscn, sum(arrayUniq(deptharray)) /sum(length(idxx)) as avgdepth from (
select day,
arraySort(x->x.2, groupArray( (url,ts,event_code )))as cur,
arrayFilter( (x,y,z)-> y>30*60*1000 or z='$start_event' , range( length(cur)) , arrayDifference(cur.2) ,cur.3) as idx ,
arrayPushBack(idx ,length(cur)) as idx2,
arrayEnumerate(idx2) as idxx,
flatten(arrayMap((x,y)->arrayWithConstant(x-arrayElement(idx2, y-1), concat(toString(x),toString(uid)) ), idx2, idxx ) ) as tkarray ,
arrayFilter((x,y)-> y='https://ark.analysys.cn/browseGoods' , tkarray, cur.1) as psarray,
arrayMap((x,y)-> concat(y,'$$', x ) , tkarray, cur.1) as deptharray
from bw.session2 group by day, uid ) group by day
- 第三版
# 用 arrayResize 函数替代 arrayWithConstant 性能提升2s
select day , sum(arrayUniq(psarray)) pscn, sum(arrayUniq(deptharray)) /sum(length(idxx)) as avgdepth from (
select day,
arraySort(x->x.2, groupArray( (url,ts,event_code )))as cur,
arrayFilter( (x,y,z)-> y>30*60*1000 or z='$start_event' , range( length(cur)) , arrayDifference(cur.2) ,cur.3) as idx ,
arrayPushBack(idx ,length(cur)) as idx2,
arrayEnumerate(idx2) as idxx,
flatten(arrayMap((x,y)->arrayResize([concat(toString(x),toString(uid)) ], x-idx2[y-1], concat(toString(x),toString(uid)) ), idx2, idxx ) ) as tkarray ,
arrayFilter((x,y)-> y='https://ark.analysys.cn/browseGoods' , tkarray, cur.1) as psarray,
arrayMap((x,y)-> concat(y, x ) , tkarray, cur.1) as deptharray
from bw.session2 group by day, uid ) group by day
- 另一种解题思路
select day , sum(pacn) pscn, sum(cl) /sum(asm) as avgdepth from (
select day,
arraySort(x->x.2, groupArray( (url,ts )))as cur,
-- arrayMap( (y,z)-> if(y>30*60*1000 or z='https://ark.analysys.cn/startUp' ,1,0) , arrayDifference(cur.2) ,cur.1) as idx ,
arrayMap((y,z)-> if(y>30*60*1000 or z='https://ark.analysys.cn/startUp' ,1,0), arrayDifference(cur.2) ,cur.1) amap,
arrayCumSum(amap) as rsid,
arraySum(amap )+1 asm,
arrayFilter((x,y)-> y='https://ark.analysys.cn/browseGoods' , rsid, cur.1) as psarray,
arrayUniq( psarray ) as pacn,length(cur) as cl
from bw.session3 group by day, uid ) group by day
建表语句
CREATE TABLE bw.session2 (`uid` Int64, `ts` UInt64, `event_code` String, `sid` String, `url` String, `platform` String, `source` String, `city` String, `brand` String, `buy_count` Int32, `price` Float64, `day` LowCardinality(String), `ts2` DateTime, `uid2` LowCardinality(String)) ENGINE = MergeTree PARTITION BY tuple() ORDER BY (day, uid2, ts2) SETTINGS index_granularity = 8192