用python 实现 hive sql join和row_num
给定一个查询时间,找最近登录的用户
create temporary function row_number as "com.ai.hive.udf.util.RowNumberUDF";
select logint_time,username from
( select ip,login_time,username from (
select ip ,select_time from a join select ip,login_time,username from b on(a.ip=b.ip and a.login_time
)t sort by login_time desc )p where row_number=1
import pandas as pd
import numpy as np
login_column_names = ['ip','dip','type','uri','time']
select_column_names = ['ip','dip','action','time']
df_login = pd.read_csv('login.txt',sep='\t',encoding='utf-8',header=None,names=login_column_names)
df_select = pd.read_csv('select.txt',sep='\t',encoding='utf-8',header=None,names=select_column_names)
df_login[['username','password','authPassword','submit']]= df_login['uri'].str.replace('j_username=','').str.replace('password=','').str.split('&',expand=True)
df_login
x=[1,2,3,6,7,8]
df_login.drop(df_login.columns[x], axis=1, inplace=True)
df_login
import time
# time.strptime(df_login['time'],"%Y-%m-%d %H:%M:%S")
df_login['time']
df_login['new_time']=0
a = 0
for x in df_login['time']:
print(int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f"))))
df_login['new_time'][a] = int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f")))
a=a+1
df_login['new_time']
import time
# time.strptime(df_login['time'],"%Y-%m-%d %H:%M:%S")
df_select['time']
df_select['new_time']=0
a = 0
for x in df_select['time']:
print(int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f"))))
df_select['new_time'][a] = int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f")))
a=a+1
df_select
df_on = df_select.merge(df_login,how='left',on=['ip'])
df_on
df_on['diff_time'] = df_on['new_time_x']-df_on['new_time_y']
df_on
df_on[df_on['diff_time']>=0]
#找时间最小的那个
df_on = df_on[df_on['diff_time']>=0]
df_on
df_on[['ip','time_x','username','diff_time']]
def min_time(df,n=3,column='diff_time'):
return df.sort_index(by=column,ascending=False)[-n:]
df_on[['ip','time_x','username','diff_time']].groupby(['ip','time_x']).apply(min_time,n=1)