我爱编程

用python 实现 hive sql join和row_num

2018-04-13  本文已影响0人  思索人生_mind

给定一个查询时间,找最近登录的用户

create temporary function row_number as "com.ai.hive.udf.util.RowNumberUDF";

select logint_time,username from 

( select ip,login_time,username from (

select ip ,select_time from a join select ip,login_time,username from b on(a.ip=b.ip and a.login_time

)t sort by login_time desc )p where row_number=1

import pandas as pd

import numpy as np

login_column_names = ['ip','dip','type','uri','time']

select_column_names = ['ip','dip','action','time']

df_login = pd.read_csv('login.txt',sep='\t',encoding='utf-8',header=None,names=login_column_names)

df_select = pd.read_csv('select.txt',sep='\t',encoding='utf-8',header=None,names=select_column_names)

df_login[['username','password','authPassword','submit']]= df_login['uri'].str.replace('j_username=','').str.replace('password=','').str.split('&',expand=True)

df_login

x=[1,2,3,6,7,8]

df_login.drop(df_login.columns[x], axis=1, inplace=True)

df_login

import time

# time.strptime(df_login['time'],"%Y-%m-%d %H:%M:%S")

df_login['time']

df_login['new_time']=0

a = 0

for x in df_login['time']:

    print(int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f"))))

    df_login['new_time'][a] =  int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f")))

    a=a+1

df_login['new_time']

import time

# time.strptime(df_login['time'],"%Y-%m-%d %H:%M:%S")

df_select['time']

df_select['new_time']=0

a = 0

for x in df_select['time']:

    print(int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f"))))

    df_select['new_time'][a] =  int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f")))

    a=a+1

df_select

df_on = df_select.merge(df_login,how='left',on=['ip'])

df_on

df_on['diff_time'] = df_on['new_time_x']-df_on['new_time_y']

df_on

df_on[df_on['diff_time']>=0]

#找时间最小的那个

df_on = df_on[df_on['diff_time']>=0]

df_on

df_on[['ip','time_x','username','diff_time']]

def min_time(df,n=3,column='diff_time'):

    return df.sort_index(by=column,ascending=False)[-n:]

df_on[['ip','time_x','username','diff_time']].groupby(['ip','time_x']).apply(min_time,n=1)

上一篇 下一篇

猜你喜欢

热点阅读