
Obtaining Hive Table Lineage

2020-06-19  嘻嘻是小猪

Getting table lineage out of Hive is fairly straightforward, and the available approaches are mature.

org.apache.hadoop.hive.ql.tools.LineageInfo: table-level lineage

Using LineageInfo to analyze table-level lineage in HiveQL

This class has its own main method, so it can be run standalone with the SQL statement passed in as an argument:

public static void main(String[] args) throws IOException, ParseException,
      SemanticException {

    String query = args[0];

    LineageInfo lep = new LineageInfo();

    lep.getLineageInfo(query);

    for (String tab : lep.getInputTableList()) {
      System.out.println("InputTable=" + tab);
    }

    for (String tab : lep.getOutputTableList()) {
      System.out.println("OutputTable=" + tab);
    }
  }

The Hive source already provides a good example, so it can be used directly.
Note, however, that the source does not appear to support statements such as CREATE TABLE AS SELECT or LOAD;
you can add your own logic based on the code in org.apache.hadoop.hive.ql.tools.LineageInfo#process, as sketched below.
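
As a rough, standalone sketch of that idea (my own code, not the Hive source): parse the statement with ParseDriver and walk the AST yourself, additionally treating the table named under TOK_CREATETABLE as an output. The class name SimpleTableLineage is made up here, and the parser token constants should be verified against your Hive version (this assumes hive-exec 2.3.x):

import java.util.TreeSet;

import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.HiveParser;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

public class SimpleTableLineage {

    private final TreeSet<String> inputs = new TreeSet<String>();
    private final TreeSet<String> outputs = new TreeSet<String>();

    public void analyze(String query) throws Exception {
        ASTNode tree = new ParseDriver().parse(query);
        // the parser may return a nil root node; descend to the first real token
        while (tree.getToken() == null && tree.getChildCount() > 0) {
            tree = (ASTNode) tree.getChild(0);
        }
        walk(tree);
    }

    private void walk(ASTNode node) {
        if (node.getToken() != null) {
            switch (node.getToken().getType()) {
            case HiveParser.TOK_TAB:         // INSERT target (a LOAD ... INTO TABLE target should also arrive as TOK_TAB)
            case HiveParser.TOK_CREATETABLE: // CTAS target: the first child is the TOK_TABNAME being created
                outputs.add(BaseSemanticAnalyzer.getUnescapedName((ASTNode) node.getChild(0)));
                break;
            case HiveParser.TOK_TABREF:      // tables referenced in FROM clauses
                inputs.add(BaseSemanticAnalyzer.getUnescapedName((ASTNode) node.getChild(0)));
                break;
            default:
                break;
            }
        }
        if (node.getChildren() != null) {
            for (Node child : node.getChildren()) {
                walk((ASTNode) child);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        SimpleTableLineage lineage = new SimpleTableLineage();
        lineage.analyze(args[0]);
        for (String t : lineage.inputs) {
            System.out.println("InputTable=" + t);
        }
        for (String t : lineage.outputs) {
            System.out.println("OutputTable=" + t);
        }
    }
}

A plain CREATE TABLE with no SELECT part would also be reported as an output by this sketch; whether to filter that out is a separate decision.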

org.apache.hadoop.hive.ql.hooks.LineageLogger: column-level lineage

Using LineageLogger to analyze column-level lineage in HiveQL

This is a Hook provided by Hive, and it is also simple to use.
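
For reference, and assuming a standard Hive 2.x setup, it can be registered as a post-execution hook straight from the CLI:

set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.LineageLogger;

Each query then produces a JSON description of the column-level lineage (vertices and edges).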


I followed the second approach to do my own exploration at the table level,
making a few changes to LineageLogger's original idea along the way.

The code is as follows.

pom dependency:

<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>2.3.4</version>
</dependency>
import java.util.HashSet;

import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.hooks.Entity;
import org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext;
import org.apache.hadoop.hive.ql.hooks.HookContext;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.lineage.LineageCtx;
import org.apache.hadoop.hive.ql.plan.HiveOperation;
import org.apache.hadoop.hive.ql.session.SessionState;

// Hook that prints table-level INPUT=/OUTPUT= lineage for each executed statement
public class LineageHook implements ExecuteWithHookContext {

    private static final HashSet<String> OPERATION_NAMES = new HashSet<String>();
    private static final HashSet<String> INPUTS = new HashSet<String>();
    private static final HashSet<String> OUTPUTS = new HashSet<String>();

    static {
        OPERATION_NAMES.add(HiveOperation.QUERY.getOperationName());
        OPERATION_NAMES.add(HiveOperation.CREATETABLE_AS_SELECT.getOperationName());
        OPERATION_NAMES.add(HiveOperation.ALTERVIEW_AS.getOperationName());
        OPERATION_NAMES.add(HiveOperation.CREATEVIEW.getOperationName());
        OPERATION_NAMES.add(HiveOperation.LOAD.getOperationName()); // on top of the original set, also allow LOAD statements
    } 

    public void run(HookContext hookContext) throws Exception {

        INPUTS.clear();
        OUTPUTS.clear();

        QueryPlan plan = hookContext.getQueryPlan();
        LineageCtx.Index index = hookContext.getIndex();
        SessionState ss = SessionState.get();
        if (ss != null && index != null
                && OPERATION_NAMES.contains(plan.getOperationName())
                && !plan.isExplain()) {

            System.out.println(plan.getOperationName());

            // outputs
            for (WriteEntity output : plan.getOutputs()) {
                Entity.Type entityType = output.getType();
                if (entityType == Entity.Type.TABLE
                        || entityType == Entity.Type.PARTITION
                        || entityType == Entity.Type.LOCAL_DIR // also allow LOCAL_DIR
                        || entityType == Entity.Type.DFS_DIR // also allow DFS_DIR
                        ) {
                    Table trgTb = output.getTable();
                    String trgTbName = null;
                    if (trgTb!=null) {
                        trgTbName = trgTb.getDbName()+"."+trgTb.getTableName();
                    }else {
                        trgTbName = output.getD().toString();
                        //hdfs://master:8020/tmp/hive/admin/27808155-878a-4446-9c4e-a2f3388301fc/hive_2020-06-19_16-47-52_939_789950828629061887-1/-mr-10001
                        if (trgTbName.matches("hdfs://.+/tmp/hive/.+")) { // filter out paths for intermediate MR staging data
                            continue;
                        }
                    }
//                    System.out.println("target table "+trgTbName);
                    if (OUTPUTS.contains(trgTbName)) {
                        continue;
                    }else {
                        OUTPUTS.add(trgTbName);
                    }
                    break;
                }
            }

            if (OUTPUTS.size() == 0) { // if there is no output, skip the inputs; this effectively filters out simple queries that write nothing
                return;
            }

            // inputs
            for (ReadEntity input : plan.getInputs()) {
                Entity.Type entityType = input.getType();
                if (entityType == Entity.Type.TABLE
                        || entityType == Entity.Type.PARTITION
                        || entityType == Entity.Type.LOCAL_DIR
                        || entityType == Entity.Type.DFS_DIR
                        ) {
                    Table srcTb = input.getTable();

                    String srcTbName = null;
                    if (srcTb!=null) {
                        srcTbName = srcTb.getDbName()+"."+srcTb.getTableName();
                    }else {
                        srcTbName = input.getD().toString();
                        if (srcTbName.matches("hdfs://.+/tmp/hive/.+")) {
                            continue;
                        }
                    }
                    INPUTS.add(srcTbName);  // collect source names in a HashSet: a multi-partition input produces several ReadEntity objects that share the same table name
//                    System.out.println("src table "+srcTbName);
                }
            }

            for (String input : INPUTS) {
                System.out.println("INPUT="+input);
            }

            for (String output : OUTPUTS) {
                System.out.println("OUTPUT="+output);
            }
        }
    }
}

Trying it out

  1. Start the Hive CLI (with the compiled hook class on the Hive classpath)
  2. set hive.exec.pre.hooks=LineageHook
  3. Run a few typical statements:
insert overwrite table gdm.gdm_cus_tag_di partition (dt)
select tmp.user_id, tag_name, tmp.act_type, sum(tmp.cnt) as cnt, tmp.dt from 
(select a.user_id as user_id, b.tags as tags, 2 as act_type, a.match_cnt as cnt, a.dt as dt 
from fdm.fdm_cms_matches_da b, gdm.gdm_cus_match_di a where a.match_id = b.match_id and a.dt='2020-05-25'
union all
select a.user_id as user_id, b.tags as tags, 1 as act_type, a.game_cnt as cnt, a.dt as dt 
from fdm.fdm_cms_subgame_public_da b, gdm.gdm_cus_game_di a where a.game_id = b.game_id and a.dt='2020-05-25'
union all
select a.user_id as user_id, b.tags as tags, 3 as act_type, a.sign_cnt as cnt, a.dt as dt
from fdm.fdm_cms_matches_da b, gdm.gdm_cus_signup_di a where a.match_id = b.match_id and a.dt='2020-05-25'
union all
select a.user_id as user_id, b.tags as tags, 4 as act_type, a.cancel_cnt as cnt, a.dt as dt
from fdm.fdm_cms_matches_da b, gdm.gdm_cus_cl_signup_di a where a.match_id = b.match_id and a.dt='2020-05-25') tmp
lateral view explode(split(tmp.tags, ',')) tagtable as tag_name 
group by user_id, tag_name, act_type, dt;
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/hadoop/output' ROW FORMAT DELIMITED FIELDS TERMINATED by ',' select * from fdm.fdm_cms_matches_da; 
LOAD DATA INPATH '/user/hive/external/mongo_ipt/relation/follow_num/follow_num_sum.csv'
OVERWRITE INTO table bdm.bdm_relation_follow_num_sum_di partition(dt='2020-06-19');

For now only a few typical cases have been tested: a multi-source INSERT ... SELECT, an INSERT OVERWRITE LOCAL DIRECTORY, and a LOAD DATA into a partition.


Thanks to the authors of the articles I referenced.

Done!!!
