Sqoop抽取数据后核对数据准确性
Sqoop从MySQL抽取数据,因为数据有更新(新增和更新在Hive中各对应一条记录),所有Hive中的数据会比MySQL数据多,数据是按trans_date分区的,但是trans_date会变化,所以即使去重后Hive中的数据还是会比MySQL多,如何找出这些数据呢?
- 新建表order、order1、order2
create table order(
trans_id string,
trans_date timestamp,
create_time timestamp,
update_time timestamp
)row format delimited fields terminated by '\t';
- 抽取Hive中数据到order
insert into table order select trans_id,trans_date,create_time,update_time from ( select trans_id ,trans_date,create_time,update_time, row_number() over(distribute by trans_id sort by update_time) as rn from default.main_order where day = '2017-08-23') t where t.rn=1;
- 抽取MySQL的数据到order2
sqoop import --connect jdbc:mysql://host:3306/database?tinyInt1isBit=false --username username --password password --query 'SELECT trans_id,trans_date,create_time,update_time from today_main_order WHERE $CONDITIONS and substr(trans_date,1,10) = "2017-08-23" ' --split-by trans_id --hive-import -m 5 --delete-target-dir --hive-database wjk --hive-table order1 --target-dir /user/hadoop --fields-terminated-by '\t' --lines-terminated-by '\n' --hive-drop-import-delims --null-string '\\N' --null-non-string '\\N'
- 获取结果
insert into table order2 select a.trans_id,a.trans_date,a.create_time,a.update_time from order a left outer join (select * from order1) b on (a.trans_id = b.trans_id) where b.trans_id is null;