HiveQL MR基本操作
2017-06-13 本文已影响79人
天堂宝宝_V
emplyee.txt 文件内容,中间为tab分割,utf8文件
1 张三 test 4000 500 2014-05-16 8 1
2 李四 coder 8000 1000 2015-08-12 3 2
3 王五 pm 4000 1500 2014-05-16 4 4
4 赵六 cto 30000 1000 2015-11-12 5 2
5 田七 ceo 40000 2014-05-16 4
6 蔡八 cfo 35000 1000 2015-08-12 5 4
7 孙久 pe 4500 500 2013-05-16 4 4
8 张魁 pm 10000 1000 2015-08-12 4 4
9 小梦 test 3000 500 2014-05-16 8 1
10 小黄 coder 9000 1000 2015-10-12 3 2
11 小明 coder 12000 500 2017-05-16 3 2
12 小红 coder 15000 1000 2016-04-12 3 2
13 小花 test 1000 500 2014-05-16 8 1
14 小兰 coder 8000 1000 2015-03-12 3 2
15 小宋 hr 17500 2500 1999-08-12 3 3
16 小江 hr 7500 1000 2009-08-12 3 3
17 marvis reps 9500 2000 2005-08-12 0 13
18 liming pa 2500 0 11
19 xiaosun pa 7500 1000 2009-10-12 0 12
20 zhangmi se 2000 2005-12-12 0 10
department.txt,中间为tab分割,utf8文件
1 测试部 郑州经开区
2 开发部 郑州经开区
3 人事部 深圳南山区
4 总经办 郑州金水区
5 财务部 深圳南山区
6 销售部 深圳南山区
7 aftersail beijing
8 beforesail beijing
HiveQL 创建表删除表,导入数据,查看数据 ,
注意导入的文件需要使用utf8编码
创建员工表
drop table if exists emplyee;
create table if not exists emplyee(
emp_code bigint
,emp_name string
,status string
,salary decimal(20,2)
,status_salary decimal(20,2)
,work_begin_date string
,parent_code bigint
,belong_dep_code bigint
)
row format delimited fields terminated by '\t'
STORED AS textfile;
describe emplyee;
-- 创建部门表
drop table if exists department;
create table if not exists department(
dep_code bigint
,dep_name string
,dep_address string
)
row format delimited fields terminated by '\t'
stored as textfile;
describe department;
show tables;
load data local inpath '/root/Desktop/emplyee.txt' overwrite into table emplyee;
load data local inpath '/root/Desktop/department.txt' overwrite into table department;
select * from emplyee;
select * from department;
select to_date(work_begin_date) from emplyee;
-- insert into 语法的使用
drop table if exists emplyee5000;
create table emplyee5000 like emplyee;
insert into emplyee5000
select * from emplyee where salary>5000;
select * from emplyee5000;
show functions;
gropu by
-- 统计 status_salary 不为null的有多少个。注意cout 不统计 null的特点
select count(status_salary) from emplyee;
-- 统计有多少个人职位为coder
select count(1) from emplyee where status='coder';
-- 计算员工总薪水,平均薪水,最高工资,最低工资。totalSalary 不能直接使用
select count(1) as emp_num
,sum(salary) as totalSalary
,sum(salary)/count(1) avg
,max(salary) max
,min(salary) min
from emplyee;
-- 计算部门的最高薪水,最低薪水,平均薪水,总薪水,总人数;
-- 写法特点是先写group by对应的名称,然后直接跟在select 后面首位,后面跟的必须都是聚合函数
select belong_dep_code
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where belong_dep_code is not null
group by belong_dep_code
-- 计算不显示部门code要显示部门名称
select a.dep_name,b.emp_count,b.max,b.min,b.avg,b.sum from department a,
(select belong_dep_code code
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where belong_dep_code is not null
group by belong_dep_code) b
where a.dep_code = b.code
-- 计算岗位的最高薪水,最低薪水,平均薪水,总薪水,总人数;
select status
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where status is not null
group by status
select * from emplyee where belong_dep_code in(
select dep_code from department where dep_address like '郑州%');
-- 计算岗位的最高薪水,最低薪水,平均薪水,且平均薪水大于7500;
-- 第一种写法
select * from(
select status
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where status is not null
group by status
)a
where a.avg > 7500
-- 第二种写法
select status
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where status is not null
group by status
having avg(salary) > 7500
select count(1) from emplyee;
-- 查询出最高薪水的人的信息
select *
from emplyee
where salary in (select max(salary) from emplyee)
-- 计算平均薪水放入avg_salary 表(拆分法)
create table avg_salary as
select avg(salary) as salary from emplyee;
select * from avg_salary;
-- 把 emplyee 和avg_salary 笛卡尔乘积
create table emplyee_with_avgsalary as
select a.* ,b.salary as avg_salary
from emplyee a,avg_salary b;
select * from emplyee_with_avgsalary
where salary > avg_salary;
-- 汇总
select a.*,b.*
from emplyee a,
(select avg(salary) as avg_salary from emplyee)b
where a.salary>b.avg_salary
-- 查询总经办下面的岗位有哪些 in写法
select distinct status from emplyee
where belong_dep_code in (
select dep_code from department where dep_name='总经办'
);
-- 查询总经办下面的岗位有哪些 exists写法
select distinct status from emplyee a
where exists (
select 1 from department b where a.belong_dep_code = b.dep_code and dep_name='总经办'
);
select * from department;
-- 薪水大于8000或者小于2000或者等于5000的员工 union写法
explain -- 查询执行流程
select * from emplyee where salary > 8000
union all
select * from emplyee where salary < 2000
union all
select * from emplyee where salary = 5000
explain
select * from emplyee where salary > 8000 or salary < 2000 or salary = 5000
hive join
describe emplyee;
describe department;
select * from emplyee;
select * from department;
load data local inpath '/root/Desktop/emplyee.txt' overwrite into table emplyee;
load data local inpath '/root/Desktop/department.txt' overwrite into table department;
-- 列出员工的姓名和所在的部门的名称和地址 内连接
select a.emp_code,a.emp_name,b.dep_name,b.dep_address
from emplyee a
join department b
on a.belong_dep_code = b.dep_code
-- 左联接
select a.emp_code, a.emp_name,b.dep_name,b.dep_address
from emplyee a
left join department b
on a.belong_dep_code = b.dep_code
-- 列出所有员工的姓名和他上司的姓名
select a.emp_name,b.emp_name from emplyee a
join emplyee b
on a.parent_code=b.emp_code
-- 列出员工的姓名和他所在的部门,把没有员工的部门也列举出来
-- left join后面的where 必须是主表的查询条件。副表的条件可以跟到附表连接on的后面,
-- 否则加到where上则该左联接失效会变成内连接
select b.emp_name,a.dep_name from department a
left join emplyee b
on a.dep_code = b.belong_dep_code
-- 哪些部门没有员工
select b.emp_name,a.dep_name from department a
left outer join emplyee b
on a.dep_code = b.belong_dep_code
where b.belong_dep_code is null
-- 哪些部门没有员工
select distinct * from department a
where not exists (
select 1 from emplyee b where b.belong_dep_code = a.dep_code
);
-- 查询所有部门的部门编号,部门名称,部门地址,员工人数,月薪水支出,人均薪水
explain
select c.dep_code
,c.dep_name
,c.dep_address
,b.cnt
,b.total
,b.average
from department c
left join (select a.belong_dep_code as dep_code
,count (*) cnt
,sum(salary) total
,avg(salary) average
from emplyee a
group by a.belong_dep_code)b
on c.dep_code=b.dep_code;
explain
select a.dep_code
,a.dep_name
,a.dep_address
,count(b.emp_code)
,sum(b.salary)
,avg(b.salary)
from department a
left join emplyee b
on a.dep_code=b.belong_dep_code
group by a.dep_code
,a.dep_name
,a.dep_address
-- 查询所有员工的员工编号,姓名,薪水,部门名称,上司姓名
explain
select c.code
,c.name
,c.salary
,d.dep_name
,c.super
from department d,
(select a.emp_code as code
,a.emp_name as name
,a.salary as salary
,b.emp_name as super
,a.belong_dep_code as dep_code
from emplyee a
join emplyee b
on a.parent_code=b.emp_code)c
where d.dep_code=c.dep_code
order by c.code
-- 查询所有员工的员工编号,姓名,薪水,部门名称,上司姓名
explain
select a.emp_code
,a.emp_name
,a.salary
,c.dep_name
,b.emp_name
from emplyee a
left join emplyee b
on a.parent_code=b.emp_code
left join department c
on a.belong_dep_code=c.dep_code
order by a.emp_code
-- 每个部门最高薪水的人
select a.*
,c.dep_name
from emplyee a
join (select belong_dep_code as dep_code
,max(salary) max_salary
from emplyee
group by belong_dep_code)b
on a.belong_dep_code = b.dep_code and a.salary=b.max_salary
inner join department c
on c.dep_code=b.dep_code
hive 分组
-- 统计 status_salary 不为null的有多少个。注意cout 不统计 null的特点
select count(status_salary) from emplyee;
-- 统计有多少个人职位为coder
select count(1) from emplyee where status='coder';
-- 计算员工总薪水,平均薪水,最高工资,最低工资。totalSalary 不能直接使用
select count(1) as emp_num
,sum(salary) as totalSalary
,sum(salary)/count(1) avg
,max(salary) max
,min(salary) min
from emplyee;
-- 计算部门的最高薪水,最低薪水,平均薪水,总薪水,总人数;
-- 写法特点是先写group by对应的名称,然后直接跟在select 后面首位,后面跟的必须都是聚合函数
select belong_dep_code
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where belong_dep_code is not null
group by belong_dep_code
-- 计算不显示部门code要显示部门名称
select a.dep_name,b.emp_count,b.max,b.min,b.avg,b.sum from department a,
(select belong_dep_code code
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where belong_dep_code is not null
group by belong_dep_code) b
where a.dep_code = b.code
-- 计算岗位的最高薪水,最低薪水,平均薪水,总薪水,总人数;
select status
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where status is not null
group by status
select * from emplyee where belong_dep_code in(
select dep_code from department where dep_address like '郑州%');
-- 计算岗位的最高薪水,最低薪水,平均薪水,且平均薪水大于7500;
-- 第一种写法
select * from(
select status
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where status is not null
group by status
)a
where a.avg > 7500
-- 第二种写法
select status
,max(salary) max
,min(salary) min
,avg(salary) avg
,sum(salary) sum
,count(1) emp_count
from emplyee
where status is not null
group by status
having avg(salary) > 7500
select count(1) from emplyee;
-- 查询出最高薪水的人的信息
select *
from emplyee
where salary in (select max(salary) from emplyee)
-- 计算平均薪水放入avg_salary 表(拆分法)
create table avg_salary as
select avg(salary) as salary from emplyee;
select * from avg_salary;
-- 把 emplyee 和avg_salary 笛卡尔乘积
create table emplyee_with_avgsalary as
select a.* ,b.salary as avg_salary
from emplyee a,avg_salary b;
select * from emplyee_with_avgsalary
where salary > avg_salary;
-- 汇总
select a.*,b.*
from emplyee a,
(select avg(salary) as avg_salary from emplyee)b
where a.salary>b.avg_salary
-- 查询总经办下面的岗位有哪些 in写法
select distinct status from emplyee
where belong_dep_code in (
select dep_code from department where dep_name='总经办'
);
-- 查询总经办下面的岗位有哪些 exists写法
select distinct status from emplyee a
where exists (
select 1 from department b where a.belong_dep_code = b.dep_code and dep_name='总经办'
);
select * from department;
-- 薪水大于8000或者小于2000或者等于5000的员工 union写法
explain -- 查询执行流程
select * from emplyee where salary > 8000
union all
select * from emplyee where salary < 2000
union all
select * from emplyee where salary = 5000
explain
select * from emplyee where salary > 8000 or salary < 2000 or salary = 5000
hive 分区 桶
-- 在Hive Select查询中一般会扫描整个表内容,会消耗很多时间做没必要的工作。
-- 有时候只需要扫描表中关心的一部分数据,因此建表时引入了partition概念。
-- 分区表指的是在创建表时指定的partition的分区空间。?
create table if not exists order_yyyymmdd(
order_id bigint
,order_time string
,order_money decimal(20,2)
,user_id bigint
,address_id bigint
,order_type int
)
partitioned by(order_date string)
row format delimited FIELDS TERMINATED BY ' ';
alter table order_yyyymmdd
add partition(order_date= '20170613');
alter table order_yyyymmdd
add partition(order_date ='20170612');
alter table order_yyyymmdd
add partition(order_date ='20170610')
partition(order_date ='20170609')
show partitions order_yyyymmdd;
alter table order_yyyymmdd drop partition(order_date='20170609');
describe order_yyyymmdd;
-- load 数据的时候必须该数据符合分区的条分区条件自身本不会对数据的合法性进行校验。需要进行抽取清洗
load data local inpath '/root/Desktop/order.txt' overwrite into table order_yyyymmdd partition(order_date= '20170614')
select * from order_yyyymmdd where order_date='20170614';
-- 创建临时分区表,存储临时数据,用于对数据进行筛选
create table if not exists order_yyyymmdd_tmp
like order_yyyymmdd;
load data local inpath '/root/Desktop/order.txt' overwrite into table order_yyyymmdd_tmp partition(order_date='20170613');
show partitions order_yyyymmdd_tmp;
select * from order_yyyymmdd_tmp;
-- 提前删除一次分区,支持重跑
alter table order_yyyymmdd drop partition(order_date='20170613');
insert into order_yyyymmdd partition(order_date='20170613')
select order_id
,order_time
,order_money
,user_id
,address_id
,order_type
from order_yyyymmdd_tmp
where order_date='20170613'
and order_time like '2017-06-13_%';
select * from order_yyyymmdd where order_date='20170613';
alter table order_yyyymmdd_tmp drop partition(order_date='20170613');
-- 创建分区加桶表
drop table if exists order_yyyymmdd_b;
create table order_yyyymmdd_b(
order_id bigint
,order_time string
,order_money decimal(20,2)
,user_id bigint
,address_id bigint
,order_type int
)
PARTITIONED BY(order_date string)
CLUSTERED BY (user_id) sorted by(user_id) into 2 BUCKETS
row format delimited FIELDS TERMINATED BY ' ';
describe order_yyyymmdd_b;
-- 新版本已经被废弃
set hive.enforce.bucketing = true;
alter table order_yyyymmdd_b drop partition(order_date='20170613');
insert into order_yyyymmdd_b partition(order_date='20170613')
select order_id
,order_time
,order_money
,user_id
,address_id
,order_type
from order_yyyymmdd
where order_date='20170613';
select * from order_yyyymmdd_b
where order_date='20170613';
hive Order By、Sort By、Cluster By、Distribute By 与函数
-- order只能默认进行全排序,不能对多个节点,不符合语法规则
select *
from employee
order by salary desc
,status_salary desc;
-- 设置reducer个数
set mapred.reduce.tasks=2;
-- 显示最大的reducer数量
show conf 'hive.exec.reducers.max'
-- sort 是指单个reducer节点排序
drop table if exists sort_by;
create table sort_by
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
as select *
from employee
sort by salary desc;
select * from sort_by;
-- 根据status进行分区,同样的status会被分配到同一个区域,reduce个数大于1
drop table if exists distribute_test;
create table distribute_test
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
as select *
from employee
distribute by status;
-- 按照职员的职位,将其工资由高到底排序
drop table if exists dstest;
create table dstest
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
as select *
from employee
distribute by status -- 首先根据status将数据分配到不同的reduce节点,
sort by status,salary; -- 然后在根据status,salary对各个节点的数据进行排序,所有节点的数据组合起来不是全排序
--cluster by 就是distribute by 与sort by的结合,但是仅限字段一致的情况
select *
from employee
cluster by status;
show functions;
--描述if方法的特点
describe function if;
describe function EXTENDED if;
-- map类型
select map(emp_name,status)
from employee;
select am['李四']
from
(select map(emp_name,status)am
from employee)b;
--复杂类型数组
select aa[0]
,am[1]
from(
select array(emp_name,status)aa
from employee)b
-- 复杂类型struct
select a.persion.code
,a.persion.name
from (select named_struct('code',emp_code,'name',emp_name) persion
from employee) a;
select emp_name
,if(status_salary is null,'普通岗位','带薪岗位')
from employee
select * from employee;
-- 2.2.0版本才开始支持,目前版本暂时不支持
SELECT emp_name||status
FROM employee;
-- 类型转换
select cast (work_begin_date as date) from employee;
-- 二进制转换
select bin(10);
-- 日期函数
select unix_timestamp();
select CURRENT_TIMESTAMP();
select unix_timestamp('2017-06-13', 'yyyy-mm-dd');
select from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss:SSS');
select date_format('2015-12-12 12:33:15', 'yyyy-MM-dd HH:mm:ss');
--Returns default value if value is null else returns value (as of HIve 0.11).
select emp_name
,nvl(status_salary,0)
from employee;
select emp_name
,COALESCE(status_salary,0)
from employee;
hive Import/Export
export table emplyee to '/user/root/hiveexport/employee';
import table employee from '/user/root/hiveexport/employee';
select * from employee;
export table order_yyyymmdd partition(order_date='20170613') to '/user/root/hiveexport/order_yyyymmdd';
alter table order_yyyymmdd drop partition(order_date='20170613');
import table order_yyyymmdd partition(order_date='20170613') from '/user/root/hiveexport/order_yyyymmdd';
select * from order_yyyymmdd where order_date='20170613';
相关链接