HiveQL MR基本操作

2017-06-13  本文已影响79人  天堂宝宝_V

emplyee.txt 文件内容,中间为tab分割,utf8文件

1       张三    test    4000    500     2014-05-16      8       1
2       李四    coder   8000    1000    2015-08-12      3       2
3       王五    pm      4000    1500    2014-05-16      4       4
4       赵六    cto     30000   1000    2015-11-12      5       2
5       田七    ceo     40000           2014-05-16              4
6       蔡八    cfo     35000   1000    2015-08-12      5       4
7       孙久    pe      4500    500     2013-05-16      4       4
8       张魁    pm      10000   1000    2015-08-12      4       4
9       小梦    test    3000    500     2014-05-16      8       1
10      小黄    coder   9000    1000    2015-10-12      3       2
11      小明    coder   12000   500     2017-05-16      3       2
12      小红    coder   15000   1000    2016-04-12      3       2
13      小花    test    1000    500     2014-05-16      8       1
14      小兰    coder   8000    1000    2015-03-12      3       2
15      小宋    hr      17500   2500    1999-08-12      3       3
16      小江    hr      7500    1000    2009-08-12      3       3
17      marvis  reps    9500    2000    2005-08-12      0       13
18      liming  pa              2500            0       11
19      xiaosun pa      7500    1000    2009-10-12      0       12
20      zhangmi se              2000    2005-12-12      0       10

department.txt,中间为tab分割,utf8文件

1       测试部  郑州经开区
2       开发部  郑州经开区
3       人事部  深圳南山区
4       总经办  郑州金水区
5       财务部  深圳南山区
6       销售部  深圳南山区
7       aftersail       beijing
8       beforesail      beijing

HiveQL 创建表删除表,导入数据,查看数据 ,注意导入的文件需要使用utf8编码

创建员工表
drop table if exists emplyee;
create table if not exists emplyee(
    emp_code    bigint
    ,emp_name   string
    ,status string
    ,salary decimal(20,2)
    ,status_salary  decimal(20,2)
    ,work_begin_date string
    ,parent_code    bigint
    ,belong_dep_code    bigint
)
row format delimited fields terminated by '\t'
STORED AS textfile;

describe emplyee;


-- 创建部门表
drop table if exists department;
create table if not exists department(
    dep_code bigint
    ,dep_name string
    ,dep_address string
)
row format delimited fields terminated by '\t'
stored as textfile;

describe department;

show tables;

load data local inpath '/root/Desktop/emplyee.txt' overwrite into table emplyee;

load data local inpath '/root/Desktop/department.txt' overwrite into table department;

select * from emplyee;

select * from department;

select to_date(work_begin_date) from emplyee;

-- insert into 语法的使用
drop table if exists emplyee5000;
create table emplyee5000 like emplyee;
insert into emplyee5000 
select * from emplyee where salary>5000;

select * from emplyee5000;

show functions;

gropu by

-- 统计 status_salary 不为null的有多少个。注意cout 不统计 null的特点
select count(status_salary) from emplyee;

-- 统计有多少个人职位为coder
select count(1) from emplyee where status='coder';


-- 计算员工总薪水,平均薪水,最高工资,最低工资。totalSalary 不能直接使用
select count(1) as emp_num
    ,sum(salary) as totalSalary
    ,sum(salary)/count(1) avg
    ,max(salary) max
    ,min(salary) min
from emplyee;

-- 计算部门的最高薪水,最低薪水,平均薪水,总薪水,总人数;
-- 写法特点是先写group by对应的名称,然后直接跟在select 后面首位,后面跟的必须都是聚合函数
select belong_dep_code
    ,max(salary) max
    ,min(salary) min
    ,avg(salary) avg
    ,sum(salary) sum
    ,count(1) emp_count
from emplyee
where belong_dep_code is not null
group by belong_dep_code

-- 计算不显示部门code要显示部门名称
select a.dep_name,b.emp_count,b.max,b.min,b.avg,b.sum from department a,
(select belong_dep_code code
    ,max(salary) max
    ,min(salary) min
    ,avg(salary) avg
    ,sum(salary) sum
    ,count(1) emp_count
from emplyee
where belong_dep_code is not null
group by belong_dep_code) b
where a.dep_code = b.code

-- 计算岗位的最高薪水,最低薪水,平均薪水,总薪水,总人数;
select status
    ,max(salary) max
    ,min(salary) min
    ,avg(salary) avg
    ,sum(salary) sum
    ,count(1) emp_count
from emplyee
where status is not null
group by status

select * from emplyee where belong_dep_code in(
select dep_code from department  where dep_address like '郑州%');

-- 计算岗位的最高薪水,最低薪水,平均薪水,且平均薪水大于7500;
-- 第一种写法
select * from(
    select status
        ,max(salary) max
        ,min(salary) min
        ,avg(salary) avg
        ,sum(salary) sum
        ,count(1) emp_count
    from emplyee
    where status is not null
    group by status
)a 
where a.avg > 7500

-- 第二种写法
select status
        ,max(salary) max
        ,min(salary) min
        ,avg(salary) avg
        ,sum(salary) sum
        ,count(1) emp_count
    from emplyee
    where status is not null
    group by status
    having avg(salary) > 7500

select count(1) from emplyee;

-- 查询出最高薪水的人的信息
select * 
from emplyee 
where salary in (select max(salary) from emplyee)


-- 计算平均薪水放入avg_salary 表(拆分法)
create table avg_salary as
select avg(salary) as salary from emplyee;

select * from avg_salary;

-- 把 emplyee 和avg_salary 笛卡尔乘积
create table emplyee_with_avgsalary as 
select a.* ,b.salary as avg_salary
from emplyee a,avg_salary b;

select * from emplyee_with_avgsalary
where salary > avg_salary;

-- 汇总
select a.*,b.*
from emplyee a,
    (select avg(salary) as avg_salary from emplyee)b
where a.salary>b.avg_salary

-- 查询总经办下面的岗位有哪些 in写法
select distinct status from emplyee
where belong_dep_code in (
    select dep_code from department where dep_name='总经办'
    );
-- 查询总经办下面的岗位有哪些 exists写法
select distinct status from emplyee a
where exists (
    select 1 from department b where a.belong_dep_code = b.dep_code and dep_name='总经办'
    );

select * from department;

-- 薪水大于8000或者小于2000或者等于5000的员工 union写法
explain -- 查询执行流程
select * from emplyee where salary > 8000
union all
select * from emplyee where salary < 2000
union all
select * from emplyee where salary = 5000

explain
select  * from emplyee where salary > 8000 or salary < 2000 or salary = 5000

hive join

describe emplyee;
describe department;

select * from emplyee;

select * from department;

load data local inpath '/root/Desktop/emplyee.txt' overwrite into table emplyee;

load data local inpath '/root/Desktop/department.txt' overwrite into table department;
-- 列出员工的姓名和所在的部门的名称和地址 内连接
select a.emp_code,a.emp_name,b.dep_name,b.dep_address
from emplyee a
join department b
on a.belong_dep_code = b.dep_code

-- 左联接
select a.emp_code, a.emp_name,b.dep_name,b.dep_address
from emplyee a
left join department b
on a.belong_dep_code = b.dep_code

-- 列出所有员工的姓名和他上司的姓名
select a.emp_name,b.emp_name from emplyee a
join emplyee b
on a.parent_code=b.emp_code

-- 列出员工的姓名和他所在的部门,把没有员工的部门也列举出来 
-- left join后面的where 必须是主表的查询条件。副表的条件可以跟到附表连接on的后面,
-- 否则加到where上则该左联接失效会变成内连接
select b.emp_name,a.dep_name from department a
left join emplyee b
on a.dep_code = b.belong_dep_code

-- 哪些部门没有员工 
select b.emp_name,a.dep_name from department a
left outer join emplyee b
on a.dep_code = b.belong_dep_code 
where b.belong_dep_code is null

-- 哪些部门没有员工
select distinct * from department a
where not exists (
    select 1 from emplyee b where b.belong_dep_code = a.dep_code
    );

-- 查询所有部门的部门编号,部门名称,部门地址,员工人数,月薪水支出,人均薪水
explain
select c.dep_code
    ,c.dep_name
    ,c.dep_address
    ,b.cnt
    ,b.total
    ,b.average
from department c 
left join   (select a.belong_dep_code as dep_code
        ,count (*) cnt
        ,sum(salary) total
        ,avg(salary) average
    from emplyee a
    group by a.belong_dep_code)b
on c.dep_code=b.dep_code;

explain
select a.dep_code
        ,a.dep_name
        ,a.dep_address
        ,count(b.emp_code)
        ,sum(b.salary)
        ,avg(b.salary)
from department a
left join   emplyee b
on a.dep_code=b.belong_dep_code
group by a.dep_code
        ,a.dep_name
        ,a.dep_address


-- 查询所有员工的员工编号,姓名,薪水,部门名称,上司姓名
explain
select c.code 
    ,c.name
    ,c.salary
    ,d.dep_name
    ,c.super
from department d,
    (select a.emp_code as code
        ,a.emp_name as name
        ,a.salary as salary
        ,b.emp_name  as super
        ,a.belong_dep_code as dep_code
    from emplyee a
    join emplyee b
    on a.parent_code=b.emp_code)c
where d.dep_code=c.dep_code
order by c.code

-- 查询所有员工的员工编号,姓名,薪水,部门名称,上司姓名
explain
select a.emp_code
        ,a.emp_name
        ,a.salary
        ,c.dep_name
        ,b.emp_name 
from emplyee a
left join emplyee b
on a.parent_code=b.emp_code
left join department c
on a.belong_dep_code=c.dep_code
order by a.emp_code

-- 每个部门最高薪水的人
select a.*
    ,c.dep_name
from emplyee a
join (select belong_dep_code as dep_code
        ,max(salary) max_salary
    from emplyee 
    group by belong_dep_code)b
on a.belong_dep_code = b.dep_code and a.salary=b.max_salary
inner join department c
on c.dep_code=b.dep_code

hive 分组

-- 统计 status_salary 不为null的有多少个。注意cout 不统计 null的特点
select count(status_salary) from emplyee;

-- 统计有多少个人职位为coder
select count(1) from emplyee where status='coder';


-- 计算员工总薪水,平均薪水,最高工资,最低工资。totalSalary 不能直接使用
select count(1) as emp_num
    ,sum(salary) as totalSalary
    ,sum(salary)/count(1) avg
    ,max(salary) max
    ,min(salary) min
from emplyee;

-- 计算部门的最高薪水,最低薪水,平均薪水,总薪水,总人数;
-- 写法特点是先写group by对应的名称,然后直接跟在select 后面首位,后面跟的必须都是聚合函数
select belong_dep_code
    ,max(salary) max
    ,min(salary) min
    ,avg(salary) avg
    ,sum(salary) sum
    ,count(1) emp_count
from emplyee
where belong_dep_code is not null
group by belong_dep_code

-- 计算不显示部门code要显示部门名称
select a.dep_name,b.emp_count,b.max,b.min,b.avg,b.sum from department a,
(select belong_dep_code code
    ,max(salary) max
    ,min(salary) min
    ,avg(salary) avg
    ,sum(salary) sum
    ,count(1) emp_count
from emplyee
where belong_dep_code is not null
group by belong_dep_code) b
where a.dep_code = b.code

-- 计算岗位的最高薪水,最低薪水,平均薪水,总薪水,总人数;
select status
    ,max(salary) max
    ,min(salary) min
    ,avg(salary) avg
    ,sum(salary) sum
    ,count(1) emp_count
from emplyee
where status is not null
group by status

select * from emplyee where belong_dep_code in(
select dep_code from department  where dep_address like '郑州%');

-- 计算岗位的最高薪水,最低薪水,平均薪水,且平均薪水大于7500;
-- 第一种写法
select * from(
    select status
        ,max(salary) max
        ,min(salary) min
        ,avg(salary) avg
        ,sum(salary) sum
        ,count(1) emp_count
    from emplyee
    where status is not null
    group by status
)a 
where a.avg > 7500

-- 第二种写法
select status
        ,max(salary) max
        ,min(salary) min
        ,avg(salary) avg
        ,sum(salary) sum
        ,count(1) emp_count
    from emplyee
    where status is not null
    group by status
    having avg(salary) > 7500

select count(1) from emplyee;

-- 查询出最高薪水的人的信息
select * 
from emplyee 
where salary in (select max(salary) from emplyee)


-- 计算平均薪水放入avg_salary 表(拆分法)
create table avg_salary as
select avg(salary) as salary from emplyee;

select * from avg_salary;

-- 把 emplyee 和avg_salary 笛卡尔乘积
create table emplyee_with_avgsalary as 
select a.* ,b.salary as avg_salary
from emplyee a,avg_salary b;

select * from emplyee_with_avgsalary
where salary > avg_salary;

-- 汇总
select a.*,b.*
from emplyee a,
    (select avg(salary) as avg_salary from emplyee)b
where a.salary>b.avg_salary

-- 查询总经办下面的岗位有哪些 in写法
select distinct status from emplyee
where belong_dep_code in (
    select dep_code from department where dep_name='总经办'
    );
-- 查询总经办下面的岗位有哪些 exists写法
select distinct status from emplyee a
where exists (
    select 1 from department b where a.belong_dep_code = b.dep_code and dep_name='总经办'
    );

select * from department;

-- 薪水大于8000或者小于2000或者等于5000的员工 union写法
explain -- 查询执行流程
select * from emplyee where salary > 8000
union all
select * from emplyee where salary < 2000
union all
select * from emplyee where salary = 5000

explain
select  * from emplyee where salary > 8000 or salary < 2000 or salary = 5000

hive 分区 桶

-- 在Hive Select查询中一般会扫描整个表内容,会消耗很多时间做没必要的工作。
-- 有时候只需要扫描表中关心的一部分数据,因此建表时引入了partition概念。
-- 分区表指的是在创建表时指定的partition的分区空间。?
create table if not exists order_yyyymmdd(
    order_id bigint
    ,order_time string
    ,order_money decimal(20,2)
    ,user_id bigint
    ,address_id bigint
    ,order_type int
)
partitioned by(order_date string)
row format delimited FIELDS TERMINATED BY ' ';

alter table order_yyyymmdd 
add partition(order_date= '20170613');

alter table order_yyyymmdd 
add partition(order_date ='20170612');

alter table  order_yyyymmdd
add partition(order_date ='20170610')
partition(order_date ='20170609')

show partitions order_yyyymmdd;

alter table order_yyyymmdd drop partition(order_date='20170609');

describe order_yyyymmdd;
-- load 数据的时候必须该数据符合分区的条分区条件自身本不会对数据的合法性进行校验。需要进行抽取清洗
load data local inpath '/root/Desktop/order.txt' overwrite into table order_yyyymmdd partition(order_date= '20170614')

select * from order_yyyymmdd where order_date='20170614';

-- 创建临时分区表,存储临时数据,用于对数据进行筛选
create table if not exists order_yyyymmdd_tmp 
like order_yyyymmdd;

load data local inpath '/root/Desktop/order.txt'  overwrite into table order_yyyymmdd_tmp partition(order_date='20170613');

show partitions order_yyyymmdd_tmp;

select * from order_yyyymmdd_tmp;
-- 提前删除一次分区,支持重跑
alter table order_yyyymmdd drop partition(order_date='20170613');

insert into order_yyyymmdd partition(order_date='20170613')
select order_id 
    ,order_time
    ,order_money
    ,user_id
    ,address_id
    ,order_type
from order_yyyymmdd_tmp
where order_date='20170613'
and order_time like '2017-06-13_%';

select * from order_yyyymmdd where order_date='20170613';

alter table order_yyyymmdd_tmp drop partition(order_date='20170613');

-- 创建分区加桶表
drop table if exists order_yyyymmdd_b;
create table order_yyyymmdd_b(
    order_id bigint
    ,order_time string
    ,order_money decimal(20,2)
    ,user_id bigint
    ,address_id bigint
    ,order_type int
)
PARTITIONED BY(order_date string)
CLUSTERED BY  (user_id) sorted by(user_id) into 2 BUCKETS
row format delimited FIELDS TERMINATED BY ' ';

describe order_yyyymmdd_b;

-- 新版本已经被废弃
set hive.enforce.bucketing = true;

alter table order_yyyymmdd_b drop partition(order_date='20170613');
insert into order_yyyymmdd_b partition(order_date='20170613') 
select order_id 
    ,order_time
    ,order_money
    ,user_id
    ,address_id
    ,order_type
from order_yyyymmdd
where order_date='20170613';

select * from order_yyyymmdd_b
where order_date='20170613';

hive Order By、Sort By、Cluster By、Distribute By 与函数

-- order只能默认进行全排序,不能对多个节点,不符合语法规则
select *
from employee
order by salary desc
    ,status_salary desc;

-- 设置reducer个数
set mapred.reduce.tasks=2;
-- 显示最大的reducer数量
show conf  'hive.exec.reducers.max'

--  sort 是指单个reducer节点排序
drop table if exists sort_by;
create table sort_by 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 
as select *
from employee
sort by salary desc;

select * from sort_by;

-- 根据status进行分区,同样的status会被分配到同一个区域,reduce个数大于1
drop table if exists distribute_test;
create table distribute_test
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 
as select * 
from employee
distribute by status;

-- 按照职员的职位,将其工资由高到底排序
drop table if exists dstest;
create table dstest
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 
as select * 
from employee
distribute by status -- 首先根据status将数据分配到不同的reduce节点,
sort by status,salary; -- 然后在根据status,salary对各个节点的数据进行排序,所有节点的数据组合起来不是全排序

--cluster by 就是distribute by 与sort by的结合,但是仅限字段一致的情况
select * 
from employee
cluster by status;

show functions;
--描述if方法的特点
describe function if;

describe function EXTENDED  if;

-- map类型
select map(emp_name,status)
from employee;

select am['李四']
from
    (select map(emp_name,status)am
    from employee)b;

--复杂类型数组
select aa[0] 
        ,am[1]
from(
    select array(emp_name,status)aa
    from employee)b

-- 复杂类型struct 
select a.persion.code
        ,a.persion.name
from (select named_struct('code',emp_code,'name',emp_name) persion
        from employee) a;

select emp_name
    ,if(status_salary is null,'普通岗位','带薪岗位')
from employee 

select * from employee;

-- 2.2.0版本才开始支持,目前版本暂时不支持
SELECT emp_name||status 
        FROM employee;

-- 类型转换
select cast (work_begin_date as date) from employee;
-- 二进制转换
select bin(10);

-- 日期函数
select  unix_timestamp();

select CURRENT_TIMESTAMP();

select unix_timestamp('2017-06-13', 'yyyy-mm-dd');

select from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss:SSS');

select date_format('2015-12-12 12:33:15', 'yyyy-MM-dd HH:mm:ss');

--Returns default value if value is null else returns value (as of HIve 0.11).
select emp_name
    ,nvl(status_salary,0)
from employee;

select emp_name
    ,COALESCE(status_salary,0)
from employee;


hive Import/Export

export table emplyee to '/user/root/hiveexport/employee';

import table employee from '/user/root/hiveexport/employee';

select * from employee;

export table order_yyyymmdd partition(order_date='20170613') to '/user/root/hiveexport/order_yyyymmdd';

alter table order_yyyymmdd drop partition(order_date='20170613');

import table order_yyyymmdd partition(order_date='20170613') from  '/user/root/hiveexport/order_yyyymmdd';

select * from order_yyyymmdd where order_date='20170613';

相关链接

hive 语言手册

上一篇 下一篇

猜你喜欢

热点阅读