实训总结20170916
CREATE TABLE city(
province_code INT,
province_name string,
city_code INT,
city_name string
)
ROW FORMAT delimited
fields terminated by ','
lines terminated by '\n';
load数据到hive表: load data local inpath '/home/bigdata//hive/city.txt' into table city;
查看创建表 show create table city;
查看表信息 desc city;
修改表:alter table city rename to city_name;
复制表: create table city like city_code;
删除库 drop database **;
删除表 drop table **;
查表内容 select * from city limit 10;
显示列名 set hive.cli.print.header=true;
建库
建表
查询相关命令 最大的10个:select * from city order by city_code desc limit 10;
拥有最多市的省份:省里市的数目:取最多的10个省:
select province_name,count(city_name) as cnt from city group by province_name order by cnt desc limit 10;
去重统计 有多少个省:distinct
1)select count(distinct province_name) from city;
两重:2)select count(1)
from
(
select province_name from city group by province_name
) a;
查出只有一个市的省份:省有多少市 ;市为1
1)
select
province_name, cnt
from
(
select province_name,count(1) as cnt
from city
group by province_name
)a
where cnt =1;
2) select province_name count(1) as cnt from city group by province_name having cnt = 1;
外表:
table前加关键字external
远端要用的location'/user/hzw/city/city.txt'
create external table city_ex(
province_code int,
province_name string,
city_code int,
city_name string)
row format delimited
fields terminated by','
lines terminated by'\n'
location '/user/hzw/city/';
内表:
create table city_in(
province_code int,
province_name string,
city_code int,
city_name string)
row format delimited
fields terminated by','
lines terminated by'\n'
location '/user/hzw/city/';
2、使用分区patition对应表中的一个目录;
静态分区需要在sql语句中指定;
CREATE TABLE `user`(
`uid` INT,
`city_code` INT,
`model` string,
`access` string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';
load data local inpath '/home/bigdata/tanqi/hive/user.txt' into table user;
查询user: select count(*) from user;
a.静态分区:
create table user_daily(
uid int,
city_code int,
model string,
access string
)
partitioned by (p_data string);
插入数据从user :
insert overwrite table user_daily partition (p_data='2017-09-01')
select * from user;
b.动态分区
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table user_daily partition (p_data)
select *,'2017-09-02' from user
union all
select *,'2017-09-04' from user;
ALTER TABLE user_daily PARTITION (p_date='2017-01-01') RENAME TO PARTITION (p_date='20170101');
3.作业
a) 统计WIFI环境下用户最多的5个城市
Select city_code,count(*) as cnt from user where access='WIFI'
group by city_code order by cnt desc limit 5;
b) 统计用户数>=3的access和city_code组合
select access,city_code,count(*) as cnt from user
group by access,city_code
having cnt >=3 order by cnt desc;
c) 机型中带有ne(不区分大小写),不同access及对应用户数
select access,count(*) as cnt from user
where lower(model) like '%ne%'
group by access order by cnt desc;
判断 access里的名/总数的sum
select sum(if(access='WIFI',1,0))/count(1) from user;
select sum(if(access='2G',1,0))/count(1) from user;
select sum(if(access='4G',1,0))/count(1) from user;
if条件多
select
case
when uid % 10 in (0, 1, 2, 3) then '0-3'
when uid % 10 in (4, 5, 6, 7) then '4-7'
else '8-9'
end as interval,
count(*) as cnt
from user
group by
case
when uid % 10 in (0, 1, 2, 3) then '0-3'
when uid % 10 in (4, 5, 6, 7) then '4-7'
else '8-9'
end;
晚间
1、 list 集合
collect_set 去重的集合
collect_list 不去重
select collect_set(access) from user;
select collect_list(access) from user;
2、 hive 各种连接join
左连接 left outer join
右连接 right outer join
内连接 inner join 找出左右相同(AB相交)的记录
全连接 full join
select user.uid,user.city_code,city.city_name
from
(select * from user where uid <=100) user
left join
(select * from city where province_code <=30)city
on (user.city_code = city.city_code)
limit 20;
select user.uid,user.city_code,city.city_name
from
(select * from user where uid <=100) user
full join
(select * from city where province_code <=30)city
on (user.city_code = city.city_code)
limit 20;
3、分组top N
分组 access
排序 序号 city_code
top1 序号=1
select access,city_code,uid
from
(
select uid, access,city_code,
row_number() over (partition by access order
by city_code desc)as row_num
from user
)a
where row_num = 1;
4、累计 第一行 当前行
select p_date,
sum(cnt) over(order by p_date asc rows between unbounded preceding and current row)
from
(
select p_date, count(*) as cnt
from user_daily
where p_date between '2017-09-01' and '2017-09-30'
group by p_date
)a
;