Big Data
(1) View YARN logs:
yarn logs -applicationId application_1530010276125_55093
(2) Dynamic partition settings (an example insert follows):
SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.dynamic.partitions.pernode=100000;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
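With those settings, an insert can populate partitions dynamically. A minimal sketch: cx.ai_info is the partitioned table created in item (9), cx.ai_info_staging is a hypothetical source table, and the partition column (month) must be the last column of the select:
insert overwrite table cx.ai_info partition (month)
select id, month from cx.ai_info_staging;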
(3) Export from Hive to a local file:
insert overwrite local directory '/tmp/test' row format delimited fields terminated by '|'
select * from default.pa_segment_active_cutoffs_ss_w;
(4) Load a local file into Hive:
load data local inpath '/tmp/a.txt' into table cx.ai_info;
(5) Check the space a table occupies:
hadoop fs -du /user/hive/warehouse/pa_ubi_session_w|awk '{ SUM += $1 } END { print SUM/(1024*1024*1024)}'
(6) List and kill Hive jobs:
hadoop job -list
hadoop job -kill <job_id>
(7) Fix garbled Chinese text when importing into Hive:
ALTER TABLE default.pa_dim_city_mapping SET SERDEPROPERTIES ('serialization.encoding'='GBK');
(8) Hive tip:
A literal ';' has to be written as '\073' (its octal escape), e.g. when used as a field delimiter, because ';' would otherwise terminate the statement; see the DDL sketch below.
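A minimal sketch of a DDL using the escape (hypothetical table name):
create table cx.semicolon_demo(id int, name string)
row format delimited fields terminated by '\073'
stored as textfile;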
(9) Create a Hive managed (internal) table:
create table cx.ai_info(id int) partitioned by (month int) stored as orc;
(10) Load data into MySQL:
load data local infile '/var/lib/mysql-files/td/a.txt' into table ai.a_info fields terminated by '|' lines terminated by '\n' ignore 1 rows;
Specifying the target columns:
load data local infile '/tmp/test/000000_0'
into table segment_6_cycles_remaining
fields terminated by ','
(period_start,period_end,type,keyname,remain0,remain1,remain2,remain3,remain4,remain5,remain6,companyid,projectid);
(11) Copy files between servers:
scp a.txt hadoop@10.20.24.103:/tmp/
(12) tar/zip compress and decompress:
tar -czf all.tar.gz *.jpg
tar -xzf all.tar.gz
zip filename.zip filename
unzip filename.zip
Reference: https://www.jb51.net/LINUXjishu/43356.html
(13) Dynamic non-strict partitioning (same settings as in item (2)):
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
(14) The three normal forms (illustrated by the sketch after the list):
1. First normal form: every column holds an atomic value.
2. Second normal form: every non-key column depends on the whole primary key.
3. Third normal form: every non-key column depends on the primary key directly, not transitively through another column.
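A small illustration with hypothetical tables (MySQL-style DDL): if orders stored customer_city directly, that column would depend on customer_id rather than on order_id, a transitive dependency; moving it into a customers table satisfies the third normal form.
create table customers (customer_id int primary key, customer_city varchar(64));
create table orders (order_id int primary key, customer_id int, order_date date);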
(15) Shell: compute the current bi-week (two-week period)
BATCH_DATE=${1} # input date
INIT_DAY='20181105' # a reference Monday that anchors the bi-week cycle
startDate=`date -d ${INIT_DAY} +%s` # reference date as seconds since the Unix epoch
endDate=`date -d ${BATCH_DATE} +%s` # input date as seconds since the Unix epoch
stampDiff=`expr $endDate - $startDate` # difference in seconds
weekDiff=`expr $stampDiff / 1209600` # number of complete bi-weeks between them (14 days = 1209600 s)
n_end=$(($weekDiff*14)) # offset in days from the reference Monday
STR_WEEK_LAST_DAY=$(date -d "$INIT_DAY +${n_end} days" +%Y%m%d) # reference Monday plus the offset = Monday that starts the current bi-week
echo "Bi-week start:"
echo $STR_WEEK_LAST_DAY
echo "Bi-week end:"
STR_WEEK_END_DAY=`date -d "$STR_WEEK_LAST_DAY + 13 days" +%Y%m%d` # bi-week start plus 13 days
echo $STR_WEEK_END_DAY
(16) Pull data with wget (served via SimpleHTTPServer):
[hdfs@cdh-node1 20181221]$ python -m SimpleHTTPServer
[hdfs@cdhmaster load]$ wget http://10.1.1.30:8000/pa_page_group_daily_sm.txt
(17) Hive table locks
View locks: show locks <table_name>
Locks can be table-level or partition-level.
unlock table <table_name>; -- unlock the table
unlock table <table_name> partition(dt='2014-04-01'); -- unlock a specific partition
(18) sqoop export:
sqoop export --connect jdbc:mysql://marketclouddb1.mysqldb.chinacloudapi.cn:3306/marketing_cloud --username marketclouddb1%datatist --password Datatist1506 --table t_table1 --columns name1,id1 --hcatalog-database default --hcatalog-table t_table1
sqoop export --connect --username --password --table ${table} --columns ${columns} --hcatalog-database ${hive} --hcatalog-table ${table} --update-key registration_id,user_id,project_id --update-mode allowinsert
(19) UDF via TRANSFORM (streaming a Python script):
add file /xxx/person.py;
select transform(name, idcard) using 'python person.py' as (name, idcard, gender) from person;
(20) View file/directory sizes:
du -h
(21)
Copy from HDFS to the local server:
hdfs dfs -copyToLocal inputPath outputPath
Copy a local file to HDFS:
hdfs dfs -copyFromLocal inputPath outputPath
(22)
View HDFS file sizes:
hadoop fs -du -s -h /user/hive/warehouse/ods.db
(23) Create a table (text format):
create table default.dw_dim_date_config(
date_day int
,date_hor string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
;