hive正则表达式
2022-05-05 本文已影响0人
cyc_twentyfive
1. Hive支持如下三个正则表达式:
- regexp
- regexp_extract
- regexp_replace
1.1 字符集合:
image.png1.2 重复次数
image.png1.3 组合操作符
image.png2. regexp
select rn
from(
select '11145678abc' as rn
union all
select '111456789abc' as rn
union all
select 'd111456789abc' as rn
)t
where rn regexp '\\d{8}' #有8个及以上数字的
where rn regexp '^\\d{8}' #以8个及以上数字开头
where rn regexp '^\\d{8}\\D' #以8个数字开头
3. regexp_replace
#语法
regexp_replace(string INITIAL_STRING, string PATTERN, string REPLACEMENT)
#返回用替换实例替换INITIAL_STRING中与模式中定义的java正则表达式语法匹配的所有子字符串所产生的字符串
测试数据
select id1,
regexp_replace(str,'([^\\u4E00-\\u9FA5]+)','') as new_str1, --截取汉字部分
regexp_replace(str,'([^0-9]+)','') as new_str2, --截取数字部分
regexp_replace(str,'([^a-zA-Z]+)','') as new_str3, --截取字母部分
regexp_replace(str,'([^a-zA-Z0-9]+)','') as new_str4 --截取字母和数字
from(
select 1 as id1,'我在学习Hive,大数据。' as str
union all
select 2 as id1,'Hive,我来了,Coming666。' as str
union all
select 3 as id1,'666,Hive居然拥有关系型数据库的诸多特性。' as str
union all
select 4 as id1,'wuwuwu,Hive学习起来还是存在一定难度' as str
union all
select 5 as id1,'Hive数据仓库,6666。' as str
)t
4. regexp_extract
#语法
regexp_extract(string subject, string pattern, int index)
#index=0返回匹配上的所有字符串,index=1表示匹配第一个括号的表达式,index=2匹配第二个括号的表达式,以此类推
测试数据
select id1,
regexp_extract(str,'(filtertype"\\:")(\\d+)(",)',2) as new_str1, --截取filter数字
regexp_extract(str,'(filtername"\\:")((\\W*\\w*)|(\\w*))(",)',2) as new_str2, --截取filtername
regexp_extract(str,'(filtertitle"\\:")((\\W*\\w*)|(\\w*))(",)',2) as new_str3, --截取filtertitle
regexp_extract(str,'(filterid"\\:")(\\d+\\|\\d+)(",)',2) as new_str4, --截取filter_id
regexp_extract(str,'(filtertype"\\:")(\\d+)(",)',0) as new_str5
from(
select 1 as id1,'{"filtertype":"29","filtername":"成人Node","filtertitle":"成人Group","filtersubtype":"","filterid":"29|1","filterValue":"1|4"}' as str
union all
select 2 as id1,'{"filtertitle":"钻级","filtertype":"16","filtersubtype":"","filtername":"四钻/高档","filterid":"16|4",}' as str
)t