Hive操作指南

Hive操作指南

DDL

数据库

1
2
create database if not exists db_hive;
use db_hive;

1
show tables;

创建表

1
2
3
4
5
6
7
CREATE TABLE if not exists db_hive.db_table(
cookieid string,
product_category map<string,string>,
product_pv int,
add2cart_category map<string,string>,
add2cart_item_qty int
);

创建临时表

1
2
create temporary table tmp_fact_sale as
select * from ods_fact_sale limit 1000000;

查看表详情

1
desc db_table;

查看表创建语句

1
show create table table1;

删除表

1
drop table db_hive.db_table;

分区表

1
2
3
4
5
6
7
8
9
CREATE TABLE if not exists db_hive.db_table(
cookieid string,
product_category map<string,string>,
product_pv int,
add2cart_category map<string,string>,
add2cart_item_qty int
)
STORED AS ORC
LOCATION '/user/hive/db_hive.db_table';

删除数据

1
alter table db_hive.db_table drop partition(datekey='20220526');

这里是按分区删除相应的数据

插入数据

1
2
3
4
5
6
7
8
insert into table ecbd.db_demo values('truman',18),('michell',19);

INSERT OVERWRITE TABLE ecbd.db_demo1
SELECT
user_name,
age
FROM
ecbd.db_demo;

函数

查看所有的函数

1
show functions;

中位数

percentile(cast(discount_amount as int), 0.5)

1
select percentile(cast(discount_amount as int), 0.5) from db_hive.db_table where discount_amount is not null  limit 10;

绝对值

abs(discountamount)

行转map

1
2
3
select name as name,str_to_map(concat_ws(",",collect_set(concat_ws(':', subjects, cast(score as string)) ))) as sub,count(1) as num
from ecbd.users
group by name;

查询

case when

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
SELECT
cookieid,
CASE
WHEN LOWER(split(products, '\\;') [ 1 ]) LIKE 'order%' THEN NULL
WHEN LOWER(split(products, '\\;') [ 1 ]) LIKE 'product%' THEN NULL
ELSE LOWER(regexp_replace(split(products, '\\;') [ 1 ], ',', ''))
END AS itemnumber,
CASE
WHEN LOWER(split(products, '\\;') [ 1 ]) LIKE 'order%' THEN 0
WHEN LOWER(split(products, '\\;') [ 1 ]) LIKE 'product%' THEN 0
ELSE split(split(split(products, '\\|') [ 1 ],'event17\\=')[1],"\\;")[0]
END AS Quantity,
CASE
WHEN LOWER(split(products, '\\;') [ 1 ]) LIKE 'order%' THEN 0
WHEN LOWER(split(products, '\\;') [ 1 ]) LIKE 'product%' THEN 0
ELSE split(split(products, '\\|') [ 0 ],'event16\\=')[1]
END AS UnitPrice,
products
FROM
db_hive.db_table
LIMIT
10;

非数字

1
select * from db_hive.db_table  where category  not rlike '^\\d+$'  limit 10;
1
2
123 rlike '^\\d+$' : true
dd rlike '^\\d+$' : false

处理复杂类型Map、Array、Json

Map

1
2
3
4
# value查询
select map_values(smap) as s_values,map_values(imap) as i_values from temp_db.map_test;
# key查询
select map_keys(smap) as smap_keys,map_keys(imap) as imap_keys from temp_db.map_test;

map value求和

1
2
3
4
5
6
7
8
SELECT
customer_no,
SUM(v1) AS cate_sum
FROM
temp_db.map_test LATERAL VIEW explode(product_category) e1 AS k1,
v1
GROUP BY
customer_no;

map行转列

1
2
3
4
5
6
7
8
9
10
11
12
SELECT
product_category_key,
product_category_value,
add2cart_category_key,
add2cart_category_value,
order_category_key,
order_category_value
FROM
ecbd.tbl_persona_last_month_customer_features_with_cluster t_customer_cluster LATERAL view explode(t_customer_cluster.product_category) b AS product_category_key,
product_category_value LATERAL view explode(t_customer_cluster.add2cart_category) b AS add2cart_category_key,
add2cart_category_value LATERAL view explode(t_customer_cluster.order_category) b AS order_category_key,
order_category_value;

行转map

1
2
3
4
5
6
7
8
9
10
11
12
SELECT
customer_no,
str_to_map(
concat_ws(
",",
collect_set(concat_ws(':', subcategory, CAST(pv AS string)))
)
) AS product_subcategory
FROM
subcategory_group
GROUP BY
subcategory_group.customer_no;

Array

查看数组中是否包含

1
2
3
select *
from temp_db.array_test
where array_contains(year_arr,'1990');

数组拆成多条记录

1
2
3
select col1
from temp_db.array_test
lateral view explode(year_arr) tb as col1

时间处理

获取当前日期,格式:2022-07-08

1
2
select CURRENT_DATE();

格式化时间,格式:yyyyMMdd

1
select TRANSLATE(cast(CURRENT_DATE() as string), '-', '');

日期加减date_sub/date_add

1
2
select TRANSLATE(date_sub(CURRENT_DATE(), 2), '-', '');
select TRANSLATE(date_add(CURRENT_DATE(), 2), '-', '');

Hive和Hbase表关联

1
2
3
4
5
6
7
8
9
CREATE TABLE if not exists hive_to_hbase(
rowkey string,
name string,
age int
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES
("hbase.columns.mapping"=":key,info:name,info:age")
TBLPROPERTIES("hbase.table.name"="hive_to_hbase");

自定义变量

1
2
3
set hivevar:test=TRANSLATE(date_sub(CURRENT_DATE(), 32), '-', '');

SELECT ${hivevar:test};

好像只能用在查询中

调优

修改内存

1
2
set mapreduce.map.memory.mb=7048;
set mapreduce.reduce.memory.mb=2048;

并行,reduce数量优化

1
2
set hive.exec.parallel=true
set hive.exec.reducers.max=300

并行执行

1
2
set hive.exec.parallel=true; //打开任务并行执行
set hive.exec.parallel.thread.number=16; //同一个sql允许最大并行度,默认为8。

某些情况需要,当小表也别大的时候,Mapjoin优化失败需要关闭

1
set hive.auto.convert.join = false;

动态分区参数优化

1
2
3
4
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=100000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set hive.exec.max.created.files=900000;

数据倾斜常见优化,并不一定保证最优

1
2
set hive.map.aggr=true;  
set hive.groupby.skewindata = ture;

HiveSQL调优

from … insert into …

1
2
3
4
5
6
7
8
9
from stu_ori 

insert into table stu partition(tp)
select s_age,max(s_birth) stat,'max' tp
group by s_age

insert into table stu partition(tp)
select s_age,min(s_birth) stat,'min' tp
group by s_age;

注意事项

cast 类型转换失败,默认赋值为null

concat_ws(‘:’,”178”,CAST(NULL AS string)) 生成结果为178,不带:

hive和hbase关联表不支持partition分区

case 案例

资源

  1. hive教程
  2. Hive全面整理四万字,全面总结,附详细解析,赶紧收藏吧!!
  3. hive官网 LanguageManual
  4. HPL/SQL Reference