create database if not exists db_hive; use db_hive;
Tables
show tables;
Create a table
CREATE TABLE if not exists db_hive.db_table(
    cookieid string,
    product_category map<string,string>,
    product_pv int,
    add2cart_category map<string,string>,
    add2cart_item_qty int
);
Create a temporary table
create temporary table tmp_fact_sale as select * from ods_fact_sale limit 1000000;
View table details
desc db_table;
Show a table's CREATE statement
show create table table1;
Drop a table
drop table db_hive.db_table;
Partitioned table
CREATE TABLE if not exists db_hive.db_table(
    cookieid string,
    product_category map<string,string>,
    product_pv int,
    add2cart_category map<string,string>,
    add2cart_item_qty int
)
PARTITIONED BY (datekey string)  -- partition column matching the drop partition example below
STORED AS ORC
LOCATION '/user/hive/db_hive.db_table';
Delete data
alter table db_hive.db_table drop partition(datekey='20220526');
This removes the data by dropping the corresponding partition.
Insert data
insert into table ecbd.db_demo values('truman',18),('michell',19);
INSERT OVERWRITE TABLE ecbd.db_demo1 SELECT user_name, age FROM ecbd.db_demo;
Functions
List all functions
show functions;
Median
percentile(cast(discount_amount as int), 0.5)
select percentile(cast(discount_amount as int), 0.5) from db_hive.db_table where discount_amount is not null limit 10;
Absolute value
abs(discountamount)
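A minimal usage sketch; the db_hive.db_table table and its numeric discount_amount column are assumed from the median example above:
-- returns the original value and its absolute value (assumed column names)
select discount_amount, abs(discount_amount) as abs_discount_amount from db_hive.db_table limit 10;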
Rows to map
select name,
       str_to_map(concat_ws(",", collect_set(concat_ws(':', subjects, cast(score as string))))) as sub,
       count(1) as num
from ecbd.users
group by name;
SELECT cookieid,
       CASE
           WHEN LOWER(split(products, '\\;')[1]) LIKE 'order%' THEN NULL
           WHEN LOWER(split(products, '\\;')[1]) LIKE 'product%' THEN NULL
           ELSE LOWER(regexp_replace(split(products, '\\;')[1], ',', ''))
       END AS itemnumber,
       CASE
           WHEN LOWER(split(products, '\\;')[1]) LIKE 'order%' THEN 0
           WHEN LOWER(split(products, '\\;')[1]) LIKE 'product%' THEN 0
           ELSE split(split(split(products, '\\|')[1], 'event17\\=')[1], "\\;")[0]
       END AS Quantity,
       CASE
           WHEN LOWER(split(products, '\\;')[1]) LIKE 'order%' THEN 0
           WHEN LOWER(split(products, '\\;')[1]) LIKE 'product%' THEN 0
           ELSE split(split(products, '\\|')[0], 'event16\\=')[1]
       END AS UnitPrice,
       products
FROM db_hive.db_table
LIMIT 10;
Non-numeric values
select * from db_hive.db_table where category not rlike '^\\d+$' limit 10;
Query map keys and values
-- query values
select map_values(smap) as s_values, map_values(imap) as i_values from temp_db.map_test;
-- query keys
select map_keys(smap) as smap_keys, map_keys(imap) as imap_keys from temp_db.map_test;
Sum map values
SELECT customer_no, SUM(v1) AS cate_sum FROM temp_db.map_test LATERAL VIEW explode(product_category) e1 AS k1, v1 GROUP BY customer_no;
Explode maps into key/value columns
SELECT product_category_key,
       product_category_value,
       add2cart_category_key,
       add2cart_category_value,
       order_category_key,
       order_category_value
FROM ecbd.tbl_persona_last_month_customer_features_with_cluster t_customer_cluster
    LATERAL VIEW explode(t_customer_cluster.product_category) b1 AS product_category_key, product_category_value
    LATERAL VIEW explode(t_customer_cluster.add2cart_category) b2 AS add2cart_category_key, add2cart_category_value
    LATERAL VIEW explode(t_customer_cluster.order_category) b3 AS order_category_key, order_category_value;
Rows to map
SELECT customer_no,
       str_to_map(
           concat_ws(",", collect_set(concat_ws(':', subcategory, CAST(pv AS string))))
       ) AS product_subcategory
FROM subcategory_group
GROUP BY subcategory_group.customer_no;
Array
Check whether an array contains a value
select * from temp_db.array_test where array_contains(year_arr,'1990');
Explode an array into multiple rows
select col1 from temp_db.array_test lateral view explode(year_arr) tb as col1;
Date and time handling
Get the current date, format: 2022-07-08
select CURRENT_DATE();
Format the date as yyyyMMdd
select TRANSLATE(cast(CURRENT_DATE() as string), '-', '');
Map a Hive table to HBase
CREATE TABLE if not exists hive_to_hbase(
    rowkey string,
    name string,
    age int
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,info:name,info:age")
TBLPROPERTIES ("hbase.table.name" = "hive_to_hbase");
Custom variables
set hivevar:test=TRANSLATE(date_sub(CURRENT_DATE(), 32), '-', '');
SELECT ${hivevar:test};
These appear to be usable only inside queries.
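A sketch of using the variable inside a filter; the datekey column (a yyyyMMdd string) on db_hive.db_table is an assumption here:
-- ${hivevar:test} is substituted as text, so the TRANSLATE expression is evaluated within the query
select count(*) from db_hive.db_table where datekey = ${hivevar:test};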
Tuning
Adjust memory
set mapreduce.map.memory.mb=7048;
set mapreduce.reduce.memory.mb=2048;
Parallelism and reducer count
set hive.exec.parallel=true;
set hive.exec.reducers.max=300;
Parallel execution
set hive.exec.parallel=true;               -- enable parallel execution of independent stages
set hive.exec.parallel.thread.number=16;   -- maximum parallelism allowed for a single SQL statement, default 8
In some cases, when the "small" table is itself fairly large, the map join conversion fails and needs to be disabled
set hive.auto.convert.join = false;
Dynamic partition parameters
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=100000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set hive.exec.max.created.files=900000;
Common optimizations for data skew; not guaranteed to be optimal
set hive.map.aggr=true;
set hive.groupby.skewindata=true;
HiveSQL tuning
from … insert into …
from stu_ori
insert into table stu partition(tp) select s_age,max(s_birth) stat,'max' tp group by s_age
insert into table stu partition(tp) select s_age,min(s_birth) stat,'min' tp group by s_age;
Notes
If a cast type conversion fails, the result defaults to NULL.
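For example (illustration only):
select cast('abc' as int);   -- returns NULL instead of raising an error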
concat_ws(':', '178', CAST(NULL AS string)) returns '178', without the ':' separator.
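For example (illustration only):
select concat_ws(':', '178', CAST(NULL AS string));   -- returns '178' because NULL arguments are skipped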