set hive.mapred.supports.subdirectories=true;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
set hive.merge.smallfiles.avgsize=200;
set mapred.input.dir.recursive=true;
set hive.merge.mapfiles=false;
set hive.merge.mapredfiles=false;

-- list bucketing alter table ... concatenate:
-- Use list bucketing DML to generate multiple files in partitions by turning off merge.
-- dynamic partition. multiple skewed columns. merge.
-- The following explains the merge example used in this test case.
-- The DML will generate 2 partitions:
-- ds=2008-04-08/hr=a1
-- ds=2008-04-08/hr=b1
-- Without merge, each partition has more files:
-- ds=2008-04-08/hr=a1 has 2 files
-- ds=2008-04-08/hr=b1 has 6 files
-- With merge, each partition has fewer files:
-- ds=2008-04-08/hr=a1 has 1 file
-- ds=2008-04-08/hr=b1 has 4 files
-- The following shows file size and name in each directory:
-- hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
-- without merge
-- 155 000000_0
-- 155 000001_0
-- with merge
-- 254 000000_0
-- hr=b1/key=103/value=val_103:
-- without merge
-- 99 000000_0
-- 99 000001_0
-- with merge
-- 142 000001_0
-- hr=b1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
-- without merge
-- 5181 000000_0
-- 5181 000001_0
-- with merge
-- 5181 000000_0
-- 5181 000001_0
-- hr=b1/key=484/value=val_484:
-- without merge
-- 87 000000_0
-- 87 000001_0
-- with merge
-- 118 000002_0

-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
-- JAVA_VERSION_SPECIFIC_OUTPUT

-- create a skewed table
create table list_bucketing_dynamic_part (key String, value String)
    partitioned by (ds String, hr String)
    skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103'))
    stored as DIRECTORIES
    STORED AS RCFILE;

-- list bucketing DML without merge. Use the bucketized input format to generate a few small files.
explain extended
insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';

insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';

-- check DML result
show partitions list_bucketing_dynamic_part;
desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1');
desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1');

-- concatenate the partition and it will merge files
alter table list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1') concatenate;

desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1');

set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
select count(1) from srcpart where ds = '2008-04-08';
select count(*) from list_bucketing_dynamic_part;
explain extended
select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484';
select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484';
select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484' order by hr;

-- clean up
drop table list_bucketing_dynamic_part;
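
-- Optional verification sketch (not part of the original test): one way to eyeball the
-- per-directory file counts and sizes quoted in the comments above is to list the
-- partition directories from the Hive CLI before the table is dropped. The warehouse
-- path below is an assumption based on the default layout under
-- ${hiveconf:hive.metastore.warehouse.dir} and may differ in a given deployment;
-- it is left commented out so it does not alter the test's golden output.
-- dfs -ls -R ${hiveconf:hive.metastore.warehouse.dir}/list_bucketing_dynamic_part/ds=2008-04-08;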