set hive.enforce.bucketing = true; set hive.enforce.sorting = true; set hive.exec.reducers.max = 1; CREATE TABLE tbl1(key int, value string) PARTITIONED by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS rcfile; CREATE TABLE tbl2(key int, value string) PARTITIONED by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS rcfile; alter table tbl1 set serde 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'; alter table tbl2 set serde 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'; insert overwrite table tbl1 partition (ds='1') select * from src where key < 10; insert overwrite table tbl2 partition (ds='1') select * from src where key < 10; alter table tbl1 change key key int; insert overwrite table tbl1 partition (ds='2') select * from src where key < 10; alter table tbl1 change key key string; -- The subquery itself is being map-joined. Multiple partitions of tbl1 with different schemas are being read for tbl2 select /*+mapjoin(subq1)*/ count(*) from (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 join (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 on subq1.key = subq2.key; set hive.optimize.bucketmapjoin = true; set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; -- The subquery itself is being map-joined. Since the sub-query only contains selects and filters, it should -- be converted to a bucketized mapside join. Multiple partitions of tbl1 with different schemas are being read for each -- bucket of tbl2 select /*+mapjoin(subq1)*/ count(*) from (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 join (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 on subq1.key = subq2.key; set hive.optimize.bucketmapjoin.sortedmerge = true; -- The subquery itself is being map-joined. Since the sub-query only contains selects and filters, it should -- be converted to a sort-merge join. Multiple partitions of tbl1 with different schemas are being read for a -- given file of tbl2 select /*+mapjoin(subq1)*/ count(*) from (select a.key as key, a.value as value from tbl1 a where key < 6) subq1 join (select a.key as key, a.value as value from tbl2 a where key < 6) subq2 on subq1.key = subq2.key; -- Since the join key is modified by the sub-query, neither sort-merge join not bucketized map-side -- join should be performed. Multiple partitions of tbl1 with different schemas are being read for tbl2 select /*+mapjoin(subq1)*/ count(*) from (select a.key+1 as key, concat(a.value, a.value) as value from tbl1 a) subq1 join (select a.key+1 as key, concat(a.value, a.value) as value from tbl2 a) subq2 on subq1.key = subq2.key;