PREHOOK: query: drop table if exists staging PREHOOK: type: DROPTABLE POSTHOOK: query: drop table if exists staging POSTHOOK: type: DROPTABLE PREHOOK: query: drop table if exists parquet_jointable1 PREHOOK: type: DROPTABLE POSTHOOK: query: drop table if exists parquet_jointable1 POSTHOOK: type: DROPTABLE PREHOOK: query: drop table if exists parquet_jointable2 PREHOOK: type: DROPTABLE POSTHOOK: query: drop table if exists parquet_jointable2 POSTHOOK: type: DROPTABLE PREHOOK: query: drop table if exists parquet_jointable1_bucketed_sorted PREHOOK: type: DROPTABLE POSTHOOK: query: drop table if exists parquet_jointable1_bucketed_sorted POSTHOOK: type: DROPTABLE PREHOOK: query: drop table if exists parquet_jointable2_bucketed_sorted PREHOOK: type: DROPTABLE POSTHOOK: query: drop table if exists parquet_jointable2_bucketed_sorted POSTHOOK: type: DROPTABLE PREHOOK: query: create table staging (key int, value string) stored as textfile PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@staging POSTHOOK: query: create table staging (key int, value string) stored as textfile POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@staging PREHOOK: query: insert into table staging select distinct key, value from src order by key limit 2 PREHOOK: type: QUERY PREHOOK: Input: default@src PREHOOK: Output: default@staging POSTHOOK: query: insert into table staging select distinct key, value from src order by key limit 2 POSTHOOK: type: QUERY POSTHOOK: Input: default@src POSTHOOK: Output: default@staging POSTHOOK: Lineage: staging.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: staging.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] PREHOOK: query: create table parquet_jointable1 stored as parquet as select * from staging PREHOOK: type: CREATETABLE_AS_SELECT PREHOOK: Input: default@staging PREHOOK: Output: database:default PREHOOK: Output: default@parquet_jointable1 POSTHOOK: query: create table parquet_jointable1 stored as parquet as select * from staging POSTHOOK: type: CREATETABLE_AS_SELECT POSTHOOK: Input: default@staging POSTHOOK: Output: database:default POSTHOOK: Output: default@parquet_jointable1 PREHOOK: query: create table parquet_jointable2 stored as parquet as select key,key+1,concat(value,"value") as myvalue from staging PREHOOK: type: CREATETABLE_AS_SELECT PREHOOK: Input: default@staging PREHOOK: Output: database:default PREHOOK: Output: default@parquet_jointable2 POSTHOOK: query: create table parquet_jointable2 stored as parquet as select key,key+1,concat(value,"value") as myvalue from staging POSTHOOK: type: CREATETABLE_AS_SELECT POSTHOOK: Input: default@staging POSTHOOK: Output: database:default POSTHOOK: Output: default@parquet_jointable2 PREHOOK: query: -- SORT_QUERY_RESULTS -- MR join explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key PREHOOK: type: QUERY POSTHOOK: query: -- SORT_QUERY_RESULTS -- MR join explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage Stage-0 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-1 Map Reduce Map Operator Tree: TableScan alias: p1 Statistics: Num rows: 2 Data size: 4 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: key (type: int) sort order: + Map-reduce partition columns: key (type: int) Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE TableScan alias: p2 Statistics: Num rows: 2 Data size: 6 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: key (type: int) sort order: + Map-reduce partition columns: key (type: int) Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE value expressions: myvalue (type: string) Reduce Operator Tree: Join Operator condition map: Inner Join 0 to 1 keys: 0 key (type: int) 1 key (type: int) outputColumnNames: _col7 Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: _col7 (type: string) outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator limit: -1 Processor Tree: ListSink PREHOOK: query: select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key PREHOOK: type: QUERY PREHOOK: Input: default@parquet_jointable1 PREHOOK: Input: default@parquet_jointable2 #### A masked pattern was here #### POSTHOOK: query: select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key POSTHOOK: type: QUERY POSTHOOK: Input: default@parquet_jointable1 POSTHOOK: Input: default@parquet_jointable2 #### A masked pattern was here #### val_0value val_10value PREHOOK: query: -- The two tables involved in the join have differing number of columns(table1-2,table2-3). In case of Map and SMB join, -- when the second table is loaded, the column indices in hive.io.file.readcolumn.ids refer to columns of both the first and the second table -- and hence the parquet schema/types passed to ParquetInputSplit should contain only the column indexes belonging to second/current table -- Map join explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key PREHOOK: type: QUERY POSTHOOK: query: -- The two tables involved in the join have differing number of columns(table1-2,table2-3). In case of Map and SMB join, -- when the second table is loaded, the column indices in hive.io.file.readcolumn.ids refer to columns of both the first and the second table -- and hence the parquet schema/types passed to ParquetInputSplit should contain only the column indexes belonging to second/current table -- Map join explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-4 is a root stage Stage-3 depends on stages: Stage-4 Stage-0 depends on stages: Stage-3 STAGE PLANS: Stage: Stage-4 Map Reduce Local Work Alias -> Map Local Tables: p1 Fetch Operator limit: -1 Alias -> Map Local Operator Tree: p1 TableScan alias: p1 Statistics: Num rows: 2 Data size: 4 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE HashTable Sink Operator keys: 0 key (type: int) 1 key (type: int) Stage: Stage-3 Map Reduce Map Operator Tree: TableScan alias: p2 Statistics: Num rows: 2 Data size: 6 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Inner Join 0 to 1 keys: 0 key (type: int) 1 key (type: int) outputColumnNames: _col7 Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: _col7 (type: string) outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Local Work: Map Reduce Local Work Stage: Stage-0 Fetch Operator limit: -1 Processor Tree: ListSink PREHOOK: query: select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key PREHOOK: type: QUERY PREHOOK: Input: default@parquet_jointable1 PREHOOK: Input: default@parquet_jointable2 #### A masked pattern was here #### POSTHOOK: query: select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key POSTHOOK: type: QUERY POSTHOOK: Input: default@parquet_jointable1 POSTHOOK: Input: default@parquet_jointable2 #### A masked pattern was here #### val_0value val_10value PREHOOK: query: -- SMB join create table parquet_jointable1_bucketed_sorted (key int,value string) clustered by (key) sorted by (key ASC) INTO 1 BUCKETS stored as parquet PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@parquet_jointable1_bucketed_sorted POSTHOOK: query: -- SMB join create table parquet_jointable1_bucketed_sorted (key int,value string) clustered by (key) sorted by (key ASC) INTO 1 BUCKETS stored as parquet POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@parquet_jointable1_bucketed_sorted PREHOOK: query: insert overwrite table parquet_jointable1_bucketed_sorted select key,concat(value,"value1") as value from staging cluster by key PREHOOK: type: QUERY PREHOOK: Input: default@staging PREHOOK: Output: default@parquet_jointable1_bucketed_sorted POSTHOOK: query: insert overwrite table parquet_jointable1_bucketed_sorted select key,concat(value,"value1") as value from staging cluster by key POSTHOOK: type: QUERY POSTHOOK: Input: default@staging POSTHOOK: Output: default@parquet_jointable1_bucketed_sorted POSTHOOK: Lineage: parquet_jointable1_bucketed_sorted.key SIMPLE [(staging)staging.FieldSchema(name:key, type:int, comment:null), ] POSTHOOK: Lineage: parquet_jointable1_bucketed_sorted.value EXPRESSION [(staging)staging.FieldSchema(name:value, type:string, comment:null), ] PREHOOK: query: create table parquet_jointable2_bucketed_sorted (key int,value1 string, value2 string) clustered by (key) sorted by (key ASC) INTO 1 BUCKETS stored as parquet PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@parquet_jointable2_bucketed_sorted POSTHOOK: query: create table parquet_jointable2_bucketed_sorted (key int,value1 string, value2 string) clustered by (key) sorted by (key ASC) INTO 1 BUCKETS stored as parquet POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@parquet_jointable2_bucketed_sorted PREHOOK: query: insert overwrite table parquet_jointable2_bucketed_sorted select key,concat(value,"value2-1") as value1,concat(value,"value2-2") as value2 from staging cluster by key PREHOOK: type: QUERY PREHOOK: Input: default@staging PREHOOK: Output: default@parquet_jointable2_bucketed_sorted POSTHOOK: query: insert overwrite table parquet_jointable2_bucketed_sorted select key,concat(value,"value2-1") as value1,concat(value,"value2-2") as value2 from staging cluster by key POSTHOOK: type: QUERY POSTHOOK: Input: default@staging POSTHOOK: Output: default@parquet_jointable2_bucketed_sorted POSTHOOK: Lineage: parquet_jointable2_bucketed_sorted.key SIMPLE [(staging)staging.FieldSchema(name:key, type:int, comment:null), ] POSTHOOK: Lineage: parquet_jointable2_bucketed_sorted.value1 EXPRESSION [(staging)staging.FieldSchema(name:value, type:string, comment:null), ] POSTHOOK: Lineage: parquet_jointable2_bucketed_sorted.value2 EXPRESSION [(staging)staging.FieldSchema(name:value, type:string, comment:null), ] PREHOOK: query: explain select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key PREHOOK: type: QUERY POSTHOOK: query: explain select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage Stage-0 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-1 Map Reduce Map Operator Tree: TableScan alias: p2 Statistics: Num rows: 2 Data size: 6 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE Sorted Merge Bucket Map Join Operator condition map: Inner Join 0 to 1 keys: 0 key (type: int) 1 key (type: int) outputColumnNames: _col1, _col7 Select Operator expressions: _col1 (type: string), _col7 (type: string) outputColumnNames: _col0, _col1 File Output Operator compressed: false table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator limit: -1 Processor Tree: ListSink PREHOOK: query: select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key PREHOOK: type: QUERY PREHOOK: Input: default@parquet_jointable1_bucketed_sorted PREHOOK: Input: default@parquet_jointable2_bucketed_sorted #### A masked pattern was here #### POSTHOOK: query: select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key POSTHOOK: type: QUERY POSTHOOK: Input: default@parquet_jointable1_bucketed_sorted POSTHOOK: Input: default@parquet_jointable2_bucketed_sorted #### A masked pattern was here #### val_0value1 val_0value2-2 val_10value1 val_10value2-2