PREHOOK: query: -- This tests inferring how data is bucketed/sorted from the operators in the reducer -- and populating that information in partitions' metadata. In particular, those cases -- where multi insert is used. CREATE TABLE test_table (key STRING, value STRING) PARTITIONED BY (part STRING) PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@test_table POSTHOOK: query: -- This tests inferring how data is bucketed/sorted from the operators in the reducer -- and populating that information in partitions' metadata. In particular, those cases -- where multi insert is used. CREATE TABLE test_table (key STRING, value STRING) PARTITIONED BY (part STRING) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@test_table PREHOOK: query: -- Simple case, neither partition should be bucketed or sorted FROM src INSERT OVERWRITE TABLE test_table PARTITION (part = '1') SELECT key, value INSERT OVERWRITE TABLE test_table PARTITION (part = '2') SELECT value, key PREHOOK: type: QUERY PREHOOK: Input: default@src PREHOOK: Output: default@test_table@part=1 PREHOOK: Output: default@test_table@part=2 POSTHOOK: query: -- Simple case, neither partition should be bucketed or sorted FROM src INSERT OVERWRITE TABLE test_table PARTITION (part = '1') SELECT key, value INSERT OVERWRITE TABLE test_table PARTITION (part = '2') SELECT value, key POSTHOOK: type: QUERY POSTHOOK: Input: default@src POSTHOOK: Output: default@test_table@part=1 POSTHOOK: Output: default@test_table@part=2 POSTHOOK: Lineage: test_table PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: test_table PARTITION(part=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] POSTHOOK: Lineage: test_table PARTITION(part=2).key SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] POSTHOOK: Lineage: test_table PARTITION(part=2).value SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] PREHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '1') PREHOOK: type: DESCTABLE PREHOOK: Input: default@test_table POSTHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '1') POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_table # col_name data_type comment key string value string # Partition Information # col_name data_type comment part string # Detailed Partition Information Partition Value: [1] Database: default Table: test_table #### A masked pattern was here #### Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 500 rawDataSize 5312 totalSize 5812 #### A masked pattern was here #### # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: -1 Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 PREHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '2') PREHOOK: type: DESCTABLE PREHOOK: Input: default@test_table POSTHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '2') POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_table # col_name data_type comment key string value string # Partition Information # col_name data_type comment part string # Detailed Partition Information Partition Value: [2] Database: default Table: test_table #### A masked pattern was here #### Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 500 rawDataSize 5312 totalSize 5812 #### A masked pattern was here #### # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: -1 Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 PREHOOK: query: -- The partitions should be bucketed and sorted by different keys FROM src INSERT OVERWRITE TABLE test_table PARTITION (part = '1') SELECT key, COUNT(*) GROUP BY key INSERT OVERWRITE TABLE test_table PARTITION (part = '2') SELECT COUNT(*), value GROUP BY value PREHOOK: type: QUERY PREHOOK: Input: default@src PREHOOK: Output: default@test_table@part=1 PREHOOK: Output: default@test_table@part=2 POSTHOOK: query: -- The partitions should be bucketed and sorted by different keys FROM src INSERT OVERWRITE TABLE test_table PARTITION (part = '1') SELECT key, COUNT(*) GROUP BY key INSERT OVERWRITE TABLE test_table PARTITION (part = '2') SELECT COUNT(*), value GROUP BY value POSTHOOK: type: QUERY POSTHOOK: Input: default@src POSTHOOK: Output: default@test_table@part=1 POSTHOOK: Output: default@test_table@part=2 POSTHOOK: Lineage: test_table PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: test_table PARTITION(part=1).value EXPRESSION [(src)src.null, ] POSTHOOK: Lineage: test_table PARTITION(part=2).key EXPRESSION [(src)src.null, ] POSTHOOK: Lineage: test_table PARTITION(part=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] PREHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '1') PREHOOK: type: DESCTABLE PREHOOK: Input: default@test_table POSTHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '1') POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_table # col_name data_type comment key string value string # Partition Information # col_name data_type comment part string # Detailed Partition Information Partition Value: [1] Database: default Table: test_table #### A masked pattern was here #### Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 309 rawDataSize 1482 totalSize 1791 #### A masked pattern was here #### # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: 1 Bucket Columns: [key] Sort Columns: [Order(col:key, order:1)] Storage Desc Params: serialization.format 1 PREHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '2') PREHOOK: type: DESCTABLE PREHOOK: Input: default@test_table POSTHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '2') POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_table # col_name data_type comment key string value string # Partition Information # col_name data_type comment part string # Detailed Partition Information Partition Value: [2] Database: default Table: test_table #### A masked pattern was here #### Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 309 rawDataSize 2718 totalSize 3027 #### A masked pattern was here #### # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: 1 Bucket Columns: [value] Sort Columns: [Order(col:value, order:1)] Storage Desc Params: serialization.format 1 PREHOOK: query: -- The first partition should be bucketed and sorted, the second should not FROM src INSERT OVERWRITE TABLE test_table PARTITION (part = '1') SELECT key, COUNT(*) GROUP BY key INSERT OVERWRITE TABLE test_table PARTITION (part = '2') SELECT key, value PREHOOK: type: QUERY PREHOOK: Input: default@src PREHOOK: Output: default@test_table@part=1 PREHOOK: Output: default@test_table@part=2 POSTHOOK: query: -- The first partition should be bucketed and sorted, the second should not FROM src INSERT OVERWRITE TABLE test_table PARTITION (part = '1') SELECT key, COUNT(*) GROUP BY key INSERT OVERWRITE TABLE test_table PARTITION (part = '2') SELECT key, value POSTHOOK: type: QUERY POSTHOOK: Input: default@src POSTHOOK: Output: default@test_table@part=1 POSTHOOK: Output: default@test_table@part=2 POSTHOOK: Lineage: test_table PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: test_table PARTITION(part=1).value EXPRESSION [(src)src.null, ] POSTHOOK: Lineage: test_table PARTITION(part=2).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: test_table PARTITION(part=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] PREHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '1') PREHOOK: type: DESCTABLE PREHOOK: Input: default@test_table POSTHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '1') POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_table # col_name data_type comment key string value string # Partition Information # col_name data_type comment part string # Detailed Partition Information Partition Value: [1] Database: default Table: test_table #### A masked pattern was here #### Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 309 rawDataSize 1482 totalSize 1791 #### A masked pattern was here #### # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: 1 Bucket Columns: [key] Sort Columns: [Order(col:key, order:1)] Storage Desc Params: serialization.format 1 PREHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '2') PREHOOK: type: DESCTABLE PREHOOK: Input: default@test_table POSTHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '2') POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_table # col_name data_type comment key string value string # Partition Information # col_name data_type comment part string # Detailed Partition Information Partition Value: [2] Database: default Table: test_table #### A masked pattern was here #### Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 500 rawDataSize 5312 totalSize 5812 #### A masked pattern was here #### # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: -1 Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 PREHOOK: query: -- Test the multi group by single reducer optimization -- Both partitions should be bucketed by key FROM src INSERT OVERWRITE TABLE test_table PARTITION (part = '1') SELECT key, COUNT(*) GROUP BY key INSERT OVERWRITE TABLE test_table PARTITION (part = '2') SELECT key, SUM(SUBSTR(value, 5)) GROUP BY key PREHOOK: type: QUERY PREHOOK: Input: default@src PREHOOK: Output: default@test_table@part=1 PREHOOK: Output: default@test_table@part=2 POSTHOOK: query: -- Test the multi group by single reducer optimization -- Both partitions should be bucketed by key FROM src INSERT OVERWRITE TABLE test_table PARTITION (part = '1') SELECT key, COUNT(*) GROUP BY key INSERT OVERWRITE TABLE test_table PARTITION (part = '2') SELECT key, SUM(SUBSTR(value, 5)) GROUP BY key POSTHOOK: type: QUERY POSTHOOK: Input: default@src POSTHOOK: Output: default@test_table@part=1 POSTHOOK: Output: default@test_table@part=2 POSTHOOK: Lineage: test_table PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: test_table PARTITION(part=1).value EXPRESSION [(src)src.null, ] POSTHOOK: Lineage: test_table PARTITION(part=2).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: test_table PARTITION(part=2).value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] PREHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '1') PREHOOK: type: DESCTABLE PREHOOK: Input: default@test_table POSTHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '1') POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_table # col_name data_type comment key string value string # Partition Information # col_name data_type comment part string # Detailed Partition Information Partition Value: [1] Database: default Table: test_table #### A masked pattern was here #### Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 309 rawDataSize 1482 totalSize 1791 #### A masked pattern was here #### # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: 1 Bucket Columns: [key] Sort Columns: [Order(col:key, order:1)] Storage Desc Params: serialization.format 1 PREHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '2') PREHOOK: type: DESCTABLE PREHOOK: Input: default@test_table POSTHOOK: query: DESCRIBE FORMATTED test_table PARTITION (part = '2') POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_table # col_name data_type comment key string value string # Partition Information # col_name data_type comment part string # Detailed Partition Information Partition Value: [2] Database: default Table: test_table #### A masked pattern was here #### Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 309 rawDataSize 2690 totalSize 2999 #### A masked pattern was here #### # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: 1 Bucket Columns: [key] Sort Columns: [Order(col:key, order:1)] Storage Desc Params: serialization.format 1