PREHOOK: query: -- Set the threshold to -1 to guarantee dictionary encoding is turned off -- Tests that the data can be read back correctly when a string column is stored -- without dictionary encoding CREATE TABLE test_orc (key STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' PREHOOK: type: CREATETABLE POSTHOOK: query: -- Set the threshold to -1 to guarantee dictionary encoding is turned off -- Tests that the data can be read back correctly when a string column is stored -- without dictionary encoding CREATE TABLE test_orc (key STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' POSTHOOK: type: CREATETABLE POSTHOOK: Output: default@test_orc PREHOOK: query: -- should be single split INSERT OVERWRITE TABLE test_orc SELECT key FROM src TABLESAMPLE (10 ROWS) PREHOOK: type: QUERY PREHOOK: Input: default@src PREHOOK: Output: default@test_orc POSTHOOK: query: -- should be single split INSERT OVERWRITE TABLE test_orc SELECT key FROM src TABLESAMPLE (10 ROWS) POSTHOOK: type: QUERY POSTHOOK: Input: default@src POSTHOOK: Output: default@test_orc POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] PREHOOK: query: -- Test reading the column back SELECT * FROM test_orc PREHOOK: type: QUERY PREHOOK: Input: default@test_orc #### A masked pattern was here #### POSTHOOK: query: -- Test reading the column back SELECT * FROM test_orc POSTHOOK: type: QUERY POSTHOOK: Input: default@test_orc #### A masked pattern was here #### POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] 238 86 311 27 165 409 255 278 98 484 PREHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1') PREHOOK: type: ALTERTABLE_SERDEPROPERTIES PREHOOK: Input: default@test_orc PREHOOK: Output: default@test_orc POSTHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1') POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES POSTHOOK: Input: default@test_orc POSTHOOK: Output: default@test_orc POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] PREHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE PREHOOK: type: CREATETABLE POSTHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE POSTHOOK: type: CREATETABLE POSTHOOK: Output: default@src_thousand POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/kv1kv2.cogroup.txt' INTO TABLE src_thousand PREHOOK: type: LOAD PREHOOK: Output: default@src_thousand POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/kv1kv2.cogroup.txt' INTO TABLE src_thousand POSTHOOK: type: LOAD POSTHOOK: Output: default@src_thousand POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] PREHOOK: query: -- Add data to the table in such a way that alternate stripes encode the column -- differently. Setting orc.stripe.size = 1 guarantees the stripes each have -- 5000 rows. The first stripe will have 5 * 630 distinct rows and thus be -- above the cutoff of 50% and will be direct encoded. The second stripe -- will have 5 * 1 distinct rows and thus be under the cutoff and will be -- dictionary encoded. The final stripe will have 630 out of 1000 and be -- direct encoded. INSERT OVERWRITE TABLE test_orc SELECT key FROM ( SELECT CONCAT("a", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("b", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("c", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("d", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("e", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("f", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("g", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("h", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("i", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("j", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("k", key) AS key FROM src_thousand ) a ORDER BY key LIMIT 11000 PREHOOK: type: QUERY PREHOOK: Input: default@src_thousand PREHOOK: Output: default@test_orc POSTHOOK: query: -- Add data to the table in such a way that alternate stripes encode the column -- differently. Setting orc.stripe.size = 1 guarantees the stripes each have -- 5000 rows. The first stripe will have 5 * 630 distinct rows and thus be -- above the cutoff of 50% and will be direct encoded. The second stripe -- will have 5 * 1 distinct rows and thus be under the cutoff and will be -- dictionary encoded. The final stripe will have 630 out of 1000 and be -- direct encoded. INSERT OVERWRITE TABLE test_orc SELECT key FROM ( SELECT CONCAT("a", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("b", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("c", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("d", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("e", key) AS key FROM src_thousand UNION ALL SELECT CONCAT("f", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("g", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("h", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("i", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("j", 1) AS key FROM src_thousand UNION ALL SELECT CONCAT("k", key) AS key FROM src_thousand ) a ORDER BY key LIMIT 11000 POSTHOOK: type: QUERY POSTHOOK: Input: default@src_thousand POSTHOOK: Output: default@test_orc POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ] PREHOOK: query: SELECT SUM(HASH(key)) FROM test_orc PREHOOK: type: QUERY PREHOOK: Input: default@test_orc #### A masked pattern was here #### POSTHOOK: query: SELECT SUM(HASH(key)) FROM test_orc POSTHOOK: type: QUERY POSTHOOK: Input: default@test_orc #### A masked pattern was here #### POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ] 1082202951192