Saving all output to "!!{outputDirectory}!!/groupby_sort_1.q.raw". Enter "record" with no arguments to stop it. >>> !run !!{qFileDirectory}!!/groupby_sort_1.q >>> set hive.enforce.bucketing = true; No rows affected >>> set hive.enforce.sorting = true; No rows affected >>> set hive.exec.reducers.max = 10; No rows affected >>> set hive.map.groupby.sorted=true; No rows affected >>> >>> CREATE TABLE T1(key STRING, val STRING) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; No rows affected >>> >>> LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; No rows affected >>> >>> -- perform an insert to make sure there are 2 files >>> INSERT OVERWRITE TABLE T1 select key, val from T1; 'key','val' No rows selected >>> >>> CREATE TABLE outputTbl1(key int, cnt int); No rows affected >>> >>> -- The plan should be converted to a map-side group by if the group by key >>> -- matches the sorted key >>> -- adding an order by at the end to make the test results deterministic >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT key, count(1) FROM T1 GROUP BY key; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' 
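-- The plan above ends in a single map-side Group By Operator ("mode: final") with
-- no Reduce Output Operator: the aggregation finishes entirely in the map phase.
-- That rewrite fires only because T1 is bucketed and sorted on the group-by key
-- and hive.map.groupby.sorted is enabled. A minimal sketch of the preconditions,
-- using a hypothetical table name (t_sorted) rather than this test's T1:
set hive.enforce.bucketing = true;
set hive.enforce.sorting = true;
set hive.map.groupby.sorted = true;
CREATE TABLE t_sorted(key STRING, val STRING)
CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
-- grouping on the sort key should then plan as a map-only job:
EXPLAIN SELECT key, count(1) FROM t_sorted GROUP BY key;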
'' '' 154 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl1 SELECT key, count(1) FROM T1 GROUP BY key; '_col0','_col1' No rows selected >>> >>> SELECT * FROM outputTbl1 ORDER BY key; 'key','cnt' '1','1' '2','1' '3','1' '7','1' '8','2' 5 rows selected >>> >>> CREATE TABLE outputTbl2(key1 int, key2 string, cnt int); No rows affected >>> >>> -- no map-side group by even if the group by key is a superset of the sorted key >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' expr: val' ' type: string' ' outputColumnNames: key, val' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' expr: val' ' type: string' ' mode: hash' ' outputColumnNames: _col0, _col1, _col2' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: string' ' sort order: ++' ' Map-reduce partition columns:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: string' ' tag: -1' ' value expressions:' ' expr: _col2' ' type: bigint' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' ' Reduce Operator Tree:' ' Group By Operator' ' aggregations:' ' expr: count(VALUE._col0)' ' bucketGroup: false' ' keys:' ' expr: KEY._col0' ' type: string' ' expr: KEY._col1' ' type: string' ' mode: mergepartial' ' outputColumnNames: _col0, _col1, _col2' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: string' ' expr: _col2' ' type: bigint' ' outputColumnNames: _col0, _col1, _col2' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: _col1' ' type: string' ' expr: UDFToInteger(_col2)' ' type: int' ' outputColumnNames: _col0, _col1, _col2' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,cnt' ' columns.types int:string:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl2' ' name groupby_sort_1.outputtbl2' ' serialization.ddl struct outputtbl2 { i32 key1, string key2, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl2' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,cnt' ' columns.types int:string:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl2' ' name groupby_sort_1.outputtbl2' ' serialization.ddl struct outputtbl2 { i32 key1, string key2, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl2' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' '' '' 190 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl2 SELECT key, val, count(1) FROM T1 GROUP BY key, val; '_col0','_col1','_col2' No rows selected >>> >>> SELECT * FROM outputTbl2 ORDER BY key1, key2; 'key1','key2','cnt' '1','11','1' '2','12','1' '3','13','1' '7','17','1' '8','18','1' '8','28','1' 6 rows selected >>> >>> -- It should work for sub-queries >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' subq1:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: _col0' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' outputColumnNames: _col0' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: _col0' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 15' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 20' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [subq1:t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 15' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 20' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' 
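-- The rewrite survives the pass-through sub-query above because subq1 projects
-- the sorted key unchanged, so the map-side Group By Operator still runs in
-- "mode: final" with no reduce stage. By contrast (a hypothetical query, not run
-- in this test), rewriting the key inside the sub-query should disqualify the
-- optimization, much like the key + 1 case later in this file:
EXPLAIN SELECT k, count(1) FROM (SELECT concat(key, '') AS k FROM T1) subq1 GROUP BY k;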
'' '' 169 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl1 SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key; '_col0','_col1' No rows selected >>> >>> SELECT * FROM outputTbl1 ORDER BY key; 'key','cnt' '1','1' '2','1' '3','1' '7','1' '8','2' 5 rows selected >>> >>> -- It should work for sub-queries with column aliases >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) k) (TOK_SELEXPR (TOK_TABLE_OR_COL val) v)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL k)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL k))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' subq1:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: _col0' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' outputColumnNames: _col0' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: _col0' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 15' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 20' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [subq1:t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 15' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 20' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' 
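-- Column aliases alone (key AS k) do not change the underlying column, so the
-- optimizer still matches k to T1's sort key and keeps the aggregation map-only.
-- Stripped to plain EXPLAIN for brevity, the aliased and un-aliased forms should
-- produce the same plan shape:
EXPLAIN SELECT key, count(1) FROM (SELECT key, val FROM T1) subq1 GROUP BY key;
EXPLAIN SELECT k, count(1) FROM (SELECT key AS k, val AS v FROM T1) subq1 GROUP BY k;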
'' '' 169 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl1 SELECT k, count(1) FROM (SELECT key as k, val as v FROM T1) subq1 GROUP BY k; '_col0','_col1' No rows selected >>> >>> SELECT * FROM outputTbl1 ORDER BY key; 'key','cnt' '1','1' '2','1' '3','1' '7','1' '8','2' 5 rows selected >>> >>> CREATE TABLE outputTbl3(key1 int, key2 int, cnt int); No rows affected >>> >>> -- The plan should be converted to a map-side group by if the group by key contains a constant followed >>> -- by a match to the sorted key >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl3))) (TOK_SELECT (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY 1 (TOK_TABLE_OR_COL key))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: 1' ' type: int' ' expr: key' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1, _col2' ' Select Operator' ' expressions:' ' expr: _col0' ' type: int' ' expr: _col1' ' type: string' ' expr: _col2' ' type: bigint' ' outputColumnNames: _col0, _col1, _col2' ' Select Operator' ' expressions:' ' expr: _col0' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' expr: UDFToInteger(_col2)' ' type: int' ' outputColumnNames: _col0, _col1, _col2' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,cnt' ' columns.types int:int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl3' ' name groupby_sort_1.outputtbl3' ' serialization.ddl struct outputtbl3 { i32 key1, i32 key2, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl3' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,cnt' ' columns.types int:int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl3' ' name groupby_sort_1.outputtbl3' ' serialization.ddl struct outputtbl3 { i32 key1, i32 key2, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl3' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' 
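-- The constant 1 in GROUP BY 1, key adds no grouping power: every row carries
-- the same constant, so the groups are exactly those of GROUP BY key and the
-- plan above keeps the map-only "mode: final" shape. An illustrative check
-- that the per-key counts agree between the two forms:
SELECT key, count(1) FROM T1 GROUP BY key;
SELECT 1, key, count(1) FROM T1 GROUP BY 1, key;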
'' '' 160 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl3 SELECT 1, key, count(1) FROM T1 GROUP BY 1, key; '_col0','_col1','_col2' No rows selected >>> >>> SELECT * FROM outputTbl3 ORDER BY key1, key2; 'key1','key2','cnt' '1','1','1' '1','2','1' '1','3','1' '1','7','1' '1','8','2' 5 rows selected >>> >>> CREATE TABLE outputTbl4(key1 int, key2 int, key3 string, cnt int); No rows affected >>> >>> -- no map-side group by if the group by key contains a constant followed by another column >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl4))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' expr: val' ' type: string' ' outputColumnNames: key, val' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' expr: 1' ' type: int' ' expr: val' ' type: string' ' mode: hash' ' outputColumnNames: _col0, _col1, _col2, _col3' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' sort order: +++' ' Map-reduce partition columns:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' tag: -1' ' value expressions:' ' expr: _col3' ' type: bigint' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' ' Reduce Operator Tree:' ' Group By Operator' ' aggregations:' ' expr: count(VALUE._col0)' ' bucketGroup: false' ' keys:' ' expr: KEY._col0' ' type: string' ' expr: KEY._col1' ' type: int' ' expr: KEY._col2' ' type: string' ' mode: mergepartial' ' outputColumnNames: _col0, _col1, _col2, _col3' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' expr: _col3' ' type: bigint' ' outputColumnNames: _col0, _col1, _col2, _col3' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' expr: UDFToInteger(_col3)' ' type: int' ' outputColumnNames: _col0, _col1, _col2, _col3' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,key3,cnt' ' columns.types int:int:string:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl4' ' name groupby_sort_1.outputtbl4' ' serialization.ddl struct outputtbl4 { i32 key1, i32 key2, string key3, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl4' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,key3,cnt' ' columns.types int:int:string:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl4' ' name groupby_sort_1.outputtbl4' ' serialization.ddl struct outputtbl4 { i32 key1, i32 key2, string key3, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl4' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' '' '' 202 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T1 GROUP BY key, 1, val; '_col0','_col1','_col2','_col3' No rows selected >>> >>> SELECT * FROM outputTbl4 ORDER BY key1, key2, key3; 'key1','key2','key3','cnt' '1','1','11','1' '2','1','12','1' '3','1','13','1' '7','1','17','1' '8','1','18','1' '8','1','28','1' 6 rows selected >>> >>> -- no map-side group by if the group by key contains a function >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl3 SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl3))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) 1)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (+ (TOK_TABLE_OR_COL key) 1))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' expr: (key + 1)' ' type: double' ' mode: hash' ' outputColumnNames: _col0, _col1, _col2' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: double' ' sort order: ++' ' Map-reduce partition columns:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: double' ' tag: -1' ' value expressions:' ' expr: _col2' ' type: bigint' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl 
struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' ' Reduce Operator Tree:' ' Group By Operator' ' aggregations:' ' expr: count(VALUE._col0)' ' bucketGroup: false' ' keys:' ' expr: KEY._col0' ' type: string' ' expr: KEY._col1' ' type: double' ' mode: mergepartial' ' outputColumnNames: _col0, _col1, _col2' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: double' ' expr: _col2' ' type: bigint' ' outputColumnNames: _col0, _col1, _col2' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' expr: UDFToInteger(_col2)' ' type: int' ' outputColumnNames: _col0, _col1, _col2' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,cnt' ' columns.types int:int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl3' ' name groupby_sort_1.outputtbl3' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 25' ' serialization.ddl struct outputtbl3 { i32 key1, i32 key2, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl3' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,cnt' ' columns.types int:int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl3' ' name groupby_sort_1.outputtbl3' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 25' ' serialization.ddl struct outputtbl3 { i32 key1, i32 key2, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl3' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' '' '' 198 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl3 SELECT key, key + 1, count(1) FROM T1 GROUP BY key, key + 1; '_col0','_col1','_col2' No rows selected >>> >>> SELECT * FROM outputTbl3 ORDER BY key1, key2; 'key1','key2','cnt' '1','2','1' '2','3','1' '3','4','1' '7','8','1' '8','9','2' 5 rows selected >>> >>> -- it should not matter what follows the group by >>> -- test various cases >>> >>> -- group by followed by another group by >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT key + key, sum(cnt) from (SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 group by key + key; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL cnt)))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' subq1:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Group By Operator' ' aggregations:' ' expr: sum(_col1)' ' bucketGroup: false' ' keys:' ' expr: (_col0 + _col0)' ' type: double' ' mode: hash' ' outputColumnNames: _col0, _col1' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: double' ' sort order: +' ' Map-reduce partition columns:' ' expr: _col0' ' type: double' ' tag: -1' ' value expressions:' ' expr: _col1' ' type: bigint' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [subq1:t1]' ' Path -> Partition:' ' 
!!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' ' Reduce Operator Tree:' ' Group By Operator' ' aggregations:' ' expr: sum(VALUE._col0)' ' bucketGroup: false' ' keys:' ' expr: KEY._col0' ' type: double' ' mode: mergepartial' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: double' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 15' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 20' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 15' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 20' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' '' '' 209 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl1 SELECT key + key, sum(cnt) from (SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 group by key + key; '_col0','_col1' No rows selected >>> >>> SELECT * FROM outputTbl1 ORDER BY key; 'key','cnt' '2','1' '4','1' '6','1' '14','1' '16','2' 5 rows selected >>> >>> -- group by followed by a union >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT * FROM ( SELECT key, count(1) FROM T1 GROUP BY key UNION ALL SELECT key, count(1) FROM T1 GROUP BY key ) subq1; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' null-subquery1:subq1-subquery1:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Union' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 17' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 22' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' null-subquery2:subq1-subquery2:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Union' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 17' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 22' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [null-subquery1:subq1-subquery1:t1, null-subquery2:subq1-subquery2:t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 17' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 22' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' 
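-- Both branches of the union above group on the sorted key, so each branch
-- collapses to a map-only group by and the UNION ALL fits in the single Stage-1
-- that writes outputTbl1 directly. Contrast with the next query in this file,
-- where one branch groups on (key + key): that branch cannot use the rewrite
-- and is planned as its own map-reduce stage (Stage-4) feeding the union. The
-- two branch shapes, reduced to plain SELECTs:
-- map-only branch (matches the sort key):
SELECT key, count(1) FROM T1 GROUP BY key;
-- branch that needs a reducer (function of the key):
SELECT key + key AS key, count(1) FROM T1 GROUP BY key + key;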
'' '' 243 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl1 SELECT * FROM ( SELECT key, count(1) FROM T1 GROUP BY key UNION ALL SELECT key, count(1) FROM T1 GROUP BY key ) subq1; '_col0','_col1' No rows selected >>> >>> SELECT * FROM outputTbl1 ORDER BY key; 'key','cnt' '1','1' '1','1' '2','1' '2','1' '3','1' '3','1' '7','1' '7','1' '8','2' '8','2' 10 rows selected >>> >>> -- group by followed by a union where one of the sub-queries is map-side group by >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT * FROM ( SELECT key, count(1) FROM T1 GROUP BY key UNION ALL SELECT key + key as key, count(1) FROM T1 GROUP BY key + key ) subq1; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)) key) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))' '' 'STAGE DEPENDENCIES:' ' Stage-4 is a root stage' ' Stage-2 depends on stages: Stage-4' ' Stage-0 depends on stages: Stage-2' ' Stage-3 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-4' ' Map Reduce' ' Alias -> Map Operator Tree:' ' null-subquery2:subq1-subquery2:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: (key + key)' ' type: double' ' mode: hash' ' outputColumnNames: _col0, _col1' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: double' ' sort order: +' ' Map-reduce partition columns:' ' expr: _col0' ' type: double' ' tag: -1' ' value expressions:' ' expr: _col1' ' type: bigint' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [null-subquery2:subq1-subquery2:t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' ' Reduce Operator Tree:' ' Group By Operator' ' aggregations:' ' expr: count(VALUE._col0)' ' bucketGroup: false' ' keys:' ' expr: KEY._col0' ' type: double' ' mode: mergepartial' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: double' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 0' ' directory: file:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' table:' ' input format: org.apache.hadoop.mapred.SequenceFileInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat' ' properties:' ' columns _col0,_col1' ' columns.types double,bigint' ' escape.delim \' ' TotalFiles: 1' ' GatherStats: false' ' MultiFileSpray: false' '' ' Stage: Stage-2' ' Map Reduce' ' Alias -> Map Operator Tree:' ' file:!!{hive.exec.scratchdir}!! ' ' TableScan' ' GatherStats: false' ' Union' ' Select Operator' ' expressions:' ' expr: _col0' ' type: double' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 10' ' rawDataSize 30' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 40' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' null-subquery1:subq1-subquery1:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Union' ' Select Operator' ' expressions:' ' expr: _col0' ' type: double' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 10' ' rawDataSize 30' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 40' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' Needs Tagging: false' ' Path -> Alias:' ' file:!!{hive.exec.scratchdir}!! [file:!!{hive.exec.scratchdir}!!]' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [null-subquery1:subq1-subquery1:t1]' ' Path -> Partition:' ' file:!!{hive.exec.scratchdir}!! 
' ' Partition' ' base file name: -mr-10002' ' input format: org.apache.hadoop.mapred.SequenceFileInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat' ' properties:' ' columns _col0,_col1' ' columns.types double,bigint' ' escape.delim \' ' ' ' input format: org.apache.hadoop.mapred.SequenceFileInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat' ' properties:' ' columns _col0,_col1' ' columns.types double,bigint' ' escape.delim \' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 10' ' rawDataSize 30' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 40' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-3' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' 
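
In the plan above, only the first branch keeps the map-side rewrite: GROUP BY key matches T1's sort column, but GROUP BY key + key groups on an expression over the sorted column, so its groups are no longer guaranteed to arrive contiguously in a mapper. That branch therefore runs as a conventional two-phase aggregation in Stage-4 (mode: hash in the map, mergepartial in the reduce), lands in a scratch SequenceFile, and Stage-2 unions it with the map-side branch before the final file sink. The contrast in miniature, against the session's T1 (a sketch, not recorded output):

  -- rewritten to a single map-only aggregation (mode: final, no shuffle)
  EXPLAIN SELECT key, count(1) FROM T1 GROUP BY key;

  -- expression over the sort column: falls back to map-side hash
  -- aggregation plus a reduce-side mergepartial
  EXPLAIN SELECT key + key, count(1) FROM T1 GROUP BY key + key;

Note also that key + key over a STRING column is evaluated as DOUBLE, which is why the second branch's keys show type: double in the plan and pass through UDFToInteger before landing in outputTbl1's int key column.
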
'' '' 359 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl1 SELECT * FROM ( SELECT key, count(1) as cnt FROM T1 GROUP BY key UNION ALL SELECT key + key as key, count(1) as cnt FROM T1 GROUP BY key + key ) subq1; '_col0','_col1' No rows selected >>> >>> SELECT * FROM outputTbl1 ORDER BY key; 'key','cnt' '','1' '','1' '','1' '','1' '','2' '1','1' '2','1' '3','1' '7','1' '8','2' 10 rows selected >>> >>> -- group by followed by a join >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT subq1.key, subq1.cnt+subq2.cnt FROM (SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 JOIN (SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq2 ON subq1.key = subq2.key; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key)) (TOK_SELEXPR (+ (. (TOK_TABLE_OR_COL subq1) cnt) (. (TOK_TABLE_OR_COL subq2) cnt))))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' subq1:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: string' ' sort order: +' ' Map-reduce partition columns:' ' expr: _col0' ' type: string' ' tag: 0' ' value expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' subq2:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: string' ' sort order: +' ' Map-reduce partition columns:' ' expr: _col0' ' type: string' ' tag: 1' ' value expressions:' ' expr: _col1' ' type: bigint' ' Needs Tagging: true' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [subq1:t1, subq2:t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' 
bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' ' Reduce Operator Tree:' ' Join Operator' ' condition map:' ' Inner Join 0 to 1' ' condition expressions:' ' 0 {VALUE._col0} {VALUE._col1}' ' 1 {VALUE._col1}' ' handleSkewJoin: false' ' outputColumnNames: _col0, _col1, _col3' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: (_col1 + _col3)' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 2' ' numPartitions 0' ' numRows 10' ' rawDataSize 35' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 45' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 2' ' numPartitions 0' ' numRows 10' ' rawDataSize 35' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 45' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' '' '' 231 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl1 SELECT subq1.key, subq1.cnt+subq2.cnt FROM (SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq1 JOIN (SELECT key, count(1) as cnt FROM T1 GROUP BY key) subq2 ON subq1.key = subq2.key; '_col0','_col1' No rows selected >>> >>> SELECT * FROM outputTbl1 ORDER BY key; 'key','cnt' '1','2' '2','2' '3','2' '7','2' '8','4' 5 rows selected >>> >>> -- group by followed by a join where one of the sub-queries can be performed in the mapper >>> EXPLAIN EXTENDED SELECT * FROM (SELECT key, count(1) FROM T1 GROUP BY key) subq1 JOIN (SELECT key, val, count(1) FROM T1 GROUP BY key, val) subq2 ON subq1.key = subq2.key; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. 
(TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))' '' 'STAGE DEPENDENCIES:' ' Stage-2 is a root stage' ' Stage-1 depends on stages: Stage-2' ' Stage-0 is a root stage' '' 'STAGE PLANS:' ' Stage: Stage-2' ' Map Reduce' ' Alias -> Map Operator Tree:' ' subq2:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' expr: val' ' type: string' ' outputColumnNames: key, val' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' expr: val' ' type: string' ' mode: hash' ' outputColumnNames: _col0, _col1, _col2' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: string' ' sort order: ++' ' Map-reduce partition columns:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: string' ' tag: -1' ' value expressions:' ' expr: _col2' ' type: bigint' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [subq2:t1]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' ' Reduce Operator Tree:' ' Group By Operator' ' aggregations:' ' expr: count(VALUE._col0)' ' bucketGroup: false' ' keys:' ' expr: KEY._col0' ' type: string' ' expr: KEY._col1' ' type: string' ' mode: mergepartial' ' outputColumnNames: _col0, _col1, _col2' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: string' ' expr: _col2' ' type: bigint' ' outputColumnNames: _col0, _col1, _col2' ' File Output Operator' ' compressed: false' ' GlobalTableId: 0' ' directory: file:!!{hive.exec.scratchdir}!!' 
' NumFilesPerFileSink: 1' ' table:' ' input format: org.apache.hadoop.mapred.SequenceFileInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat' ' properties:' ' columns _col0,_col1,_col2' ' columns.types string,string,bigint' ' escape.delim \' ' TotalFiles: 1' ' GatherStats: false' ' MultiFileSpray: false' '' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' $INTNAME ' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: string' ' sort order: +' ' Map-reduce partition columns:' ' expr: _col0' ' type: string' ' tag: 1' ' value expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: string' ' expr: _col2' ' type: bigint' ' subq1:t1 ' ' TableScan' ' alias: t1' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: string' ' sort order: +' ' Map-reduce partition columns:' ' expr: _col0' ' type: string' ' tag: 0' ' value expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' Needs Tagging: true' ' Path -> Alias:' ' file:!!{hive.exec.scratchdir}!! [$INTNAME]' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 [subq1:t1]' ' Path -> Partition:' ' file:!!{hive.exec.scratchdir}!! ' ' Partition' ' base file name: -mr-10002' ' input format: org.apache.hadoop.mapred.SequenceFileInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat' ' properties:' ' columns _col0,_col1,_col2' ' columns.types string,string,bigint' ' escape.delim \' ' ' ' input format: org.apache.hadoop.mapred.SequenceFileInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat' ' properties:' ' columns _col0,_col1,_col2' ' columns.types string,string,bigint' ' escape.delim \' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1 ' ' Partition' ' base file name: t1' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t1' ' name groupby_sort_1.t1' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t1 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t1' ' name: groupby_sort_1.t1' ' Reduce Operator Tree:' ' Join Operator' ' condition map:' ' Inner Join 0 to 1' ' condition expressions:' ' 0 {VALUE._col0} {VALUE._col1}' ' 1 {VALUE._col0} {VALUE._col1} {VALUE._col2}' ' handleSkewJoin: false' ' outputColumnNames: _col0, _col1, _col2, _col3, _col4' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' expr: _col2' ' type: string' ' expr: _col3' ' type: string' ' expr: _col4' ' type: bigint' ' outputColumnNames: _col0, _col1, _col2, _col3, _col4' ' File Output Operator' ' compressed: false' ' GlobalTableId: 0' ' directory: file:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: file:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' columns _col0,_col1,_col2,_col3,_col4' ' columns.types string:bigint:string:string:bigint' ' escape.delim \' ' serialization.format 1' ' TotalFiles: 1' ' GatherStats: false' ' MultiFileSpray: false' '' ' Stage: Stage-0' ' Fetch Operator' ' limit: -1' '' '' 316 rows selected >>> >>> CREATE TABLE T2(key STRING, val STRING) CLUSTERED BY (key, val) SORTED BY (key, val) INTO 2 BUCKETS STORED AS TEXTFILE; No rows affected >>> >>> -- perform an insert to make sure there are 2 files >>> INSERT OVERWRITE TABLE T2 select key, val from T1; 'key','val' No rows selected >>> >>> -- no mapside sort group by if the group by is a prefix of the sorted key >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl1 SELECT key, count(1) FROM T2 GROUP BY key; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' t2 ' ' TableScan' ' alias: t2' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' outputColumnNames: key' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' mode: hash' ' outputColumnNames: _col0, _col1' ' Reduce Output Operator' ' key expressions:' ' expr: _col0' ' type: string' ' sort order: +' ' Map-reduce partition columns:' ' expr: _col0' ' type: string' ' tag: -1' ' 
value expressions:' ' expr: _col1' ' type: bigint' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 [t2]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 ' ' Partition' ' base file name: t2' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2' ' name groupby_sort_1.t2' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t2 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2' ' name groupby_sort_1.t2' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t2 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t2' ' name: groupby_sort_1.t2' ' Reduce Operator Tree:' ' Group By Operator' ' aggregations:' ' expr: count(VALUE._col0)' ' bucketGroup: false' ' keys:' ' expr: KEY._col0' ' type: string' ' mode: mergepartial' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: bigint' ' outputColumnNames: _col0, _col1' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: UDFToInteger(_col1)' ' type: int' ' outputColumnNames: _col0, _col1' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 15' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 20' ' transient_lastDdlTime !!UNIXTIME!!' 
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key,cnt' ' columns.types int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl1' ' name groupby_sort_1.outputtbl1' ' numFiles 1' ' numPartitions 0' ' numRows 5' ' rawDataSize 15' ' serialization.ddl struct outputtbl1 { i32 key, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 20' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl1' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' '' '' 186 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl1 SELECT key, count(1) FROM T2 GROUP BY key; '_col0','_col1' No rows selected >>> >>> SELECT * FROM outputTbl1 ORDER BY key; 'key','cnt' '1','1' '2','1' '3','1' '7','1' '8','2' 5 rows selected >>> >>> -- The plan should be converted to a map-side group by if the group by key contains a constant in between the >>> -- skewed keys >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl4))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' t2 ' ' TableScan' ' alias: t2' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' expr: val' ' type: string' ' outputColumnNames: key, val' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' expr: 1' ' type: int' ' expr: val' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1, _col2, _col3' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' expr: _col3' ' type: bigint' ' outputColumnNames: _col0, _col1, _col2, _col3' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' expr: UDFToInteger(_col3)' ' type: int' ' outputColumnNames: _col0, _col1, _col2, _col3' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,key3,cnt' ' columns.types int:int:string:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl4' ' name groupby_sort_1.outputtbl4' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 48' ' serialization.ddl struct outputtbl4 { i32 key1, i32 key2, string key3, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 54' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl4' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 [t2]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 ' ' Partition' ' base file name: t2' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2' ' name groupby_sort_1.t2' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t2 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2' ' name groupby_sort_1.t2' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t2 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t2' ' name: groupby_sort_1.t2' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,key3,cnt' ' columns.types int:int:string:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl4' ' name groupby_sort_1.outputtbl4' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 48' ' serialization.ddl struct outputtbl4 { i32 key1, i32 key2, string key3, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 54' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl4' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' '' '' 178 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl4 SELECT key, 1, val, count(1) FROM T2 GROUP BY key, 1, val; '_col0','_col1','_col2','_col3' No rows selected >>> >>> SELECT * FROM outputTbl4 ORDER BY key1, key2, key3; 'key1','key2','key3','cnt' '1','1','11','1' '2','1','12','1' '3','1','13','1' '7','1','17','1' '8','1','18','1' '8','1','28','1' 6 rows selected >>> >>> CREATE TABLE outputTbl5(key1 int, key2 int, key3 string, key4 int, cnt int); No rows affected >>> >>> -- The plan should be converted to a map-side group by if the group by key contains a constant in between the >>> -- skewed keys followed by anything >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl5))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) 1 (TOK_TABLE_OR_COL val) 2)))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' t2 ' ' TableScan' ' alias: t2' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' expr: val' ' type: string' ' outputColumnNames: key, val' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: key' ' type: string' ' expr: 1' ' type: int' ' expr: val' ' type: string' ' expr: 2' ' type: int' ' mode: final' ' outputColumnNames: _col0, _col1, _col2, _col3, _col4' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' expr: _col3' ' type: int' ' expr: _col4' ' type: bigint' ' outputColumnNames: _col0, _col1, _col2, _col3, _col4' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' expr: _col3' ' type: int' ' expr: UDFToInteger(_col4)' ' type: int' ' outputColumnNames: _col0, _col1, _col2, _col3, _col4' ' File Output Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,key3,key4,cnt' ' columns.types int:int:string:int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl5' ' name groupby_sort_1.outputtbl5' ' serialization.ddl struct outputtbl5 { i32 key1, i32 key2, string key3, i32 key4, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl5' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 [t2]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 ' ' Partition' ' base file name: t2' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2' ' name groupby_sort_1.t2' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t2 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2' ' name groupby_sort_1.t2' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t2 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t2' ' name: groupby_sort_1.t2' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,key3,key4,cnt' ' columns.types int:int:string:int:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl5' ' name groupby_sort_1.outputtbl5' ' serialization.ddl struct outputtbl5 { i32 key1, i32 key2, string key3, i32 key4, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl5' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' '' '' 174 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl5 SELECT key, 1, val, 2, count(1) FROM T2 GROUP BY key, 1, val, 2; '_col0','_col1','_col2','_col3','_col4' No rows selected >>> >>> SELECT * FROM outputTbl5 ORDER BY key1, key2, key3, key4; 'key1','key2','key3','key4','cnt' '1','1','11','2','1' '2','1','12','2','1' '3','1','13','2','1' '7','1','17','2','1' '8','1','18','2','1' '8','1','28','2','1' 6 rows selected >>> >>> -- constants from sub-queries should work fine >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 SELECT key, constant, val, count(1) from (SELECT key, 1 as constant, val from T2)subq group by key, constant, val; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 constant) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl4))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL constant) (TOK_TABLE_OR_COL val))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' subq:t2 ' ' TableScan' ' alias: t2' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' expr: 1' ' type: int' ' expr: val' ' type: string' ' outputColumnNames: _col0, _col1, _col2' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' outputColumnNames: _col0, _col1, _col2' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' mode: final' ' outputColumnNames: _col0, _col1, _col2, _col3' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' expr: _col3' ' type: bigint' ' outputColumnNames: _col0, _col1, _col2, _col3' ' Select Operator' ' expressions:' ' expr: UDFToInteger(_col0)' ' type: int' ' expr: _col1' ' type: int' ' expr: _col2' ' type: string' ' expr: UDFToInteger(_col3)' ' type: int' ' outputColumnNames: _col0, _col1, _col2, _col3' ' File Output 
Operator' ' compressed: false' ' GlobalTableId: 1' ' directory: pfile:!!{hive.exec.scratchdir}!!' ' NumFilesPerFileSink: 1' ' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!' ' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,key3,cnt' ' columns.types int:int:string:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl4' ' name groupby_sort_1.outputtbl4' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 48' ' serialization.ddl struct outputtbl4 { i32 key1, i32 key2, string key3, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 54' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl4' ' TotalFiles: 1' ' GatherStats: true' ' MultiFileSpray: false' ' Needs Tagging: false' ' Path -> Alias:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 [subq:t2]' ' Path -> Partition:' ' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 ' ' Partition' ' base file name: t2' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2' ' name groupby_sort_1.t2' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t2 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' ' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' SORTBUCKETCOLSPREFIX TRUE' ' bucket_count 2' ' bucket_field_name key' ' columns key,val' ' columns.types string:string' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2' ' name groupby_sort_1.t2' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 24' ' serialization.ddl struct t2 { string key, string val}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 30' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.t2' ' name: groupby_sort_1.t2' '' ' Stage: Stage-0' ' Move Operator' ' tables:' ' replace: true' ' source: pfile:!!{hive.exec.scratchdir}!!' 
' table:' ' input format: org.apache.hadoop.mapred.TextInputFormat' ' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' properties:' ' bucket_count -1' ' columns key1,key2,key3,cnt' ' columns.types int:int:string:int' ' file.inputformat org.apache.hadoop.mapred.TextInputFormat' ' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl4' ' name groupby_sort_1.outputtbl4' ' numFiles 1' ' numPartitions 0' ' numRows 6' ' rawDataSize 48' ' serialization.ddl struct outputtbl4 { i32 key1, i32 key2, string key3, i32 cnt}' ' serialization.format 1' ' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' totalSize 54' ' transient_lastDdlTime !!UNIXTIME!!' ' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' ' name: groupby_sort_1.outputtbl4' ' tmp directory: pfile:!!{hive.exec.scratchdir}!!' '' ' Stage: Stage-2' ' Stats-Aggr Operator' ' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!' '' '' 189 rows selected >>> >>> INSERT OVERWRITE TABLE outputTbl4 SELECT key, constant, val, count(1) from (SELECT key, 1 as constant, val from T2)subq group by key, constant, val; '_col0','_col1','_col2','_col3' No rows selected >>> >>> SELECT * FROM outputTbl4 ORDER BY key1, key2, key3; 'key1','key2','key3','cnt' '1','1','11','1' '2','1','12','1' '3','1','13','1' '7','1','17','1' '8','1','18','1' '8','1','28','1' 6 rows selected >>> >>> -- multiple levels of constants from sub-queries should work fine >>> EXPLAIN EXTENDED INSERT OVERWRITE TABLE outputTbl4 select key, constant3, val, count(1) from ( SELECT key, constant as constant2, val, 2 as constant3 from (SELECT key, 1 as constant, val from T2)subq )subq2 group by key, constant3, val; 'Explain' 'ABSTRACT SYNTAX TREE:' ' (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 constant) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant) constant2) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR 2 constant3)))) subq2)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl4))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL constant3)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL constant3) (TOK_TABLE_OR_COL val))))' '' 'STAGE DEPENDENCIES:' ' Stage-1 is a root stage' ' Stage-0 depends on stages: Stage-1' ' Stage-2 depends on stages: Stage-0' '' 'STAGE PLANS:' ' Stage: Stage-1' ' Map Reduce' ' Alias -> Map Operator Tree:' ' subq2:subq:t2 ' ' TableScan' ' alias: t2' ' GatherStats: false' ' Select Operator' ' expressions:' ' expr: key' ' type: string' ' expr: val' ' type: string' ' outputColumnNames: _col0, _col2' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col2' ' type: string' ' expr: 2' ' type: int' ' outputColumnNames: _col0, _col2, _col3' ' Select Operator' ' expressions:' ' expr: _col0' ' type: string' ' expr: _col3' ' type: int' ' expr: _col2' ' type: string' ' outputColumnNames: _col0, _col3, _col2' ' Group By Operator' ' aggregations:' ' expr: count(1)' ' bucketGroup: false' ' keys:' ' expr: _col0' ' type: string' ' expr: _col3' '
' type: int'
' expr: _col2'
' type: string'
' mode: final'
' outputColumnNames: _col0, _col1, _col2, _col3'
' Select Operator'
' expressions:'
' expr: _col0'
' type: string'
' expr: _col1'
' type: int'
' expr: _col2'
' type: string'
' expr: _col3'
' type: bigint'
' outputColumnNames: _col0, _col1, _col2, _col3'
' Select Operator'
' expressions:'
' expr: UDFToInteger(_col0)'
' type: int'
' expr: _col1'
' type: int'
' expr: _col2'
' type: string'
' expr: UDFToInteger(_col3)'
' type: int'
' outputColumnNames: _col0, _col1, _col2, _col3'
' File Output Operator'
' compressed: false'
' GlobalTableId: 1'
' directory: pfile:!!{hive.exec.scratchdir}!!'
' NumFilesPerFileSink: 1'
' Stats Publishing Key Prefix: pfile:!!{hive.exec.scratchdir}!!'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' properties:'
' bucket_count -1'
' columns key1,key2,key3,cnt'
' columns.types int:int:string:int'
' file.inputformat org.apache.hadoop.mapred.TextInputFormat'
' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl4'
' name groupby_sort_1.outputtbl4'
' numFiles 1'
' numPartitions 0'
' numRows 6'
' rawDataSize 48'
' serialization.ddl struct outputtbl4 { i32 key1, i32 key2, string key3, i32 cnt}'
' serialization.format 1'
' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' totalSize 54'
' transient_lastDdlTime !!UNIXTIME!!'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.outputtbl4'
' TotalFiles: 1'
' GatherStats: true'
' MultiFileSpray: false'
' Needs Tagging: false'
' Path -> Alias:'
' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 [subq2:subq:t2]'
' Path -> Partition:'
' !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2 '
' Partition'
' base file name: t2'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' properties:'
' SORTBUCKETCOLSPREFIX TRUE'
' bucket_count 2'
' bucket_field_name key'
' columns key,val'
' columns.types string:string'
' file.inputformat org.apache.hadoop.mapred.TextInputFormat'
' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2'
' name groupby_sort_1.t2'
' numFiles 1'
' numPartitions 0'
' numRows 6'
' rawDataSize 24'
' serialization.ddl struct t2 { string key, string val}'
' serialization.format 1'
' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' totalSize 30'
' transient_lastDdlTime !!UNIXTIME!!'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' '
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' properties:'
' SORTBUCKETCOLSPREFIX TRUE'
' bucket_count 2'
' bucket_field_name key'
' columns key,val'
' columns.types string:string'
' file.inputformat org.apache.hadoop.mapred.TextInputFormat'
' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/t2'
' name groupby_sort_1.t2'
' numFiles 1'
' numPartitions 0'
' numRows 6'
' rawDataSize 24'
' serialization.ddl struct t2 { string key, string val}'
' serialization.format 1'
' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' totalSize 30'
' transient_lastDdlTime !!UNIXTIME!!'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.t2'
' name: groupby_sort_1.t2'
''
' Stage: Stage-0'
' Move Operator'
' tables:'
' replace: true'
' source: pfile:!!{hive.exec.scratchdir}!!'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' properties:'
' bucket_count -1'
' columns key1,key2,key3,cnt'
' columns.types int:int:string:int'
' file.inputformat org.apache.hadoop.mapred.TextInputFormat'
' file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' location !!{hive.metastore.warehouse.dir}!!/groupby_sort_1.db/outputtbl4'
' name groupby_sort_1.outputtbl4'
' numFiles 1'
' numPartitions 0'
' numRows 6'
' rawDataSize 48'
' serialization.ddl struct outputtbl4 { i32 key1, i32 key2, string key3, i32 cnt}'
' serialization.format 1'
' serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' totalSize 54'
' transient_lastDdlTime !!UNIXTIME!!'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.outputtbl4'
' tmp directory: pfile:!!{hive.exec.scratchdir}!!'
''
' Stage: Stage-2'
' Stats-Aggr Operator'
' Stats Aggregation Key Prefix: pfile:!!{hive.exec.scratchdir}!!'
''
''
196 rows selected
>>> 
>>> INSERT OVERWRITE TABLE outputTbl4 select key, constant3, val, count(1) from ( SELECT key, constant as constant2, val, 2 as constant3 from (SELECT key, 1 as constant, val from T2)subq )subq2 group by key, constant3, val;
'_col0','_col1','_col2','_col3'
No rows selected
>>> 
>>> SELECT * FROM outputTbl4 ORDER BY key1, key2, key3;
'key1','key2','key3','cnt'
'1','2','11','1'
'2','2','12','1'
'3','2','13','1'
'7','2','17','1'
'8','2','18','1'
'8','2','28','1'
6 rows selected
>>> 
>>> set hive.map.aggr=true;
No rows affected
>>> set hive.multigroupby.singlereducer=false;
No rows affected
>>> set mapred.reduce.tasks=31;
No rows affected
>>> 
>>> CREATE TABLE DEST1(key INT, cnt INT);
No rows affected
>>> CREATE TABLE DEST2(key INT, val STRING, cnt INT);
No rows affected
>>> 
>>> SET hive.exec.compress.intermediate=true;
No rows affected
>>> SET hive.exec.compress.output=true;
No rows affected
>>> 
>>> EXPLAIN FROM T2 INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val;
'Explain'
'ABSTRACT SYNTAX TREE:'
' (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME DEST1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME DEST2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))'
''
'STAGE DEPENDENCIES:'
' Stage-2 is a root stage'
' Stage-0 depends on stages: Stage-2'
' Stage-3 depends on stages: Stage-0'
' Stage-1 depends on stages: Stage-2'
' Stage-4 depends on stages: Stage-1'
''
'STAGE PLANS:'
' Stage: Stage-2'
' Map Reduce'
' Alias -> Map Operator Tree:'
' t2 '
' TableScan'
' alias: t2'
' Select Operator'
' expressions:'
' expr: key'
' type: string'
' outputColumnNames: key'
' Group By Operator'
' aggregations:'
' expr: count(1)'
' bucketGroup: false'
' keys:'
' expr: key'
' type: string'
' mode: hash'
' outputColumnNames: _col0, _col1'
' Reduce Output Operator'
' key expressions:'
' expr: _col0'
' type: string'
' sort order: +'
' Map-reduce partition columns:'
' expr: _col0'
' type: string'
' tag: -1'
' value expressions:'
' expr: _col1'
' type: bigint'
' Select Operator'
' expressions:'
' expr: key'
' type: string'
' expr: val'
' type: string'
' outputColumnNames: key, val'
' Group By Operator'
' aggregations:'
' expr: count(1)'
' bucketGroup: false'
' keys:'
' expr: key'
' type: string'
' expr: val'
' type: string'
' mode: final'
' outputColumnNames: _col0, _col1, _col2'
' Select Operator'
' expressions:'
' expr: _col0'
' type: string'
' expr: _col1'
' type: string'
' expr: _col2'
' type: bigint'
' outputColumnNames: _col0, _col1, _col2'
' Select Operator'
' expressions:'
' expr: UDFToInteger(_col0)'
' type: int'
' expr: _col1'
' type: string'
' expr: UDFToInteger(_col2)'
' type: int'
' outputColumnNames: _col0, _col1, _col2'
' File Output Operator'
' compressed: true'
' GlobalTableId: 2'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.dest2'
' Reduce Operator Tree:'
' Group By Operator'
' aggregations:'
' expr: count(VALUE._col0)'
' bucketGroup: false'
' keys:'
' expr: KEY._col0'
' type: string'
' mode: mergepartial'
' outputColumnNames: _col0, _col1'
' Select Operator'
' expressions:'
' expr: _col0'
' type: string'
' expr: _col1'
' type: bigint'
' outputColumnNames: _col0, _col1'
' Select Operator'
' expressions:'
' expr: UDFToInteger(_col0)'
' type: int'
' expr: UDFToInteger(_col1)'
' type: int'
' outputColumnNames: _col0, _col1'
' File Output Operator'
' compressed: true'
' GlobalTableId: 1'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.dest1'
''
' Stage: Stage-0'
' Move Operator'
' tables:'
' replace: true'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.dest1'
''
' Stage: Stage-3'
' Stats-Aggr Operator'
''
' Stage: Stage-1'
' Move Operator'
' tables:'
' replace: true'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.dest2'
''
' Stage: Stage-4'
' Stats-Aggr Operator'
''
''
147 rows selected
>>> 
>>> FROM T2 INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val;
'_col0','_col1','_col2'
No rows selected
>>> 
>>> select * from DEST1 ORDER BY key, cnt;
'key','cnt'
'1','1'
'2','1'
'3','1'
'7','1'
'8','2'
5 rows selected
>>> select * from DEST2 ORDER BY key, val, cnt;
'key','val','cnt'
'1','11','1'
'2','12','1'
'3','13','1'
'7','17','1'
'8','18','1'
'8','28','1'
6 rows selected
>>> 
>>> -- multi-table insert with a sub-query
>>> EXPLAIN FROM (select key, val from T2 where key = 8) x INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val;
'Explain'
'ABSTRACT SYNTAX TREE:'
' (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 8)))) x)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME DEST1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME DEST2))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL val))))'
''
'STAGE DEPENDENCIES:'
' Stage-2 is a root stage'
' Stage-0 depends on stages: Stage-2'
' Stage-3 depends on stages: Stage-0'
' Stage-1 depends on stages: Stage-2'
' Stage-4 depends on stages: Stage-1'
''
'STAGE PLANS:'
' Stage: Stage-2'
' Map Reduce'
' Alias -> Map Operator Tree:'
' x:t2 '
' TableScan'
' alias: t2'
' Filter Operator'
' predicate:'
' expr: (key = 8.0)'
' type: boolean'
' Select Operator'
' expressions:'
' expr: key'
' type: string'
' expr: val'
' type: string'
' outputColumnNames: _col0, _col1'
' Select Operator'
' expressions:'
' expr: _col0'
' type: string'
' outputColumnNames: _col0'
' Group By Operator'
' aggregations:'
' expr: count(1)'
' bucketGroup: false'
' keys:'
' expr: _col0'
' type: string'
' mode: hash'
' outputColumnNames: _col0, _col1'
' Reduce Output Operator'
' key expressions:'
' expr: _col0'
' type: string'
' sort order: +'
' Map-reduce partition columns:'
' expr: _col0'
' type: string'
' tag: -1'
' value expressions:'
' expr: _col1'
' type: bigint'
' Select Operator'
' expressions:'
' expr: _col0'
' type: string'
' expr: _col1'
' type: string'
' outputColumnNames: _col0, _col1'
' Group By Operator'
' aggregations:'
' expr: count(1)'
' bucketGroup: false'
' keys:'
' expr: _col0'
' type: string'
' expr: _col1'
' type: string'
' mode: final'
' outputColumnNames: _col0, _col1, _col2'
' Select Operator'
' expressions:'
' expr: _col0'
' type: string'
' expr: _col1'
' type: string'
' expr: _col2'
' type: bigint'
' outputColumnNames: _col0, _col1, _col2'
' Select Operator'
' expressions:'
' expr: UDFToInteger(_col0)'
' type: int'
' expr: _col1'
' type: string'
' expr: UDFToInteger(_col2)'
' type: int'
' outputColumnNames: _col0, _col1, _col2'
' File Output Operator'
' compressed: true'
' GlobalTableId: 2'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.dest2'
' Reduce Operator Tree:'
' Group By Operator'
' aggregations:'
' expr: count(VALUE._col0)'
' bucketGroup: false'
' keys:'
' expr: KEY._col0'
' type: string'
' mode: mergepartial'
' outputColumnNames: _col0, _col1'
' Select Operator'
' expressions:'
' expr: _col0'
' type: string'
' expr: _col1'
' type: bigint'
' outputColumnNames: _col0, _col1'
' Select Operator'
' expressions:'
' expr: UDFToInteger(_col0)'
' type: int'
' expr: UDFToInteger(_col1)'
' type: int'
' outputColumnNames: _col0, _col1'
' File Output Operator'
' compressed: true'
' GlobalTableId: 1'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.dest1'
''
' Stage: Stage-0'
' Move Operator'
' tables:'
' replace: true'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.dest1'
''
' Stage: Stage-3'
' Stats-Aggr Operator'
''
' Stage: Stage-1'
' Move Operator'
' tables:'
' replace: true'
' table:'
' input format: org.apache.hadoop.mapred.TextInputFormat'
' output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
' serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
' name: groupby_sort_1.dest2'
''
' Stage: Stage-4'
' Stats-Aggr Operator'
''
''
158 rows selected
>>> 
>>> FROM (select key, val from T2 where key = 8) x INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val;
'_col0','_col1','_col2'
No rows selected
>>> 
>>> select * from DEST1 ORDER BY key, cnt;
'key','cnt'
'8','2'
1 row selected
>>> select * from DEST2 ORDER BY key, val, cnt;
'key','val','cnt'
'8','18','1'
'8','28','1'
2 rows selected
>>> !record
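
Editor's note: the HiveQL below is an illustrative sketch, not part of the recorded run above. It restates the two patterns this section exercises so the EXPLAIN output is easier to verify; all table names (T2, outputTbl4, DEST1, DEST2) and settings are taken from the transcript itself. When reading the plans, a branch whose Group By Operator runs with "mode: final" and has no Reduce Output Operator was aggregated entirely on the map side.

-- Constants introduced by nested sub-queries are simply projected alongside the
-- group keys (note the "expr: 2" in the map-side Select Operator above) and do
-- not block the map-side group-by conversion:
SELECT key, constant3, val, count(1)
FROM (
  SELECT key, constant AS constant2, val, 2 AS constant3
  FROM (SELECT key, 1 AS constant, val FROM T2) subq
) subq2
GROUP BY key, constant3, val;

-- A multi-table insert shares a single scan of T2 across both aggregations; with
-- hive.multigroupby.singlereducer=false each GROUP BY gets its own plan branch.
-- Here the (key, val) aggregation stays on the map side, while the key-only
-- aggregation is finished in a reducer (mode: hash, then mode: mergepartial):
FROM T2
INSERT OVERWRITE TABLE DEST1 SELECT key, count(1) GROUP BY key
INSERT OVERWRITE TABLE DEST2 SELECT key, val, count(1) GROUP BY key, val;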