set hive.auto.convert.join=true; set hive.optimize.correlation=false; EXPLAIN SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt FROM src x JOIN src1 y ON (x.key = y.key) GROUP BY x.key) xx JOIN src1 yy ON xx.key=yy.key ORDER BY xx.key, xx.cnt, yy.key, yy.value; SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt FROM src x JOIN src1 y ON (x.key = y.key) GROUP BY x.key) xx JOIN src1 yy ON xx.key=yy.key ORDER BY xx.key, xx.cnt, yy.key, yy.value; set hive.optimize.correlation=true; EXPLAIN SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt FROM src x JOIN src1 y ON (x.key = y.key) GROUP BY x.key) xx JOIN src1 yy ON xx.key=yy.key ORDER BY xx.key, xx.cnt, yy.key, yy.value; SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt FROM src x JOIN src1 y ON (x.key = y.key) GROUP BY x.key) xx JOIN src1 yy ON xx.key=yy.key ORDER BY xx.key, xx.cnt, yy.key, yy.value; set hive.auto.convert.join=true; set hive.auto.convert.join.noconditionaltask=true; set hive.auto.convert.join.noconditionaltask.size=10000000000; set hive.optimize.correlation=false; -- Without correlation optimizer, we will have 3 MR jobs. -- The first one is a MapJoin and Aggregation (in the Reduce Phase). -- The second one is another MapJoin. The third one is for ordering. -- With the correlation optimizer, right now, we have -- 2 MR jobs. The first one will evaluate the sub-query xx and the join of -- xx and yy. The second one will do the ORDER BY. EXPLAIN SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt FROM src x JOIN src1 y ON (x.key = y.key) GROUP BY x.key) xx JOIN src1 yy ON xx.key=yy.key ORDER BY xx.key, xx.cnt, yy.key, yy.value; SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt FROM src x JOIN src1 y ON (x.key = y.key) GROUP BY x.key) xx JOIN src1 yy ON xx.key=yy.key ORDER BY xx.key, xx.cnt, yy.key, yy.value; set hive.optimize.correlation=true; EXPLAIN SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt FROM src x JOIN src1 y ON (x.key = y.key) GROUP BY x.key) xx JOIN src1 yy ON xx.key=yy.key ORDER BY xx.key, xx.cnt, yy.key, yy.value; SELECT xx.key, xx.cnt, yy.key, yy.value FROM (SELECT x.key AS key, count(1) AS cnt FROM src x JOIN src1 y ON (x.key = y.key) GROUP BY x.key) xx JOIN src1 yy ON xx.key=yy.key ORDER BY xx.key, xx.cnt, yy.key, yy.value;