set hive.optimize.reducededuplication=true;
set hive.optimize.reducededuplication.min.reducer=1;
set hive.optimize.correlation=true;
-- This file is used to show plans of queries involving cluster by, distribute by,
-- order by, and sort by.
-- Right now, Correlation optimizer check the most restrictive condition
-- when determining if a ReduceSinkOperator is not necessary.
-- This condition is that two ReduceSinkOperators should have same sorting columns,
-- same partitioning columns, same sorting orders and no conflict on the numbers of reducers. 

-- Distribute by will not be optimized because distribute by does not introduce
-- sorting columns.
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key) yy
ON (xx.key=yy.key);

-- Sort by will not be optimized because sort by does not introduce partitioning columns
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x SORT BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y SORT BY key) yy
ON (xx.key=yy.key);

set hive.optimize.correlation=false;
-- Distribute by and sort by on the same key(s) should be optimized
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
ON (xx.key=yy.key);

SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
ON (xx.key=yy.key);

set hive.optimize.correlation=true;
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
ON (xx.key=yy.key);

SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
ON (xx.key=yy.key);

set hive.optimize.correlation=true;
-- Because for join we use ascending order, if sort by uses descending order,
-- this query will not be optimized
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key DESC) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key DESC) yy
ON (xx.key=yy.key);

-- Even if hive.optimize.reducededuplication.min.reducer=1, order by will not be optimized
-- because order by does not introduce partitioning columns
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x ORDER BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y ORDER BY key) yy
ON (xx.key=yy.key);

set hive.optimize.correlation=false;
-- Cluster by will be optimized
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
ON (xx.key=yy.key);

SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
ON (xx.key=yy.key);

set hive.optimize.correlation=true;
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
ON (xx.key=yy.key);

SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
JOIN
(SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
ON (xx.key=yy.key);

set hive.optimize.correlation=false;
-- If hive.optimize.reducededuplication.min.reducer=1,
-- group by and then order by should be optimized
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
JOIN
(SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
ON (xx.key=yy.key);

SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
JOIN
(SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
ON (xx.key=yy.key);

set hive.optimize.correlation=true;
EXPLAIN
SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
JOIN
(SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
ON (xx.key=yy.key);

SELECT xx.key, xx.value, yy.key, yy.value
FROM
(SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
JOIN
(SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
ON (xx.key=yy.key);