-- create table with 1000 rows
create table srcorc(key string, value string) stored as textfile;
insert overwrite table srcorc select * from src;
insert into table srcorc select * from src;

-- load table with each row group having 1000 rows and stripe 1 & 2 having 5000 & 2000 rows respectively
create table if not exists vectororc
(s1 string,
s2 string,
d double,
s3 string)
stored as ORC tblproperties("orc.row.index.stride"="1000", "orc.stripe.size"="100000", "orc.compress.size"="10000");

-- insert creates separate orc files
insert overwrite table vectororc select "apple", "a", rand(1), "zoo" from srcorc;
set hive.optimize.constant.propagation=false;
insert into table vectororc select null, "b", rand(2), "zoo" from srcorc;
insert into table vectororc select null, "c", rand(3), "zoo" from srcorc;
insert into table vectororc select "apple", "d", rand(4), "zoo" from srcorc;
insert into table vectororc select null, "e", rand(5), "z" from srcorc;
insert into table vectororc select "apple", "f", rand(6), "z" from srcorc;
insert into table vectororc select null, "g", rand(7), "zoo" from srcorc;
set hive.optimize.constant.propagation=true;

-- since vectororc table has multiple orc file we will load them into a single file using another table
create table if not exists testorc
(s1 string,
s2 string,
d double,
s3 string)
stored as ORC tblproperties("orc.row.index.stride"="1000", "orc.stripe.size"="100000", "orc.compress.size"="10000");
insert overwrite table testorc select * from vectororc order by s2;

set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set hive.optimize.index.filter=true;

set hive.vectorized.execution.enabled=false;
-- row group (1,4) from stripe 1 and row group (1) from stripe 2
-- PPD ONLY
select count(*),int(sum(d)) from testorc where s1 is not null;
set hive.vectorized.execution.enabled=true;
-- VECTORIZATION + PPD
select count(*),int(sum(d)) from testorc where s1 is not null;

set hive.vectorized.execution.enabled=false;
-- row group (2,3,5) from stripe 1 and row group (2) from stripe 2
-- PPD ONLY
select count(*),int(sum(d)) from testorc where s2 in ("b", "c", "e", "g");
set hive.vectorized.execution.enabled=true;
-- VECTORIZATION + PPD
select count(*),int(sum(d)) from testorc where s2 in ("b", "c", "e", "g");

set hive.vectorized.execution.enabled=false;
-- last row group of stripe 1 and first row group of stripe 2
-- PPD ONLY
select count(*),int(sum(d)) from testorc where s3="z";
set hive.vectorized.execution.enabled=true;
-- VECTORIZATION + PPD
select count(*),int(sum(d)) from testorc where s3="z";

set hive.vectorized.execution.enabled=false;
-- first row group of stripe 1 and last row group of stripe 2
-- PPD ONLY
select count(*),int(sum(d)) from testorc where s2="a" or s2="g";
set hive.vectorized.execution.enabled=true;
-- VECTORIZATION + PPD
select count(*),int(sum(d)) from testorc where s2="a" or s2="g";

drop table srcorc;
drop table vectororc;
drop table testorc;