#!/usr/bin/env perl ############################################################################ # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### # Nightly tests for pig. # # #PigSetup::setup(); #my $me = `whoami`; #chomp $me; $cfg = { 'driver' => 'Pig', 'nummachines' => 5, 'verify_with_pig' => 1, 'verify_pig_version' => 'old', 'groups' => [ { 'name' => 'Checkin', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); store a into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 50; d = filter b by age < 50; e = cogroup c by (name, age), d by (name, age) ; f = foreach e generate flatten(c), flatten(d); g = group f by registration; h = foreach g generate group, SUM(f.d::contributions); i = order h by $1; store i into ':OUTPATH:';\, 'floatpostprocess' => 1, 'delimiter' => ' ', 'sortArgs' => ['-t', ' ', '+1', '-2'], } ] }, { 'name' => 'LoaderDefaultDir', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/dir/studenttab10k' as (name, age, gpa); store a into 
':OUTPATH:';\, }, ] }, { 'name' => 'LoaderPigStorageArg', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); store a into ':OUTPATH:';\, }, { # load with control character 'num' => 2, 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa); store a into ':OUTPATH:';#, }, { # load and store with control character 'num' => 3, 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa); store a into ':OUTPATH:.intermediate' using PigStorage('\\u0001'); b = load ':OUTPATH:.intermediate' using PigStorage('\\u0001') as (name, age, gpa); store b into ':OUTPATH:'; #, 'notmq' => 1, }, ] }, { # Results doctored, if you change this query you need to copy the # expected results into test/nightly/benchmarks 'name' => 'LoaderBinStorage', 'tests' => [ { 'num' => 1, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Swap(name, age), TOKENIZE((chararray)name), org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age); store b into ':OUTPATH:.intermediate' using BinStorage(); c = load ':OUTPATH:.intermediate' using BinStorage(); store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, 'notmq' => 1, }, ] }, { # Results doctored, if you change this query you need to copy the # expected results into test/nightly/benchmarks 'name' => 'LoaderTextLoader', 'tests' => [ { 'num' => 1, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/textdoc' using TextLoader(); b = foreach a generate TOKENIZE((chararray)$0); store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, }, ] }, { 'name' => 'FilterBoolean', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b 
= filter a by name == 'fred allen' and age > 50; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/dir/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by name != 'fred allen' or age < 10; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by not (age == 50); store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by (age >= 50 or name > 'fred') and (gpa <= 3.0 or name >= 'bob'); store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by age >= 50 or name > 'fred' and gpa <= 3.0 or name >= 'bob'; store b into ':OUTPATH:' using PigStorage;\, }, # test filter <= and >= for chararray, int and double { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, # test filter <= and >= for bytearray, long and float { 'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float); b = filter a by age >= 40 and age <=50 and gpa >= 2.0f and gpa <= 3.0f and name >= 'bob' and name <= 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, # test filter < and > for chararray, int and double { 'num' => 8, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); b = filter a by age > 40 and age <50 and gpa > 2.0 and gpa < 3.0 and name > 'bob' and name < 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, # test filter < and > for 
bytearray, long and float { 'num' => 9, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float); b = filter a by age > 40 and age <50 and gpa > 2.0f and gpa < 3.0f and name > 'bob' and name < 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, # test filter <= and >= for explicit cast for chararray, int and double { 'num' => 10, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by (int)age >= 40 and (int)age <=50 and (double)gpa >= 2.0 and (double)gpa <= 3.0 and (chararray)name >= 'bob' and (chararray)name <= 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, # test filter <= and >= for explicit cast for bytearray, long and float { 'num' => 11, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by (long)age >= 40 and (long)age <=50 and (float)gpa >= 2.0f and (float)gpa <= 3.0f and name >= 'bob' and name <= 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, # test filter < and > for explicit cast for chararray, int and double { 'num' => 12, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, # test filter < and > for explicit cast for bytearray, long and float { 'num' => 13, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by (long)age > 40 and (long)age <50 and (float)gpa > 2.0f and (float)gpa < 3.0f and name > 'bob' and name < 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, # test AND with nulls { 'num' => 14, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa); b = filter a by name == 'fred allen' and age > 50; store b into ':OUTPATH:' 
using PigStorage;\, }, # test OR with nulls { 'num' => 15, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa); b = filter a by name != 'fred allen' or age < 10; store b into ':OUTPATH:' using PigStorage;\, }, # test with nulls filter <= and >= for chararray, int and double { 'num' => 16, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name:chararray, age:int, gpa:double); b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, # test with nulls filter < and > for explicit cast for chararray, int and double { 'num' => 17, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa); b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 18, 'ignore' => 1, # PIG-2593 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); b = filter a by instate; store b into ':OUTPATH:' using PigStorage;\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); b = filter a by instate == 'true'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 19, 'ignore' => 1, # PIG-2593 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); b = filter a by not instate; store b into ':OUTPATH:' using PigStorage;\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); b = filter a by instate == 'false'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 20, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); b = filter a by instate is null; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 21, 'ignore' => 1, # PIG-2593 'pig' => 
q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); b = filter a by instate == true; store b into ':OUTPATH:' using PigStorage;\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); b = filter a by instate == 'true'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 22, 'ignore' => 1, # PIG-2593 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); b = filter a by instate == false; store b into ':OUTPATH:' using PigStorage;\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); b = filter a by instate == 'false'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 23, 'ignore' => 1, # PIG-2593 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = filter a by instate; store b into ':OUTPATH:' using PigStorage;\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); b = filter a by instate == 'true'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 24, 'ignore' => 1, # PIG-2593 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = filter a by not instate; store b into ':OUTPATH:' using PigStorage;\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); b = filter a by instate == 'false'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 25, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = filter a by instate is null; store b into ':OUTPATH:' using PigStorage;\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); b = filter a by instate is null; store b into ':OUTPATH:' 
using PigStorage;\, }, { 'num' => 26, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = filter a by instate == true; store b into ':OUTPATH:' using PigStorage;\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); b = filter a by instate == 'true'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 27, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = filter a by instate == false; store b into ':OUTPATH:' using PigStorage;\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); b = filter a by instate == 'false'; store b into ':OUTPATH:' using PigStorage;\, }, ], }, { 'name' => 'FilterEq', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by name == 'alice johnson' and age == 64 and gpa == 3.99; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by name > 'fred allen' and age > 40 and gpa > 2.50; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by name >= 'fred allen' and age >= 40 and gpa >= 2.50; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by name lt 'fred allen' and age < 40 and gpa < 2.50; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by name lte 'fred allen' and age <= 40 and gpa <= 2.50; store b 
into ':OUTPATH:' using PigStorage;\, }, { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage(); b = filter a by $0 neq 'fred allen' and $1 != '40' and $2 != '2.50'; store b into ':OUTPATH:' using PigStorage;\, }, # test for filter == for chararray, int and double { 'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42; store b into ':OUTPATH:' using PigStorage;\, }, # test for filter == for bytearray, long and float { 'num' => 8, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float); b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42f; store b into ':OUTPATH:' using PigStorage;\, }, # test for filter != for chararray, int and double { 'num' => 9, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50; store b into ':OUTPATH:' using PigStorage;\, }, # test for filter != for bytearray, long and float { 'num' => 10, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float); b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50f; store b into ':OUTPATH:' using PigStorage;\, }, # test for filter == for explicit casts to chararray, int and double { 'num' => 11, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by (chararray)name == 'fred allen' and (int)age == 61 and (double)gpa == 1.42; store b into ':OUTPATH:' using PigStorage;\, }, # test for filter == for explicit casts to bytearray, long and float { 'num' => 12, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by name == 'fred allen' and (long)age == 61 and (float)gpa == 1.42f; 
store b into ':OUTPATH:' using PigStorage;\, }, # test for filter != for explicit casts to chararray, int and double { 'num' => 13, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ; b = filter a by (chararray)$0 != 'fred allen' and (int)$1 != 40 and (double)$2 != 2.50; store b into ':OUTPATH:' using PigStorage;\, }, # test for filter != for explicit casts to bytearray, long and float { 'num' => 14, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ; b = filter a by $0 != 'fred allen' and (long)$1 != 40 and (float)$2 != 2.50f; store b into ':OUTPATH:' using PigStorage;\, }, ] }, { 'name' => 'FilterMatches', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = filter a by name matches '^fred.*'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage(); b = filter a by not $0 matches '^fred.*'; store b into ':OUTPATH:' using PigStorage;\, }, { # test for filter on matches for chararray (declared and explicit cast) 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double); b = filter a by name matches '^fred.*' and (chararray)registration matches '^dem.*'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double); b = filter a by name matches 'f.ed' and (chararray)registration matches 'd.m'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double); b = filter a by name matches 'f[^f]ed.*'; store b into ':OUTPATH:' using PigStorage;\, }, { 'num' => 6, 'pig' => "a = load 
':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '.*\\\\wan.*';\nstore b into ':OUTPATH:' using PigStorage;", }, { 'num' => 7, 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '^e.*\\\\sc.*';\nstore b into ':OUTPATH:' using PigStorage;", }, { 'num' => 8, 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches 'ethan white';\nstore b into ':OUTPATH:' using PigStorage;", }, { 'num' => 9, 'pig' => "a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);\nb = filter a by gpa matches '\\\\d\\\\.45';\nstore b into ':OUTPATH:' using PigStorage;", }, ] }, { 'name' => 'FilterUdf', 'tests' => [ { 'num' => 1, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = cogroup a by (name, age), b by (name, age); d = filter c by not IsEmpty(a); e = filter d by not IsEmpty(b); f = foreach e generate flatten(a), flatten(b); store f into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 50; d = filter b by age < 50; e = cogroup c by (name, age), d by (name, age); f = filter e by COUNT(c)> 0 AND COUNT(d)>0; store f into ':OUTPATH:';\, 'rc' => 0 }, ] }, # TODO Group that don't flatten via Agg functions { 'name' => 'GroupAggFunc', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, COUNT(a.age); store c into ':OUTPATH:';\, }, { 'num' 
=> 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = group a by $0; c = foreach b generate group, COUNT(a.$1); store c into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by (name, age); c = foreach b generate group.name, group.age, COUNT(a.gpa); store c into ':OUTPATH:';\, }, { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a all; c = foreach b generate COUNT(a.$0); store c into ':OUTPATH:';\, }, { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, SUM(a.age); store c into ':OUTPATH:';\, }, { 'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, SUM(a.gpa); store c into ':OUTPATH:';\, 'floatpostprocess' => 1, 'delimiter' => ' ', }, { 'num' => 8, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, AVG(a.age); store c into ':OUTPATH:';\, }, { 'num' => 9, 'ignore23' => 'I cannot get it right due to float precision, temporarily disable', 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, AVG(a.gpa); store c into ':OUTPATH:';\, 'floatpostprocess' => 1, 'delimiter' => ' ', }, { 'num' => 10, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, MIN(a.gpa); store c into ':OUTPATH:';\, }, { 'num' => 11, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, MAX(a.gpa); store c into ':OUTPATH:';\, }, { 'num' => 12, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by (name, age); c = foreach b generate flatten(group), SUM(a.gpa); 
store c into ':OUTPATH:';\, 'floatpostprocess' => 1, 'delimiter' => ' ', }, { 'num' => 13, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by (name); c = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); d = cogroup b by group, c by name; e = foreach d generate flatten(group), SUM(c.gpa), COUNT(c.name); store e into ':OUTPATH:';\, 'floatpostprocess' => 1, 'delimiter' => ' ', }, { 'num' => 14, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = group a by (name); e = foreach b generate COUNT(a.name); store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); b = group a by (name); e = foreach b generate COUNT(a.name); store e into ':OUTPATH:';\, } ], }, { 'name' => 'MapPartialAgg', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, COUNT(a.age); store c into ':OUTPATH:';\, 'java_params' => ['-Dpig.exec.mapPartAgg=true'] }, { #multiquery with group in one sub query 'num' => 2, 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); b = filter a by age < 22; store b into ':OUTPATH:.1'; c = group b by age; d = foreach c generate group, SUM(b.gpa); store d into ':OUTPATH:.2'; #, 'java_params' => ['-Dpig.exec.mapPartAgg=true'] }, { #multi query with two group on diff columns 'num' => 3, 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); g1 = group a by name; f1 = foreach g1 generate group as name, MAX(a.gpa); store f1 into ':OUTPATH:.1'; g2 = group a by age; f2 = foreach g2 generate group as age, AVG(a.gpa); store f2 into ':OUTPATH:.2'; #, 'java_params' => ['-Dpig.exec.mapPartAgg=true'] }, { #multi query with three groups on diff columns, group key being an 
expression 'num' => 4, 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); g1 = group a by name; f1 = foreach g1 generate group as name, MAX(a.gpa); store f1 into ':OUTPATH:.1'; g2 = group a by age%10; f2 = foreach g2 generate group as age_mod10, AVG(a.gpa); store f2 into ':OUTPATH:.2'; g3 = group a by age; f3 = foreach g3 generate group%10, AVG(a.gpa); store f3 into ':OUTPATH:.3'; g4 = group a by gpa; f4 = foreach g4 generate group as gpa, COUNT(a); store f4 into ':OUTPATH:.4'; #, 'java_params' => ['-Dpig.exec.mapPartAgg=true'] }, { #aggregation gets more than one tuple for every tuple from load func 'num' => 5, 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); b = foreach a generate name, age, gpa, flatten(TOBAG(age,age)) as x; c = group b by age; d = foreach c generate group, AVG(b.gpa); store d into ':OUTPATH:'; #, 'java_params' => ['-Dpig.exec.mapPartAgg=true'] }, ], }, { 'name' => 'EvalFunc', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = filter a by name lt 'b'; c = foreach b generate ARITY(name, age, gpa); store c into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa); b = filter a by name lt 'b'; c = foreach b generate TOKENIZE(name); d = foreach c generate flatten($0); store d into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = filter a by name lt 'b'; c = foreach b generate org.apache.pig.test.udf.evalfunc.Swap(name, age); store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, }, { 'num' => 4, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = filter a by name lt 'b'; c = foreach b generate 
org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age); store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, }, { 'num' => 5, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = foreach a generate org.apache.pig.test.udf.evalfunc.TestBoolean(instate); store b into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); b = foreach a generate (instate is null ? '' : (instate == 'true' ? 'false' : 'true')); store b into ':OUTPATH:';\, } ] }, # TODO DIFF # TODO User defined grouping function { 'name' => 'CoGroupFlatten', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = cogroup c by name, d by name; f = foreach e generate flatten (c), flatten(d); store f into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by $1 < 20; d = filter b by $1 < 20; e = cogroup c by $0, d by $0; f = foreach e generate flatten (c), flatten(d); store f into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = cogroup c by (name, age), d by (name, age); f = foreach e generate flatten (c), flatten(d); store f into ':OUTPATH:';\, }, { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load 
':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); d = filter b by age < 20; e = cogroup a by (name, age) inner, d by (name, age); f = foreach e generate flatten (a), flatten(d); store f into ':OUTPATH:';\, }, { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; e = cogroup c by (name, age), b by (name, age) inner; f = foreach e generate flatten (c), flatten(b); store f into ':OUTPATH:';\, }, { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = cogroup a by (name, age) inner, b by (name, age) inner; f = foreach e generate flatten (a), flatten(b); store f into ':OUTPATH:';\, }, { # Test cogrouping data loaded from two separate loaders. We don't have any data that can join with studenttab that isn't also loaded with PigStorage, so the # first step is an intermediate load and store using BinStorage. 
'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); store a into ':OUTPATH:.intermediate' using BinStorage(); b = load ':OUTPATH:.intermediate' using BinStorage() as (name, age, gpa); c = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = cogroup b by (name, age) inner, c by (name, age) inner; f = foreach e generate flatten (b), flatten(c); store f into ':OUTPATH:';\, 'notmq' => 1, }, ] }, { 'name' => 'CoGroup', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = cogroup a by name, b by name; d = foreach c generate flatten(group), COUNT(a) + COUNT(b); store d into ':OUTPATH:';\, }, ] }, { 'name' => 'Join', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by name, d by name; store e into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by $0, d by $0; store e into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by (name, age), d by (name, age); store e into ':OUTPATH:';\, }, # self join with implicit split # JIRA PIG-429 { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = filter a by $1 > 25; c = join a by $0, b by $0; 
store c into ':OUTPATH:';\, }, # join with one input having schema and another without # JIRA PIG-428 { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double); another = load ':INPATH:/singlefile/studenttab10k'; c = foreach another generate $0, $1+ 10, $2 + 10.0; d = join a by $0, c by $0; store d into ':OUTPATH:';\, }, # self join using fragment replicate join # no types { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = join a by name, b by name using 'repl'; store c into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = join a by name, b by name ; store c into ':OUTPATH:';\, }, # self join using fragment replicate join # with types and no cast for join key { 'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); c = join a by name, b by name using 'repl'; store c into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); c = join a by name, b by name ; store c into ':OUTPATH:';\, }, # self join using fragment replicate join # with types and cast for join key { 'num' => 8, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); c = join a by gpa, b by gpa using 'repl'; store c into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = load ':INPATH:/singlefile/studenttab10k' as 
(name:chararray, age:int, gpa); c = join a by gpa, b by gpa ; store c into ':OUTPATH:';\, }, # left outer join { 'num' => 9, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double); c = join a by name left outer, b by name; store c into ':OUTPATH:';\, }, # right outer join { 'num' => 10, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double); c = join a by name right outer, b by name; store c into ':OUTPATH:';\, }, # full outer join { 'num' => 11, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double); c = join a by name full outer, b by name; store c into ':OUTPATH:';\, }, # see PIG-1209 join package now uses internalcachedBag, so every tuple on reduce side in this test will spilled to disk. 
{ 'num' => 12, 'java_params' => ['-Dpig.cachedbag.memusage=0'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by name, d by name; store e into ':OUTPATH:';\, }, { 'num' => 13, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); c = filter a by age < 20; d = filter b by age < 20; e = join c by instate, d by instate parallel 5; store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); c = filter a by age < 20; d = filter b by age < 20; e = join c by instate, d by instate parallel 5; store e into ':OUTPATH:';\, } ] }, { 'name' => 'Foreach', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate *; store b into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = foreach a generate *; store b into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate name, age; store b into ':OUTPATH:';\, }, { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = foreach a generate $0, $2; store b into ':OUTPATH:';\, }, { # test filter, projection, sort , duplicate elimination 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = filter a by age < 20; c = group b by age; d = foreach c { cf = filter b by gpa < 3.0; cp = cf.gpa; cd = distinct cp; co = order cd by $0; generate group, 
flatten(co); } store d into ':OUTPATH:';\, }, { # test flatten for map and scalar 'num' => 6, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate flatten(name) as n, flatten(org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, gpa)) as m; store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, }, { # test flatten for UDF that returns bag with multiple tuples with multiple columns 'num' => 7, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate name, flatten(org.apache.pig.test.udf.evalfunc.CreateTupleBag(age, gpa)) as foo; store b into ':OUTPATH:';\, }, { 'num' => 8, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age: int, gpa); c = group a by name; d = foreach c generate flatten(group), MAX(a.age) + MIN(a.age); store d into ':OUTPATH:';\, }, { # test filter, projection, sort , duplicate elimination 'num' => 9, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = filter a by age < 20; c = group b by age; d = foreach c { cf = filter b by gpa >= 3.0 and gpa <= 3.5; cp = cf.gpa; cd = distinct cp; co = order cd by $0; generate group, flatten(co); } store d into ':OUTPATH:';\, }, { # test filter, projection, sort , duplicate elimination 'num' => 10, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = filter a by age < 20; c = group b by age; d = foreach c { cf = filter b by (gpa == 4.0 or gpa != 2.0) and name > 'a'; cp = cf.gpa; cd = distinct cp; co = order cd by $0; generate group, flatten(co); } store d into ':OUTPATH:';\, }, { # test filter, projection, sort , duplicate elimination 'num' => 11, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = filter a by age < 20; c = foreach b { exp1 = age + gpa; exp2 = exp1 + age; generate exp1, exp2; } store c into 
':OUTPATH:';\, }, { # test a udf with no args 'num' => 12, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Fred() as fred; store b into ':OUTPATH:';\, }, { 'num' => 13, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = foreach a generate *; store b into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); b = foreach a generate *; store b into ':OUTPATH:';\, } ] }, { 'name' => 'Order', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate name; c = order b by name; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1'], }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = foreach a generate $1; c = order b by $0; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1'], }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate gpa; c = order b by gpa; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1'], }, { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = order a by *; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' '], }, { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate name, age; c = order b by name, age; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-2'], }, { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; c = order a by $0; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1'], }, { 'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; c = order a by $1; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+1', '-2'], }, { 'num' => 8, 
'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; c = order a by $0, $1; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-2'], }, { 'num' => 9, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; c = order a by $1, $0; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+1', '-2', '+0', '-1'], }, { 'num' => 10, 'ignore' => 'order by UDF is not supported', 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k'; c = order a by * using org.apache.pig.test.udf.orderby.OrdDesc; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '-r'], }, { 'num' => 11, 'ignore' => 'order by UDF is not supported', 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k'; c = order a by $0 using org.apache.pig.test.udf.orderby.OrdDesc; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '-r', '+0', '-1'], }, { 'num' => 12, 'ignore' => 'order by UDF is not supported', 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k'; c = order a by $0, $1 using org.apache.pig.test.udf.orderby.OrdDesc; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '-r', '+0', '-2'], }, # ALERT All these tests with inner order bys aren't testing the inner # ordering. We need to develop a sorting tool to do that. 
{ 'num' => 13, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = group a by $0; c = foreach b {c1 = order $1 by $1; generate flatten(c1); }; store c into ':OUTPATH:';\, }, { 'num' => 14, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = group a by $0; c = foreach b {c1 = order $1 by *; generate flatten(c1); }; store c into ':OUTPATH:';\, }, { 'num' => 15, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k'; b = group a by $0; c = foreach b {c1 = order $1 by * using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1); }; store c into ':OUTPATH:';\, }, { 'num' => 16, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k'; b = group a by $0; c = foreach b {c1 = order $1 by $1 using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1);}; store c into ':OUTPATH:';\, }, { 'num' => 17, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = group a by $0; c = foreach b {c1 = order $1 by $1; generate flatten(c1), MAX($1.$1); }; store c into ':OUTPATH:';\, }, { # test to make sure the weighted range patitioning # works correctly when a sort key value repeats across # reduce partitions 'num' => 18, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = order a by $1 parallel 100; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+1', '-2'], }, { 'num' => 19, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); b = foreach a generate instate; c = order b by instate; store c into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); b = foreach a generate instate; c = order b by instate; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1'], }, ] }, { 'name' => 'Distinct', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, 
gpa); b = foreach a generate name; c = distinct b; store c into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = foreach a generate $1; c = distinct b; store c into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate gpa; c = distinct b; store c into ':OUTPATH:';\, }, { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = distinct a; store b into ':OUTPATH:';\, }, { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate name, age; c = distinct b; store c into ':OUTPATH:';\, }, { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b { aa = distinct a.age; generate group, COUNT(aa); } store c into ':OUTPATH:';\, } ] }, { 'name' => 'Cross', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 19 and gpa < 1.0; d = filter b by age < 19; e = cross c, d; store e into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 19 and gpa < 1.0; d = filter b by age < 19; e = cross c, d parallel 10; store e into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\set default_parallel 10; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 19 and gpa < 1.0; d = filter b by age < 19; e = cross c, d; store e into ':OUTPATH:';\, }, { 'num' => 4, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, 
registration, contributions); c = filter a by age < 25; d = filter b by age < 25; e = cross c, d; f = filter e by c::age < d::age; store f into ':OUTPATH:';\, } ] }, { 'name' => 'Union', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = foreach a generate name, age; d = foreach b generate name, age; e = union c, d; store e into ':OUTPATH:';\, }, ] }, { 'name' => 'Bincond', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate name, (name matches 'yuri.*' ? age - 10 : (int)age); store b into ':OUTPATH:';\, }, ] }, { 'name' => 'Glob', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10?' as (name, age, gpa); b = filter a by name == 'nick miller'; store b into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/st*ttab10k' as (name, age, gpa); b = filter a by name == 'nick miller'; store b into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab*' as (name, age, gpa); b = filter a by name == 'nick miller'; store b into ':OUTPATH:';\, }, { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab???' 
as (name, age, gpa); b = filter a by name == 'nick miller'; store b into ':OUTPATH:';\, }, { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab[1-9]0[km]' as (name, age, gpa); b = filter a by name == 'nick miller'; store b into ':OUTPATH:';\, }, { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab[13]0[km]' as (name, age, gpa); b = filter a by name == 'nick miller'; store b into ':OUTPATH:';\, }, { 'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/studenttab[12]0[a-l]' as (name, age, gpa); b = filter a by name == 'nick miller'; store b into ':OUTPATH:';\, }, { 'num' => 8, 'pig' => q\a = load ':INPATH:/glob/star/*good' as (name, age, gpa); b = filter a by name == 'nick miller'; store b into ':OUTPATH:';\, }, { 'num' => 9, 'pig' => q\a = load ':INPATH:/glob/star/*' as (name, age, gpa); b = filter a by name == 'nick miller'; store b into ':OUTPATH:';\, } ] }, { 'name' => 'Arithmetic', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate age + 1, (int)gpa + 1; store c into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate (double)age + 1.5, gpa + 1.5; store c into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate age - 30, (int)gpa - 3; store c into ':OUTPATH:';\, }, { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate (double)age - 30.1, gpa - 3.199; store c into ':OUTPATH:';\, }, { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate age * 10, (int)gpa * 2; store c into ':OUTPATH:';\, }, { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate (double)age * 10.1, gpa * 2.752342; store c into ':OUTPATH:';\, }, { 'num' => 7, 'pig' => 
q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate age / 30, (int)gpa / 3; store c into ':OUTPATH:';\, }, { 'num' => 8, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate (double)age / 30.323, gpa / 3.22; store c into ':OUTPATH:';\, }, { 'num' => 9, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate 3 * age + gpa / 9.1 - 2; store c into ':OUTPATH:';\, }, { 'num' => 10, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); c = foreach a generate 3 * (age + gpa) / (9.1 - 2); store c into ':OUTPATH:';\, } ] }, { 'name' => 'Regression', 'tests' => [ { 'num' => 1459894, 'pig' => q\a = load ':INPATH:/singlefile/reg1459894'; b = group a by $0; c = foreach b generate group, COUNT(a.$1); store c into ':OUTPATH:';\, }, { 'num' => 97, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = cogroup a by name, b by name; f = foreach e generate group, COUNT(a), COUNT(b); store f into ':OUTPATH:';\, }, { 'num' => 203, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = group a by name; c = foreach b generate group, COUNT($1); store c into ':OUTPATH:'; --This is a really long script to test that when script size exceeds 1k we can still parse it. --The quick sly fox jumped over the lazy brown dog. 
--he quick sly fox jumped over the lazy brown dog.T --e quick sly fox jumped over the lazy brown dog.Th -- quick sly fox jumped over the lazy brown dog.The --quick sly fox jumped over the lazy brown dog.The --uick sly fox jumped over the lazy brown dog.The q --ick sly fox jumped over the lazy brown dog.The qu --ck sly fox jumped over the lazy brown dog.The qui --k sly fox jumped over the lazy brown dog.The quic -- sly fox jumped over the lazy brown dog.The quick --sly fox jumped over the lazy brown dog.The quick --ly fox jumped over the lazy brown dog.The quick s --y fox jumped over the lazy brown dog.The quick sl -- fox jumped over the lazy brown dog.The quick sly --fox jumped over the lazy brown dog.The quick sly --ox jumped over the lazy brown dog.The quick sly f --x jumped over the lazy brown dog.The quick sly fo -- jumped over the lazy brown dog.The quick sly fox --jumped over the lazy brown dog.The quick sly fox --umped over the lazy brown dog.The quick sly fox j --mped over the lazy brown dog.The quick sly fox ju --ped over the lazy brown dog.The quick sly fox jum\, } ] }, { 'name' => 'Unicode', 'tests' => [ { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/unicode100'; store a into ':OUTPATH:';\, }, ] }, { 'name' => 'Parameters', 'tests' => [ { # test default 'num' => 1, 'pig' => q\%default fname 'studenttab10k' a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); b = foreach a generate name; store b into ':OUTPATH:';\, }, { # test paramter from command line 'num' => 2, 'pig_params' => ['-p', qq(fname='studenttab10k')], 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); b = foreach a generate name; store b into ':OUTPATH:';\, }, { # test paramter from param file 'num' => 3, 'pig_params' => ['-m', ":PARAMPATH:/params_3"], 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); b = foreach a generate name; store b into ':OUTPATH:';\, }, { # test command 'num' 
=> 4, 'pig' => q\%declare cmd `/usr/local/bin/perl -e 'print "studenttab10k"'` a = load ':INPATH:/singlefile/$cmd' using PigStorage() as (name, age, gpa); b = foreach a generate name; store b into ':OUTPATH:';\, }, { # test parameter with a space 'num' => 5, 'pig_params' => ['-p', qq(setting='set default_parallel 100;'),'-p',qq(fname='studenttab10k')], 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); $setting b = foreach a generate name; store b into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); b = foreach a generate name; store b into ':OUTPATH:';\, }, ] }, { 'name' => 'Types', 'tests' => [ { # constants 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = foreach a generate age + 1 + 0.2f + 253645L, gpa+1; store b into ':OUTPATH:';\, }, { # NULL and cast 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int; c = foreach b generate (norm_gpa is null? 0 :norm_gpa); store c into ':OUTPATH:';\, # 'expected_err_regex' => "Encountered Warning DIVIDE_BY_ZERO 2387 time.*", # Driver does currently not support both 'sql' and 'expected_...' verification directives. 
}, { # arithmetic operators and SIZE for int, double and size and concat operators for chararrays 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = foreach a generate age, gpa, age % 25, age + 25, age - 25, age/2, age * 2, SIZE(age), gpa + 10.1, gpa - 1.1 , gpa / 1.2, gpa * 2.5, SIZE(gpa), SIZE(name), CONCAT(name, 'test'); store b into ':OUTPATH:';\, }, { # arithmetic operators and SIZE for long, float and size and concat operators for bytearrays 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); b = foreach a generate age, gpa, age % 2L, age + 2500000000L, age - 2500000000L, age/2L, age * 250000000L, SIZE(age), gpa + 10.1f, gpa - 1.1f , gpa / 1.2f, gpa * 2.6f, SIZE(gpa), SIZE(name), CONCAT(name, name); store b into ':OUTPATH:';\, }, { # equlity and implicit cast 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); b = filter a by age == '25' and gpa < 3; store b into ':OUTPATH:';\, }, { # will need to test against previous version of pig # because in pig currently count includes nulls - this affects # avg 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = group a ALL; c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), MIN(a.name), MAX(a.name), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa); store c into ':OUTPATH:';\, 'floatpostprocess' => 1, 'delimiter' => ' ', }, { # sum, min, max, avg for long and float (declared) 'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); b = group a ALL; c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa); store c into ':OUTPATH:';\, }, { # Explicit casts - arithmetic operators and SIZE for int, double and size and concat operators for chararrays 'num' => 8, 'pig' => q\a = 
load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); b = foreach a generate (int)age % 25, (int)age + 25, (int)age - 25, (int)age/2, (int)age * 2, SIZE((int)age), (double)gpa + 10.1, (double)gpa - 1.1 , (double)gpa / 1.2, (double)gpa * 2.5, SIZE((double)gpa), SIZE((chararray)name), CONCAT((chararray)name, 'test'); store b into ':OUTPATH:';\, }, { # Explicit casts - arithmetic operators and SIZE for long, float 'num' => 9, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); b = foreach a generate (long)age, (long)age % 2L, (long)age + 2500000000L, (long)age - 2500000000L, (long)age/2L, (long)age * 250000000L, SIZE((long)age), (float)gpa + 10.1f, (float)gpa - 1.1f , (float)gpa / 1.2f, (float)gpa * 2.6f, SIZE((float)gpa); store b into ':OUTPATH:';\, }, { # Filter is null for chararray and double and is not null for int 'num' => 10, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = filter a by name is null and age is not null and gpa is null; c = group b ALL; d = foreach c generate COUNT(b); store d into ':OUTPATH:';\, }, { # Filter is not null for chararray and double and is null for int 'num' => 11, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = filter a by name is not null and age is null and gpa is not null; c = group b ALL; d = foreach c generate COUNT(b); store d into ':OUTPATH:';\, }, { # Filter is null for bytearray and float and is not null for long 'num' => 12, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); b = filter a by name is null and age is not null and gpa is null; c = group b ALL; d = foreach c generate COUNT(b); store d into ':OUTPATH:';\, }, { # Filter is not null for bytearray and float and is null for long 'num' => 13, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); b = filter a by name is not null and age 
is null and gpa is not null; c = group b ALL; d = foreach c generate COUNT(b); store d into ':OUTPATH:';\, }, { # test that sorting is based on the type for chararray, int and double 'num' => 14, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = order a by name, age, gpa; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1', '+1n', '-2'], }, { # test that sorting descending is based on the type for chararray, int and double 'num' => 15, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = order a by name desc, age desc, gpa desc; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0r', '-1', '+1nr', '-2'], }, { # test that sorting is based on the type for bytearray, long and float 'num' => 16, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); b = order a by name, age, gpa; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1', '+1n', '-2'], }, { # test that sorting descending is based on the type for chararray, age and float 'num' => 17, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); b = order a by name desc, age desc, gpa desc; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0r', '-1', '+1nr', '-2'], }, { # test precision for doubles is atleast 15 digits 'num' => 18, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = foreach a generate 0.123456789123456+0.123456789123456; store b into ':OUTPATH:';\, }, { # order by string 'num' => 20, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = order a by name; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1'], }, { # order by string desc 'num' => 21, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = order a by name desc; 
store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0r', '-1'], }, { # order by int 'num' => 22, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = order a by age; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+1n', '-2'], }, { # order by int desc 'num' => 23, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = order a by age desc; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+1nr', '-2'], }, { # order by long 'num' => 24, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double); b = order a by age; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+1n', '-2'], }, { # order by long desc 'num' => 25, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double); b = order a by age desc; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+1nr', '-2'], }, { # order by float 'num' => 26, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float); b = order a by gpa; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '-k 3n'], }, { # order by float desc 'num' => 27, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float); b = order a by gpa desc; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '-k 3nr'], }, { # order by double 'num' => 28, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = order a by gpa; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '-k 3n'], }, { # order by double desc 'num' => 29, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = order a by gpa desc; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '-k 3nr'], }, { # order by * 'num' => 30, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as 
(name:chararray, age:int, gpa:double); b = order a by *; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1', '+1n', '-2'], }, { # order by * desc 'num' => 31, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = order a by * desc; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0r', '-1', '+1nr', '-2'], }, { 'num' => 32, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double); c = filter a by age < 20; d = filter b by age < 20; e = cogroup c by name, d by name; f = foreach e generate flatten (c), flatten(d); store f into ':OUTPATH:';\, }, { 'num' => 33, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double); c = filter a by age < 20; d = filter b by age < 20; e = cogroup c by age, d by age; f = foreach e generate flatten (c), flatten(d); store f into ':OUTPATH:';\, }, { 'num' => 34, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa:double); b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:long, registration:chararray, contributions:double); c = filter a by age < 20; d = filter b by age < 20; e = cogroup c by age, d by age; f = foreach e generate flatten (c), flatten(d); store f into ':OUTPATH:';\, }, { 'num' => 35, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:double); b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:float, registration:chararray, contributions:double); c = filter a by age < 20; d = filter b by age < 20; e = cogroup c by age, d by age; f = foreach e generate flatten 
(c), flatten(d); store f into ':OUTPATH:';\, }, { 'num' => 36, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:double); b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:double, registration:chararray, contributions:double); c = filter a by age < 20; d = filter b by age < 20; e = cogroup c by age, d by age; f = foreach e generate flatten (c), flatten(d); store f into ':OUTPATH:';\, }, { # NULL and cast 'num' => 37, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int; c = foreach b generate (norm_gpa is not null? norm_gpa: 0); store c into ':OUTPATH:';\, }, { # constants 'num' => 38, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); b = foreach a generate -(age + 1 + 0.2f + 253645L), -(gpa+1); store b into ':OUTPATH:';\, }, { 'num' => 39, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean); b = foreach a generate instate, true, false; store b into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); b = foreach a generate instate, 'true', 'false'; store b into ':OUTPATH:';\, }, ] }, { 'name' => 'Limit', 'tests' => [ { 'num' => 1, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; b = order a by $0, $1; c = filter b by $0 > 'a'; -- break the sort/limit optimization d = limit c 100; store d into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1'], }, { 'num' => 2, 'ignore23' => 'The record limit pick is different in 23', 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; b = order a by $0, $1; c = limit b 100; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1'], }, { # Make sure that limit higher than 
number of rows doesn't mess stuff up 'num' => 3, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; b = order a by $0, $1; c = filter b by $1 < 1000; d = limit c 100000; store d into ':OUTPATH:';\, }, { 'num' => 4, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; b = distinct a; c = limit b 100; store c into ':OUTPATH:';\, }, { 'num' => 5, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; a1 = foreach a generate $0, $1; b1 = foreach b generate $0, $1; c = union a1, b1; d = limit c 100; store d into ':OUTPATH:';\, }, { 'num' => 6, 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); B = limit A 40; C = filter B by age == 40; D = group C by name; E = foreach D generate group, COUNT(C); store E into ':OUTPATH:';\, }, { 'num' => 7, 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); B = group A by name; C = foreach B { C1 = limit A 10; generate group, COUNT(C1); } store C into ':OUTPATH:';\, }, { 'num' => 8, 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); B = group A by name; C = foreach B { C1 = filter A by age < 40; C2 = limit C1 10; generate group, COUNT(C2); } D = filter C by $1 > 0; store D into ':OUTPATH:';\, }, { 'num' => 9, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; b = order a by $0, $1; c = limit b 1000/10; store c into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k'; b = order a by $0, $1; c = limit b 100; store c into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '-k1,2'], }, { 'num' => 10, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = group a all; c = foreach b generate COUNT(a) as count; d = limit a c.count/10; store d into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, 
gpa:double); b = limit a 1000; store b into ':OUTPATH:';\, }, { 'num' => 11, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = group a all; c = foreach b generate COUNT(a) as count; d = load ':INPATH:/singlefile/votertab10k'; e = group d all; f = foreach e generate COUNT(d) as count; d = limit a c.count/10+f.count/10; store d into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = limit a 2000; store b into ':OUTPATH:';\, } ] }, { 'name' => 'Split', 'tests' => [ { 'num' => 1, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; split a into a1 if $0 > 'm', a2 if $0 <= 'm'; store a1 into ':OUTPATH:';\, }, { 'num' => 2, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; split a into a1 if $0 > 'm', a2 if $0 <= 'm'; store a2 into ':OUTPATH:';\, }, { 'num' => 3, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; split a into a1 if $0 > 'm', a2 if $0 <= 'm'; b = cogroup a1 by $1, a2 by $1; c = foreach b generate flatten(a1), flatten(a2); store c into ':OUTPATH:';\, }, { 'num' => 4, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; split a into a1 if $0 > 'm', a2 if $0 <= 'm'; b = cogroup a1 by $1, a2 by $1; c = foreach b generate flatten($1), flatten($2); store c into ':OUTPATH:';\, }, { 'num' => 5, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); split a into a1 if name > 'm', a2 if name <= 'm'; b = distinct a1; store b into ':OUTPATH:';\, }, { 'num' => 6, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); split a into a1 if age > 50, a2 if age <= 25; b = order a2 by name; store b into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+0', '-1'], }, { 'num' => 7, 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); split a into a1 if name > 'm', a2 if age < 50; b = distinct a1; store b into ':OUTPATH:';\, }, { 'num' 
=> 8, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); split a into a1 if age > 50, a2 if name < 'm'; b2 = foreach a2 generate name, 1; b1 = foreach a1 generate name, 2; c = cogroup b2 by name, b1 by name; d = foreach c generate flatten(group), COUNT($1), COUNT($2); store d into ':OUTPATH:';\, }, { 'num' => 9, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); split a into a1 if age > 50, a2 if name < 'm'; b2 = distinct a2; b1 = order a1 by name; c = cogroup b2 by name, b1 by name; d = foreach c generate flatten(group), COUNT($1), COUNT($2); store d into ':OUTPATH:';\, }, { 'num' => 10, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); split a into a1 if age > 50, a2 otherwise; store a1 into ':OUTPATH:.1'; store a2 into ':OUTPATH:.2';\, 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); split a into a1 if age > 50, a2 if age<=50; store a1 into ':OUTPATH:.1'; store a2 into ':OUTPATH:.2';\, } ] }, { 'name' => 'ImplicitSplit', 'tests' => [ { 'num' => 1, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; b = filter a by $1 > 50; c = filter a by $2 > 3.0; d = cogroup b by $0, c by $0; e = foreach d generate flatten(b), flatten(c); store e into ':OUTPATH:';\, }, { 'num' => 2, 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = filter a by age > 50; c = filter a by gpa > 3.0; d = cogroup b by name, c by name; e = foreach d generate flatten(b), flatten(c); f = filter e by b::age < 75; store f into ':OUTPATH:';\, } ] }, { 'name' => 'describe', 'tests' => [ #JIRA[PIG-373] { 'num' => 1, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); describe A; store A into ':OUTPATH:';\, }, ], }, { 'name' => 'Sample', 'tests' => [ { 'num' => 1, 'pig' => q\ A = load 
':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); S = sample A 2-1-1; store S into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); S = sample A 0; store S into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); B = group A all; C = foreach B generate COUNT(A) as count; D = group A all; E = foreach D generate (double)COUNT(A) as count; S = sample A E.count/C.count; store S into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); S = sample A 1; store S into ':OUTPATH:';\, }, ], }, { 'name' => 'MissingColumns', 'tests' => [ { 'num' => 1, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age: int, gpa: double, extra: chararray); B = filter A by age > 50 or extra > 'm'; D = order B by age, extra; store D into ':OUTPATH:';\, 'sortArgs' => ['-t', ' ', '+1n', '-2'], }, { 'num' => 2, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage(); B = foreach A generate $0, $1 + 1, $3 + 1; C = group B by ($0, $2); D = foreach C generate flatten(group), COUNT($1); store D into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: double); B = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa, extra1, extra2); C = join A by (name, age), B by (name, extra1); store C into ':OUTPATH:';\, # The following SQL should produce empty results, which will match what our pig query should produce. 
} ], }, { 'name' => 'Aliases', # check access of a field using multiple valid aliases 'tests' => [ { # check that a free standing alias reference works # when it is unambiguous # check that a fully qualified alias reference works # check that a partially qualified unambiguous alias reference works 'num' => 1, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); b = group a by name; c = foreach b generate flatten(a); d = filter c by name != 'fred'; e = group d by name; f = foreach e generate flatten(d); g = foreach f generate name, d::a::name as dname, a::name as aname; store g into ':OUTPATH:';\, }, { # check that the "group" alias is available # after a flatten(group) 'num' => 2, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); b = group a by name; c = foreach b generate flatten(group), COUNT(a) as cnt; d = foreach c generate group; store d into ':OUTPATH:';\, }, ], }, { 'name' => 'Lineage', #test if the right cast function is picked 'tests' => [ { 'num' => 1, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence); c = cogroup a ALL, b ALL; d = foreach c generate flatten(a), flatten(b); e = foreach d generate name, flatten(TOKENIZE((chararray)sentence)) as sentence; f = foreach e generate CONCAT((chararray)name, sentence); store f into ':OUTPATH:';\, }, { 'num' => 2, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa: double); b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence); c = cross a, b; d = foreach c generate name, flatten(TOKENIZE((chararray)sentence)) as sentence; e = foreach d generate CONCAT((chararray)name, sentence); store e into ':OUTPATH:';\, }, { 'num' => 3, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as 
(name, age, gpa: double); b = foreach a generate age as student_age; c = filter b by student_age > 50; d = foreach c generate student_age + 10; store d into ':OUTPATH:';\, }, { 'num' => 4, 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = filter a by name lt 'b'; c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, (int)age); d = foreach c generate $0#'alice young'; split d into e if $0 is not null, f if $0 is null; store e into ':OUTPATH:';\, } ], }, { 'name' => 'Casts', 'tests' => [ { # check that a cast of a value of type # same as the result type of the cast works # when the value is treated as a bytearray 'num' => 1, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); b = foreach a generate name, age, gpa; store b into ':OUTPATH:.intermediate' using BinStorage(); c = load ':OUTPATH:.intermediate' using BinStorage(); -- after this load, the fields are treated as bytearrays though -- they are actually "typed", test that the implicit casts -- introduced by the operations in the foreach below will work fine d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2; store d into ':OUTPATH:';\, 'notmq' => 1, }, { # check that a cast of a value of type # same as the result type of the cast works # when the value is treated as a bytearray 'num' => 2, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float); b = foreach a generate name, age, gpa; store b into ':OUTPATH:.intermediate' using BinStorage(); c = load ':OUTPATH:.intermediate' using BinStorage(); -- after this load, the fields are treated as bytearrays though -- they are actually "typed", test that the implicit casts -- introduced by the operations in the foreach below will work fine d = 
foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f; store d into ':OUTPATH:';\, 'notmq' => 1, }, { #check that a cast of a value of type #same as the result type of the cast works #when the value is treated as a bytearray 'num' => 3, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float); b = group a by name; c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2']; -- store the bag, tuple and map store c into ':OUTPATH:.intermediate' using BinStorage(); d = load ':OUTPATH:.intermediate' using BinStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]); -- after this load, the fields are treated as bytearrays though -- they are actually "typed", test that the implicit casts -- introduced by the operations in the foreach below will work fine e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2'; store e into ':OUTPATH:';\, 'notmq' => 1, }, { # check that a cast of a value of type # same as the result type of the cast works # when the value is treated as a bytearray 'num' => 4, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); b = foreach a generate name, age, gpa; store b into ':OUTPATH:.intermediate' using PigStorage(); c = load ':OUTPATH:.intermediate' using PigStorage(); -- after this load, the fields are treated as bytearrays though -- they are actually "typed", test that the implicit casts -- introduced by the operations in the foreach below will work fine d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2; store d into ':OUTPATH:';\, 'notmq' => 1, }, { # check that a cast of a value of type # same as the result type of the cast works # when the value is treated as a bytearray 'num' => 5, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\ a = load 
':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float); b = foreach a generate name, age, gpa; store b into ':OUTPATH:.intermediate' using PigStorage(); c = load ':OUTPATH:.intermediate' using PigStorage(); -- after this load, the fields are treated as bytearrays though -- they are actually "typed", test that the implicit casts -- introduced by the operations in the foreach below will work fine d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f; store d into ':OUTPATH:';\, 'notmq' => 1, }, { #check that a cast of a value of type #same as the result type of the cast works #when the value is treated as a bytearray 'num' => 6, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float); b = group a by name; c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2']; -- store the bag, tuple and map store c into ':OUTPATH:.intermediate' using PigStorage(); d = load ':OUTPATH:.intermediate' using PigStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]); -- after this load, the fields are treated as bytearrays though -- they are actually "typed", test that the implicit casts -- introduced by the operations in the foreach below will work fine e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2'; store e into ':OUTPATH:';\, 'notmq' => 1, }, { 'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name, age, gpa, instate); b = foreach a generate (boolean)instate; c = filter b by instate == true; store c into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); b = foreach a generate instate; c = filter b by instate == 'true'; store c into ':OUTPATH:';\, } ], }, { 'name' => 'ClassResolution', 'tests' => [ { # check that Loader 
specified without a package # name works if that package name is specified # in udf.import.list 'num' => 1, 'floatpostprocess' => 1, 'delimiter' => ' ', 'java_params' => ['-Dudf.import.list=org.apache.pig.test.udf.storefunc'], 'pig' => q\ register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); b = foreach a generate CONCAT('(', name), CONCAT((chararray)age, ' )'); store b into ':OUTPATH:.intermediate' using PigStorage(','); c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.DumpLoader(); store c into ':OUTPATH:';\, 'notmq' => 1, }, ], }, { 'name' => 'MergeJoin', 'tests' => [ # Simplest merge-join. { 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; c = order a by $0; d = order b by $0; store c into ':OUTPATH:.intermediate1'; store d into ':OUTPATH:.intermediate2'; exec; e = load ':OUTPATH:.intermediate1'; f = load ':OUTPATH:.intermediate2'; g = join e by $0, f by $0 using 'merge'; store g into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; g = join a by $0, b by $0; store g into ':OUTPATH:';\, 'notmq' => 1, }, # Merge-join with left-side filter { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; c = order a by $0; d = order b by $0; store c into ':OUTPATH:.intermediate1'; store d into ':OUTPATH:.intermediate2'; exec; e = load ':OUTPATH:.intermediate1'; h = filter e by $1 > 30; f = load ':OUTPATH:.intermediate2'; g = join h by $0, f by $0 using 'merge'; store g into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; h = filter a by $1 > 30; g = join h by $0, b by $0; store g into ':OUTPATH:';\, 'notmq' => 1, }, # Merge-join with right-side filter { 'num' => 3, 'pig' => q\a = 
load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; c = order a by $0; d = order b by $0; store c into ':OUTPATH:.intermediate1'; store d into ':OUTPATH:.intermediate2'; exec; e = load ':OUTPATH:.intermediate1'; f = load ':OUTPATH:.intermediate2'; i = filter f by $2 != 'democrat'; g = join e by $0, i by $0 using 'merge'; store g into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; i = filter b by $2 != 'democrat'; g = join a by $0, i by $0; store g into ':OUTPATH:';\, 'notmq' => 1, }, # Merge-join with schemas { 'num' => 4, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; c = order a by $0; d = order b by $0; store c into ':OUTPATH:.intermediate1'; store d into ':OUTPATH:.intermediate2'; exec; e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float); f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float); g = join e by $0, f by $0 using 'merge'; store g into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; g = join a by $0, b by $0; store g into ':OUTPATH:';\, 'notmq' => 1, }, # Merge-join with key as expression { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; c = order a by $0,$1; d = order b by $0,$1; store c into ':OUTPATH:.intermediate1'; store d into ':OUTPATH:.intermediate2'; exec; e = load ':OUTPATH:.intermediate1'; f = load ':OUTPATH:.intermediate2'; g = join e by ($0,$1), f by ($0,$1) using 'merge'; store g into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; g = join a by ($0,$1), b by ($0,$1); store g into ':OUTPATH:';\, 'notmq' => 1, }, # Merge-join with key as 
expression This expression guarantees ordering { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; c = order a by $1; d = order b by $1; store c into ':OUTPATH:.intermediate1'; store d into ':OUTPATH:.intermediate2'; exec; e = load ':OUTPATH:.intermediate1'; f = load ':OUTPATH:.intermediate2'; g = join e by ($1+10), f by ($1+10) using 'merge'; store g into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; g = join a by ($1+10), b by ($1+10) ; store g into ':OUTPATH:';\, 'notmq' => 1, }, # Merge-join with nulls in keys and data. { 'num' => 7, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k'; b = load ':INPATH:/singlefile/voternulltab10k'; c = order a by $0; d = order b by $0; store c into ':OUTPATH:.intermediate1'; store d into ':OUTPATH:.intermediate2'; exec; e = load ':OUTPATH:.intermediate1'; f = load ':OUTPATH:.intermediate2'; g = join e by $0, f by $0 using 'merge'; store g into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k'; b = load ':INPATH:/singlefile/voternulltab10k'; g = join a by $0, b by $0; store g into ':OUTPATH:';\, 'notmq' => 1, }, # Merge-join with one file across multiple blocks { 'num' => 8, 'execonly' => 'mapred', # since this join will run out of memory in local mode 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\a = load ':INPATH:/singlefile/votertab10k'; b = load ':INPATH:/singlefile/studenttab20m'; h = filter b by $2 < 1.5; c = order a by $0; d = order h by $0 parallel 1; store c into ':OUTPATH:.intermediate1'; store d into ':OUTPATH:.intermediate2'; exec; e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, reg:chararray, contrib:float); f = load ':OUTPATH:.intermediate2'as (name:chararray, age:int, gpa:float); g = join e by $0, f by $0 using 'merge'; i = filter g by $2 == 'democrat' and $1 > 76; store i into ':OUTPATH:';\, 
'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k'; b = load ':INPATH:/singlefile/studenttab20m'; h = filter b by $2 < 1.5; g = join a by $0, h by $0; i = filter g by $2 == 'democrat' and $1 > 76; store i into ':OUTPATH:';\, 'notmq' => 1, }, # Merge-join with join on numeric key { 'num' => 9, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); c = order a by age; d = order b by age; store c into ':OUTPATH:.intermediate1'; store d into ':OUTPATH:.intermediate2'; exec; e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float); f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float); g = join e by age, f by age using 'merge'; store g into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); g = join a by age, b by age; store g into ':OUTPATH:';\, 'notmq' => 1, }, ] }, { 'name' => 'SkewedJoin', 'floatpostprocess' => 1, 'delimiter' => ' ', 'tests' => [ { 'num' => 1, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name, b by name using 'skewed' parallel 8; store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name, b by name; store e into ':OUTPATH:';\, }, # basic join with no skewed keys { 'num' => 2, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=10000'], 'pig' => q\a = load 
':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name, b by name using 'skewed'; store e into ':OUTPATH:';\, 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name, b by name ; store e into ':OUTPATH:';\, }, # join after filtering { 'num' => 3, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by $0, d by $0 using 'skewed' parallel 8; store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by $0, d by $0 ; store e into ':OUTPATH:';\, }, # join by two columns { 'num' => 4, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by (name, age), d by (name, age) using 'skewed' parallel 8; store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by (name, age), d by (name, age) ; store e into ':OUTPATH:';\, }, # join with add { 'num' => 
5, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=50'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by age+10, d by age + 20 using 'skewed' parallel 10; store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by age+10, d by age + 20 ; store e into ':OUTPATH:';\, }, # join with split { 'num' => 6, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = filter a by $1 > 25; c = join a by $0, b by $0 using 'skewed' parallel 7; store c into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = filter a by $1 > 25; c = join a by $0, b by $0 ; store c into ':OUTPATH:';\, }, # join with UDF { 'num' => 7, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=20'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by SIZE(name), d by SIZE(name) using 'skewed' parallel 7; store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; e = join c by SIZE(name), d by SIZE(name) ; store e into ':OUTPATH:';\, }, # left outer join { 'num' => 8, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as 
(name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name left outer, b by name using 'skewed' parallel 8; store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name left outer, b by name ; store e into ':OUTPATH:';\, }, # right outer join { 'num' => 9, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name right outer, b by name using 'skewed' parallel 8; store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name right outer, b by name ; store e into ':OUTPATH:';\, }, # full outer join { 'num' => 10, 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name full outer, b by name using 'skewed' parallel 8; store e into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = join a by name full outer, b by name ; store e into ':OUTPATH:';\, }, ] }, { 'name' => 'CollectedGroup', 'tests' => [ # Simplest collected group. 
{ 'num' => 1, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = order a by $0; store b into ':OUTPATH:.intermediate'; exec; register :FUNCPATH:/testudf.jar; c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader(); d = group c by $0 using 'collected'; e = foreach d generate group, COUNT(c); store e into ':OUTPATH:';\, 'notmq' => 1, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; d = group a by $0 ; e = foreach d generate group, COUNT(a); store e into ':OUTPATH:';\, }, # Collected group with filter { 'num' => 2, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = order a by $0; store b into ':OUTPATH:.intermediate'; exec; register :FUNCPATH:/testudf.jar; c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader(); d = filter c by $1 > 30; e = group d by $0 using 'collected'; f = foreach e generate group, COUNT(d); store f into ':OUTPATH:';\, 'notmq' => 1, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; d = filter a by $1 > 30; e = group d by $0 ; f = foreach e generate group, COUNT(d); store f into ':OUTPATH:';\, }, # Collected group with schemas { 'num' => 3, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; b = order a by $0; store b into ':OUTPATH:.intermediate'; exec; register :FUNCPATH:/testudf.jar; c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float); d = group c by $0 using 'collected'; e = foreach d generate group, MAX(c.age); store e into ':OUTPATH:';\, 'notmq' => 1, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); d = group a by $0 ; e = foreach d generate group, MAX(a.$1); store e into ':OUTPATH:';\, }, # Collected group with multiple columns { 'num' => 4, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = order a by name, 
age; store b into ':OUTPATH:.intermediate'; exec; register :FUNCPATH:/testudf.jar; c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float); d = group c by (name, age) using 'collected'; e = foreach d generate group.name, group.age, MIN(c.gpa); store e into ':OUTPATH:';\, 'notmq' => 1, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); d = group a by (name, age) ; e = foreach d generate group.name, group.age, MIN(a.gpa); store e into ':OUTPATH:';\, }, # Collected group with nulls in keys and data. { 'num' => 5, 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k'; b = order a by $0; store b into ':OUTPATH:.intermediate'; exec; register :FUNCPATH:/testudf.jar; c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float); d = group c by $0 using 'collected'; e = foreach d generate group, SUM(c.$1); store e into ':OUTPATH:';\, 'notmq' => 1, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float); d = group a by $0 ; e = foreach d generate group, SUM(a.$1); store e into ':OUTPATH:';\, }, # Collected group with numeric key { 'num' => 6, 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = order a by age; store b into ':OUTPATH:.intermediate'; exec; register :FUNCPATH:/testudf.jar; c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float); d = group c by age using 'collected'; e = foreach d generate group, AVG(c.gpa), COUNT(c.name); store e into ':OUTPATH:';\, 'notmq' => 1, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); d = group a by age ; e = foreach d generate group, AVG(a.gpa), COUNT(a.name); store e into 
':OUTPATH:';\, }, ] }, { 'name' => 'SecondarySort', 'tests' => [ { # simple order by 'num' => 1, 'java_params' => ['-Dpig.accumulative.batchsize=5'], 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by age parallel 10; c = foreach b { d = order a by name; generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d); }; store c into ':OUTPATH:';\, }, { # order by desc 'num' => 2, 'java_params' => ['-Dpig.accumulative.batchsize=5'], 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by age parallel 10; c = foreach b { d = order a by name desc; generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d); }; store c into ':OUTPATH:';\, }, { # order by float type 'num' => 3, 'java_params' => ['-Dpig.accumulative.batchsize=5'], 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = group a by age parallel 10; c = foreach b { d = order a by gpa; generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa); }; store c into ':OUTPATH:';\, }, # order by string type { 'num' => 4, 'java_params' => ['-Dpig.accumulative.batchsize=5'], 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = group a by age parallel 10; c = foreach b { d = order a by name; generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name); }; store c into ':OUTPATH:';\, }, # simple distinct { 'num' => 5, 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = group a by age parallel 10; c = foreach b { d = a.name; e = distinct d; generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(e); }; store c into 
':OUTPATH:';\, }, # distinct on tuple { 'num' => 6, 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = group a by age parallel 10; c = foreach b { d = distinct a; generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d); }; store c into ':OUTPATH:';\, }, # sort by two columns { 'num' => 7, 'java_params' => ['-Dpig.accumulative.batchsize=5'], 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = group a by age parallel 10; c = foreach b { d = order a by gpa, name desc; generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa), org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name); }; store c into ':OUTPATH:';\, }, # sort, distinct mix { 'num' => 8, 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = group a by age parallel 10; c = foreach b { d = order a by name; e = d.gpa; f = distinct e; generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f); }; store c into ':OUTPATH:';\, }, # sort, distinct mix { 'num' => 9, 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); b = group a by age parallel 10; c = foreach b { d = order a by gpa; e = d.gpa; f = distinct e; generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f); }; store c into ':OUTPATH:';\, }, { # secondary sort boolean 'num' => 10, 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean); b = group a by age; c = foreach b { d = 
order a by instate; generate group, flatten(d); }; store c into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); b = group a by age; c = foreach b { d = order a by instate; generate group, flatten(d); }; store c into ':OUTPATH:';\, } ] }, { 'name' => 'Accumulator', 'tests' => [ { 'num' => 1, 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); e = cogroup a by name, b by name parallel 8; f = foreach e generate group, SUM(a.age) as s; g = filter f by s>0; store g into ':OUTPATH:';\, }, { 'num' => 2, 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); e = group a by name parallel 8; f = foreach e generate group, COUNT(a), MAX(a.contributions), MIN(a.contributions) ; store f into ':OUTPATH:';\, }, { 'num' => 3, 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); e = group a by name parallel 8; f = foreach e generate group, (MAX(a.contributions)-MIN(a.contributions))*COUNT(a) ; store f into ':OUTPATH:';\, }, { 'num' => 4, 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); e = group a by name parallel 8; f = foreach e { g = distinct a.age; generate group, COUNT(g);} store f into ':OUTPATH:';\, } ] }, { 'name' => 'PruneColumns', 'tests' => [ { 'num' => 1, 'execonly' => 'mapred', # 
studenttab20m not available in local mode 'pig' => q\ a = load ':INPATH:/singlefile/studenttab20m' using PigStorage() as (name, age, gpa); b = foreach a generate age; store b into ':OUTPATH:';\, }, { # jython udf which returns an array 'num' => 12, 'pig' => q\ register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:chararray, gpa:chararray); b = foreach a generate CONCAT(CONCAT(age, ' '), gpa) as sentence; c = foreach b generate flatten(myfuncs.tokenize(sentence)); store c into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray); b = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray); c = foreach a generate age; d = foreach b generate gpa; e = union c, d; store e into ':OUTPATH:';\, } ] }, { 'name' => 'Bzip', 'tests' => [ { # test reading and writing out files with .bz2 extension 'num' => 1, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); store a into ':OUTPATH:.intermediate.bz2'; b = load ':OUTPATH:.intermediate.bz2'; store b into ':OUTPATH:';\, 'notmq' => 1, }, { # test reading and writing with .bz extension 'num' => 2, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); store a into ':OUTPATH:.intermediate.bz'; b = load ':OUTPATH:.intermediate.bz'; store b into ':OUTPATH:';\, 'notmq' => 1, }, ] }, { 'name' => 'Scalar', 'tests' => [ { # test scalar in foreach (most common) 'num' => 1, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = group a all; c = foreach b generate AVG(a.gpa) as avg, MAX(a.gpa) as max; y = foreach a generate name, (gpa - c.avg) / c.max; store y into ':OUTPATH:';\, 'floatpostprocess' => 1, 'delimiter' => ' ', }, { # test scalar in filter 'num' => 2, 'pig' 
=> q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = group a all; c = foreach b generate AVG(a.gpa) as avg; y = filter a by gpa > c.avg; store y into ':OUTPATH:';\, }, { # test scalar with two branch 'num' => 3, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = group a all; c = foreach b generate AVG(a.age) as avg; x = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions); y = filter x by age > c.avg; store y into ':OUTPATH:';\, }, { # test with scalar from two inputs 'num' => 4, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = group a all; c = foreach b generate AVG(a.age) as avg; d = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions); e = group d all; f = foreach e generate AVG(d.age) as avg; y = foreach a generate age/c.avg, age/f.avg; store y into ':OUTPATH:';\, }, ] }, { 'name' => 'Scripting', 'tests' => [ { # test common 'num' => 1, 'pig' => q\ register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate myfuncs.square(age); store b into ':OUTPATH:';\, }, { # test common 'num' => 2, 'pig' => q\ register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = foreach a generate (chararray)name; c = distinct b; d = foreach c generate myfuncs.concat(name) as name; e = order d by name; store e into ':OUTPATH:';\, 'sortArgs' => ['-t', ' '], }, { # test that functions with same names resolve correctly across name spaces 'num' => 10, 'pig' => q\ register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, 
gpa:double, instate:boolean); b = foreach a generate name, myfuncs.adjustgpa(gpa, instate); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); b = foreach a generate name, (instate=='true'?gpa:gpa+1); store b into ':OUTPATH:';\, }, { # test that functions with same names resolve correctly across name spaces 'num' => 11, 'ignore' => 1, # PIG-2596 'pig' => q\ register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = foreach a generate name, myfuncs.isretired(age); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); b = foreach a generate name, (age>=60?1:0); store b into ':OUTPATH:';\, } ] }, { 'name' => 'RubyUDFs', 'tests' => [ { # test integer square 'num' => 1, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate myfuncs.square(age); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate age * age; store b into ':OUTPATH:';\, }, { # test string concat and referencing function without a namespace 'num' => 2, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa); b = foreach a generate myfuncs.concat(name, name); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); b = foreach a generate CONCAT(name, name); store b into ':OUTPATH:';\, 
}, { # test long and double square, plus two references to the same UDF with different schemas 'num' => 3, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:double); b = foreach a generate myfuncs.square(age), myfuncs.square(gpa); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate age * age, gpa * gpa; store b into ':OUTPATH:';\, 'floatpostprocess' => 1, 'delimiter' => ' ', }, { # test method with no schema decorator (ie, returns bytearray) 'num' => 4, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = foreach a generate myfuncs.byteconcat(name, name); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = foreach a generate CONCAT(name, name); store b into ':OUTPATH:';\, }, { # test method with complex types 'num' => 5, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); b = foreach a generate flatten(myfuncs.complexTypes(m, t, b)) as (mm, mt, mb); c = foreach b generate mm#'name', mt.$0, mb.$0; store c into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); b = foreach a generate SIZE(m#'name'), t.$2, b.$2; store b into ':OUTPATH:';\, }, { # test null input and output 'num' => 6, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as 
myfuncs; a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate myfuncs.square(age); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate age * age; store b into ':OUTPATH:';\, }, { # test functions that call other functions and include other files 'num' => 7, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate myfuncs.redirect(age); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate age * age; store b into ':OUTPATH:';\, }, { # test that functions with same names resolve correctly across name spaces 'num' => 8, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as morefuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate myfuncs.square(age), morefuncs.cube(age), morefuncs.CUBE(age); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate age * age, age * age * age, age * age * age; store b into ':OUTPATH:';\, }, { # test algebraic functions 'num' => 9, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = group a by name; c = foreach b generate group, myfuncs.Count(a); store c into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, 
age:int, gpa:double); b = group a by name; c = foreach b generate group, COUNT(a); store c into ':OUTPATH:';\, }, { # test accumulator functions 'num' => 10, 'java_params' => ['-Dpig.accumulative.batchsize=5'], 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = group a by name; c = foreach b generate group, myfuncs.Sum(a.age), myfuncs.Sum(a.gpa); d = foreach c generate $0, $1, (double)((int)$2*100)/100; store d into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = group a by name; c = foreach b generate group, SUM(a.age), SUM(a.gpa); d = foreach c generate $0, $1, (double)((int)$2*100)/100; store d into ':OUTPATH:';\, }, { 'num' => 11, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate flatten(myfuncs.reverse(name, age)); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach a generate age, name; store b into ':OUTPATH:';\, }, { 'num' => 12, 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = filter a by myfuncs.ISEVEN(age); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = filter a by age%2==0; store b into ':OUTPATH:';\, }, { 'num' => 13, 'java_params' => ['-Dpig.accumulative.batchsize=5'], 'pig' => q\ register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, 
age:int, gpa:double); b = foreach (group a all) generate FLATTEN(myfuncs.AppendIndex(a)); store b into ':OUTPATH:';\, 'verify_pig_script' => q\ register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); b = foreach (group a all) generate FLATTEN(org.apache.pig.test.udf.evalfunc.AppendIndex(a)); store b into ':OUTPATH:';\, }, ] }, { 'name' => 'Native', 'tests' => [ { # test common 'num' => 1, 'pig' => q\ rmf table_testNativeMRJobSimple_input rmf table_testNativeMRJobSimple_output a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; store b into ':OUTPATH:';\, 'notmq' => 1, }, { # test complex 'num' => 2, 'pig' => q\ rmf table_testNativeMRJobSimple_input rmf table_testNativeMRJobSimple_output a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = foreach a generate name; c = distinct b; d = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store c into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; e = order d by name; store e into ':OUTPATH:';\, 'sortArgs' => ['-t', ' '], 'notmq' => 1, }, { # test streaming 'num' => 3, 'pig' => q\ rmf table_testNativeMRJobSimple_input rmf table_testNativeMRJobSimple_output a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = mapreduce ':MAPREDJARS:/hadoop-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper /bin/cat -reducer 
/usr/bin/wc`; store b into ':OUTPATH:';\, 'pig23' => q\ rmf table_testNativeMRJobSimple_input rmf table_testNativeMRJobSimple_output a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = mapreduce ':MAPREDJARS:/hadoop-0.23.0-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper /bin/cat -reducer /usr/bin/wc`; store b into ':OUTPATH:';\, 'notmq' => 1, }, ] }, { 'name' => 'Partitioner', 'tests' => [ { # test group 'num' => 1, 'execonly' => 'mapred', # since this join will run out of memory in local mode 'pig' => q\register :FUNCPATH:/testudf.jar; a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa); b = group a by age PARTITION BY org.apache.pig.test.utils.SimpleCustomPartitioner2 parallel 2; c = foreach b generate group, COUNT(a); store c into ':OUTPATH:';\, }, ] }, { #################################################################### # SUB : CastScalar # FEATURE: adds functionality that allows to cast elements of a single-tuple relation into a scalar value. 
# JIRA: PIG-1434 # # TEST ITEMS: # 1 Test syntax # 2 Test scalar for simple data type # 3 Test scalar for complex data type: tuple, bag, map # 4 Test implicit cast # 5 Test explicit cast # 6 Positional parameter # 7 Cast within an aggregate function # 8 Cast within a UDF function # 9 Cast with a FOREACH # 10 Cast with a FILTER # 11 Cast with a SPLIT # 12 Cast in a JOIN # 13 Multiquery # 14 Cast on a schema that cannot be inferred should result in bytearray # 15 Replicated Join # 16 Test operations such as R1 * (int)R1 # 17 CheckSingular(*) # 18 missing field in scalar file # 19 scalar referenced from an empty file # 20 empty input directory # 21 Single row vs Multiple Row # 22 Cast on a multi-field tuple # 23 Reference a non-scalar as a scalar # 24 Test multiple loaders 'name' => 'CastScalar', 'tests' => [ { # 2 Test scalar for simple data type # 3 Test scalar for complex data type: tuple, bag, map # 9 Cast with a FOREACH #INPATH = /user/hadoopqa/pig/tests/data 'num' => 1, 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); b = group a all; c = foreach b generate SUM(a.age) as total; d = foreach a generate name, age+(double)c.total as d_sum; e = order d by name, d_sum; store d into ':OUTPATH:'; #, # 6 Positional parameter }, { 'num' => 2, 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); b = group a all; c = foreach b generate SUM(a.age) as total; d = foreach a generate name, age+(double)c.$0 as d_sum; e = order d by name, d_sum; store d into ':OUTPATH:'; #, # 2 Test scalar for simple data type # 3 Test scalar for complex data type:map # 9 Cast with a FOREACH # 13 Multiquery # 24 Test multiple loaders #INPATH = /user/hadoopqa/pig/tests/data }, { # 4 Test implicit cast # 10 Cast with a FILTER # # I set the benchmark to use "19" because pig truncates during cast and sql rounds up. 
'num' => 7, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, AVG(a.gpa)+20 as avg_gpa; d = order c by avg_gpa; simple_scalar = limit d 1; f = filter a by age < (int) simple_scalar.avg_gpa; g = order f by name, age, gpa; store g into ':OUTPATH:';\, }, { # 5 Test explicit cast # 10 Cast with a FILTER 'num' => 8, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, AVG(a.age) AS average; d = order c by average; simple_scalar = limit d 1; d = filter a by age > (int) simple_scalar.average; e = foreach d generate name, age; store e into ':OUTPATH:'; \, }, { # 5 Test explicit cast # 6 Positional parameter 'num' => 9, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, AVG(a.age) AS average; d = order c by average; simple_scalar = limit d 1; d = filter a by age > (int) simple_scalar.$1; e = foreach d generate name, age; store e into ':OUTPATH:'; \, }, { # 4 Test implicit cast # 6 Positional parameter # 10 Cast with a FILTER 'num' => 10, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, AVG(a.age) AS average; d = order c by average; simple_scalar = limit d 1; d = filter a by age > simple_scalar.$1; e = foreach d generate name, age; store e into ':OUTPATH:'; \, }, { # 4 Test implicit cast # 6 Positional parameter # 11 Cast with a SPLIT 'num' => 11, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, AVG(a.age) AS average; d = order c by average; simple_scalar = limit d 1; split a into X1 if age > (int) simple_scalar.$1, X2 if age < 20; split a into X3 if age > (int) simple_scalar.$1, X4 if age > 70; store X1 into ':OUTPATH:.1'; store X2 into ':OUTPATH:.2'; store X3 into ':OUTPATH:.3'; store X4 
into ':OUTPATH:.4'; \, }, { # 4 Test implicit cast # 6 Positional parameter # 12 Cast with a JOIN 'num' => 12, 'pig' => q\ a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); c = filter a by age < 20; d = filter b by age < 20; simple_scalar = limit d 1; e = join c by name, d by name; f= filter e by c::age <(int)simple_scalar.age; store f into ':OUTPATH:';\, }, ] },{ 'name' => 'udf_TOBAGandTOTUPLE', 'sortResults' => 1, 'floatpostprocess' => 1, 'delimiter' => ' ', 'tests' => [ { # TEST : resulting schema for TOBAG/TOTUPLE with simple types # TEST : resulting schema for TOBAG/TOTUPLE with positional parameters # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE and standard projections # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE using AS clause 'num' => 1 ,'pig' => q? A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B = limit A 10; Gen1 = FOREACH B GENERATE $0, $1, $2 ; GroupById = GROUP B BY id; B1 = foreach B generate TOBAG( intnum1000, id, intnum5); B3 = foreach B generate TOBAG( $0, $1, $2); T1= foreach B generate TOTUPLE( intnum1000, id, intnum5); T2= foreach B generate TOTUPLE( $0, $1, $2); T3 = foreach B generate TOTUPLE( $0, $0, $0); T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7; T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7; T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate; describe Gen1; describe GroupById; describe B1; describe B3; describe T1; describe T2; describe T3; describe T4; describe T5; describe T6; ? 
,'expected_out_regex' => 'B1: {{int}}' ,'expected_out_regex' => 'B3: {{int}}' ,'expected_out_regex' => 'T1: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}' ,'expected_out_regex' => 'T2: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}' ,'expected_out_regex' => 'T3: {org.apache.pig.builtin.totuple_intnum1000.*: (intnum1000: int,intnum1000: int,intnum1000: int)}' ,'expected_out_regex' => 'T4: {{int},org.apache.pig.builtin.totuple_intnum100.*: (intnum100: int,intnum: int,longnum: long),floatnum: float,doublenum: double}' ,'expected_out_regex' => 'T5: {intnum1000: int,id: int,org.apache.pig.builtin.totuple_intnum100.*: (intnum5: int,intnum100: int,intnum: int).*{NULL}.*doublenum: double}' ,'expected_out_regex' => "T6: {intnum1000: int,org.apache.pig.builtin.totuple_intnum1000.*: \\(intnum1000: int,intnum1000: int,intnum1000: int\\),{\\(int\\)},duplicate: int}" }, { # TEST : bag of mixed data types # TEST : Order # TEST : positional parameters 'num' => 2 ,'pig' => q? A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); C = foreach A generate TOBAG( id, floatnum, doublenum ); D = foreach A generate TOBAG( id, intnum); E = foreach A generate TOBAG( (float) id,floatnum ); F = foreach A generate TOBAG( (long) id,longnum ); G = foreach A generate TOBAG( (double) id,doublenum ); describe C; describe D; describe E; describe F; describe G; ? 
,'expected_out_regex' => 'C: {{\\(NULL\\)}}' ,'expected_out_regex' => 'D: {{\\(int\\)}}' ,'expected_out_regex' => 'E: {{\\(float\\)}}' ,'expected_out_regex' => 'F: {{\\(long\\)}}' ,'expected_out_regex' => 'G: {{\\(double\\)}}' }, { # TEST : TOBAG/TOTUPLE with simple types # TEST : TOBAG/TOTUPLE with positional parameters # TEST : various projects using a combination of TOBAG/TOTUPLE and standard projections # TEST : various projects using a combination of TOBAG/TOTUPLE using AS clause 'num' => 3 ,'pig' => q? A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B = limit A 10; B1 = foreach B generate TOBAG( intnum1000, id, intnum5); B2 = foreach B generate TOBAG( $0, $1, $2); T1= foreach B generate TOTUPLE( intnum1000, id, intnum5); T2= foreach B generate TOTUPLE( $0, $1, $2); T3 = foreach B generate TOTUPLE( $0, $0, $0); T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7; T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7; T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate; Gen1 = FOREACH B GENERATE $0, $1, $2 ; GroupById = GROUP B BY id; store Gen1 into ':OUTPATH:.1'; store GroupById into ':OUTPATH:.2'; store B1 into ':OUTPATH:.3'; store B2 into ':OUTPATH:.4'; store T1 into ':OUTPATH:.5'; store T2 into ':OUTPATH:.6'; store T3 into ':OUTPATH:.7'; store T4 into ':OUTPATH:.8'; ? }, { # TEST : cast for TOTUPLE/TOBAG 'num' => 4 ,'ignore' => 1 # different error message for different version of hadoop ,'pig' => q? A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B= limit A 10; C = foreach B generate $0, TOTUPLE((int) $0, (long) $0, (double) $0), TOBAG( (float) $0, (chararray) $0), $0; store C into ':OUTPATH:'; ? 
,'expected_err_regex' => 'ERROR 1108: Duplicate schema alias' ,'rc' => 6 }, { # TEST : cast for TOTUPLE/TOBAG 'num' => 5 ,'pig' => q? A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B= limit A 1; C = foreach B generate $0, TOTUPLE((int) $0); D = foreach B generate $0, TOTUPLE((long) $0); E = foreach B generate $0, TOTUPLE((double) $0); F = foreach B generate $0, TOTUPLE((float) $0); G = foreach B generate $0, TOTUPLE((chararray) $0); store B into ':OUTPATH:.1'; store C into ':OUTPATH:.2'; store D into ':OUTPATH:.3'; store E into ':OUTPATH:.4'; store F into ':OUTPATH:.5'; store G into ':OUTPATH:.6'; ? }, { #TEST more complicated nested functions such as TOTUPLE(TOBAG()) #TEST more complicated nested functions such as TOBAG(TOTUPLE()) #TEST more complicated nested functions such as TOTUPLE(TOTUPLE()) #TEST more complicated nested functions such as TOBAG(TOBAG()) 'num' => 6 ,'pig' => q? A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B = limit A 10; tint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) ); bint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); binb = foreach B generate TOBAG( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); tinb = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); store B into ':OUTPATH:.1'; store tint into ':OUTPATH:.2'; store bint into ':OUTPATH:.3'; store binb into ':OUTPATH:.4'; store tinb into ':OUTPATH:.5'; ? }, { #TEST arithmetic operation in TOTUPLE and TOBAG #TEST aggregate function - NOT IMPLEMENTED #TEST tuple with 50+ items #TEST with null 'num' => 7 ,'pig' => q? 
A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B = limit A 10; B1= foreach B generate TOTUPLE( $1, $2, $3); T1= foreach B generate TOTUPLE( $1, $2, $3); R1= foreach B generate TOTUPLE( $1, $0+1, $0+2, $0+3),TOBAG($0+4, $0+1 ); R2= foreach B generate TOTUPLE( $0, $1, $2, $3, $4, $5, $6, $7, (int) 8, (int) 9 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $6, $7 , (int) 29, (int) 30, $0, $1, $2, $3, $4, $5, $6, $7, (int) 39, (int) 40 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $5, $7 ); R3= foreach B generate $0, TOTUPLE(0,0,0), TOBAG( 0, 0 ); R4= foreach B generate $0, TOTUPLE(null, id, null), TOBAG( id, null, id,null ); describe R1; describe R2; describe R3; describe R4; store B into ':OUTPATH:.1'; store B1 into ':OUTPATH:.2'; store R1 into ':OUTPATH:.3'; store R2 into ':OUTPATH:.4'; store R3 into ':OUTPATH:.5'; store R4 into ':OUTPATH:.6'; ? }, { # TEST more TOTUPLE and TOBAG nested combinations 'num' => 8 ,'pig' => q? A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B = limit A 10; C = foreach B generate TOBAG( $0, $1, $2); T1= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) ); T2= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); T3= foreach B generate TOBAG( TOTUPLE( $1, $2, $3), TOTUPLE($4,$5), TOTUPLE($6,$7)); store B into ':OUTPATH:.1'; store C into ':OUTPATH:.2'; store T1 into ':OUTPATH:.3'; store T2 into ':OUTPATH:.4'; store T3 into ':OUTPATH:.5'; ? 
,'verify_pig_script' => q?register :FUNCPATH:/testudf.jar; A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B = limit A 10; C = foreach B generate TOBAG( $0, $1, $2); T1= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) ); T2= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); T3= foreach B generate org.apache.pig.test.udf.evalfunc.TOBAG2( TOTUPLE( $1, $2, $3), TOTUPLE($4,$5), TOTUPLE($6,$7)); store B into ':OUTPATH:.1'; store C into ':OUTPATH:.2'; store T1 into ':OUTPATH:.3'; store T2 into ':OUTPATH:.4'; store T3 into ':OUTPATH:.5'; ? }, { #TEST negative test case: out of bounds positional parameter # EVERYTHING IS CORRECT 'num' => 9 ,'ignore' => 1 # different error message for different version of hadoop ,'pig' => q? A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B = limit A 10; C = foreach B generate $0, $1, TOTUPLE($2, $998, $4), TOBAG($5, $6), $7; ? ,'expected_err_regex' => 'Out of bound access.*non-existent column: 998' }, { #TEST negative test case: out of bounds positional parameter # EVERYTHING IS CORRECT 'num' => 10 ,'ignore' => 1 # different error message for different version of hadoop ,'pig' => q? A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); B = limit A 10; C = foreach B generate $0, $1, TOBAG($5, $999), $7; ? 
,'expected_err_regex' => 'Out of bound access.*non-existent column: 999' }, ] # end of tests },{ 'name' => 'ToStuffSyntaxSugar', 'tests' => [ { #TEST TOTUPLE syntax sugar 'num' => 1, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = foreach A generate (name, age); store B into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = foreach A generate TOTUPLE(name, age); store B into ':OUTPATH:';\, }, { #TEST TOBAG syntax sugar 'num' => 2, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = foreach A generate {name, age}; store B into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = foreach A generate TOBAG(name, age); store B into ':OUTPATH:';\, }, { #TEST TOMAP syntax sugar 'num' => 3, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = foreach A generate [name, age]; store B into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = foreach A generate TOMAP(name, age); store B into ':OUTPATH:';\, }, { #TEST verify single element inside parenthesis does NOT call TOTUPLE 'num' => 4, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = foreach A generate (age) + 1; store B into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = foreach A generate (age + 1); store B into ':OUTPATH:';\, } ] # end of tests },{ 'name' => 'MergeOperator', 'tests' => [ { # Test Union using merge where schema is identical | A&B have 
identical schema 'num' => 1, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); C = union onschema A, B; store C into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); C = union A, B; store C into ':OUTPATH:';\, },{ # Test Union using merge with type promotions, int->long and float->double 'num' => 2, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa:double); C = union onschema A, B; store C into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); C = union A, B; D = foreach C generate name, (long)age, (double)gpa; store C into ':OUTPATH:';\, },{ # Test Union using merge with type promotions, int->float 'num' => 3, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float); C = union onschema A, B; store C into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float); B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float); C = union A, B; D = foreach C 
generate name, (float)age, gpa; store C into ':OUTPATH:';\, },{ # Test Union using merge with type promotions, int->double 'num' => 4, 'floatpostprocess' => 1, 'delimiter' => ' ', 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float); C = union onschema A, B; store C into ':OUTPATH:';\, 'verify_pig_script' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float); B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float); C = union A, B; D = foreach C generate name, (double)age, gpa; store C into ':OUTPATH:';\, },{ # Test Union of an intersection 'num' => 5, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float); C = union onschema A, B; store C into ':OUTPATH:';\, 'verify_pig_script' => q\ register :FUNCPATH:/testudf.jar; define Nil org.apache.pig.test.udf.evalfunc.Nil(); A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float); C = foreach A generate name, age, (chararray)gpa, Nil(), Nil(); D = foreach B generate name, age, Nil(), registration, (chararray)contributions; E = union C, D; store E into ':OUTPATH:';\, }, { # Test Union where the intersection is null 'num' => 6, 'pig' => q\ A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/textdoc' as (line:chararray); C = union onschema A, B; store C into ':OUTPATH:';\, 'verify_pig_script' => q\ register :FUNCPATH:/testudf.jar; 
define Nil org.apache.pig.test.udf.evalfunc.Nil(); A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); B = load ':INPATH:/singlefile/textdoc' as (line:chararray); C = foreach A generate name, (chararray)age, (chararray)gpa, Nil(name); D = foreach B generate Nil(line), Nil(line), Nil(line), line; E = union C, D; store E into ':OUTPATH:';\, }, { # Test Union using merge where schema is identical | A&B have identical schema 'num' => 7, 'pig' => q\ a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean); b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean); C = union onschema a, b; store C into ':OUTPATH:';\, 'verify_pig_script' => q\ a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); C = union a, b; store C into ':OUTPATH:';\, } ] }, { # Test Union using merge with Simple data types 'name' => 'UdfDistributedCache', 'tests' => [ { 'num' => 1, 'execonly' => 'mapred', # since distributed cache is not supported in local mode 'pig' => q? register :FUNCPATH:/testudf.jar; define udfdc org.apache.pig.test.udf.evalfunc.Udfcachetest(':INPATH:/singlefile/votertab10k#foodle'); a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = limit a 1; c = foreach b generate udfdc(age); STORE c into ':OUTPATH:';?, 'verify_pig_script' => q? 
a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = limit a 1; c = foreach b generate 'tom van buren', 68, 'socialist', 390.19; STORE c into ':OUTPATH:';?, }, ] }, { 'name' => 'MonitoredUDF', 'tests' => [ { 'num' => 1, 'ignore23' => 'guava version of Pig is higher than hadoop 23', 'pig' => q?register :FUNCPATH:/testudf.jar; define gm org.apache.pig.test.udf.evalfunc.GoodMonitored(); a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate gm(name); store b into ':OUTPATH:';?, 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = foreach a generate 'fred'; store b into ':OUTPATH:';?, },{ 'num' => 2, 'pig' => q?register :FUNCPATH:/testudf.jar; define bad org.apache.pig.test.udf.evalfunc.BadMonitored(); a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = limit a 1; c = foreach b generate bad(name); store b into ':OUTPATH:';?, 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = limit a 1; c = foreach b generate ''; store b into ':OUTPATH:';?, },{ 'num' => 3, 'pig' => q?register :FUNCPATH:/testudf.jar; define bad org.apache.pig.test.udf.evalfunc.BadMonitored(); a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = limit a 1; c = foreach b generate bad(name); store b into ':OUTPATH:';?, 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = limit a 1; c = foreach b generate 'barney'; store b into ':OUTPATH:';?, } ], },{ 'name' => 'MergeSparseJoin', 'tests' => [ # Simplest merge-sparse-join. 
{
                    # Test 1: both inputs are ordered by $0 and stored (the second
                    # one through piggybank's IndexedStorage); `exec` runs those
                    # stores before the intermediates are reloaded and joined with
                    # the 'merge-sparse' hint.  The verify script is the same join
                    # without the hint.
                    #
                    # `register` carries no trailing ';', so the line break after
                    # the jar path is what separates it from the next statement;
                    # every Pig statement is therefore kept on its own line.
                    'num' => 1,
                    'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
a = load ':INPATH:/singlefile/studenttab10k';
b = load ':INPATH:/singlefile/votertab10k';
c = order a by $0;
d = order b by $0;
store c into ':OUTPATH:.intermediate1';
store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
exec;
e = load ':OUTPATH:.intermediate1';
f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
g = join e by $0, f by $0 using 'merge-sparse';
store g into ':OUTPATH:';\,
                    'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
b = load ':INPATH:/singlefile/votertab10k';
g = join a by $0, b by $0;
store g into ':OUTPATH:';\,
                    'notmq' => 1,
                },
                # Merge-sparse-join with left-side filter
                {
                    # Same pipeline as test 1, but the left input (e) is filtered
                    # on $1 > 30 before the join.
                    'num' => 2,
                    'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
a = load ':INPATH:/singlefile/studenttab10k';
b = load ':INPATH:/singlefile/votertab10k';
c = order a by $0;
d = order b by $0;
store c into ':OUTPATH:.intermediate1';
store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
exec;
e = load ':OUTPATH:.intermediate1';
h = filter e by $1 > 30;
f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
g = join h by $0, f by $0 using 'merge-sparse';
store g into ':OUTPATH:';\,
                    'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
b = load ':INPATH:/singlefile/votertab10k';
h = filter a by $1 > 30;
g = join h by $0, b by $0;
store g into ':OUTPATH:';\,
                    'notmq' => 1,
                },
                # Merge-sparse-join with right-side filter
                {
                    # Same pipeline as test 1, but the indexed right input (f) is
                    # filtered on $2 != 'democrat' before the join.
                    'num' => 3,
                    'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
a = load ':INPATH:/singlefile/studenttab10k';
b = load ':INPATH:/singlefile/votertab10k';
c = order a by $0;
d = order b by $0;
store c into ':OUTPATH:.intermediate1';
store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
exec;
e = load ':OUTPATH:.intermediate1';
f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
i = filter f by $2 != 'democrat';
g = join e by $0, i by $0 using 'merge-sparse';
store g into ':OUTPATH:';\,
                    'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
b = load ':INPATH:/singlefile/votertab10k';
i = filter b by $2 != 'democrat';
g = join a by $0, i by $0;
store g into ':OUTPATH:';\,
                    'notmq' => 1,
                },
                # Merge-sparse-join with key as expression
                {
                    # Two-column key: inputs are ordered by $0,$1, the index is
                    # built with '0,1', and the join key is the tuple ($0,$1).
                    'num' => 4,
                    'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
a = load ':INPATH:/singlefile/studenttab10k';
b = load ':INPATH:/singlefile/votertab10k';
c = order a by $0,$1;
d = order b by $0,$1;
store c into ':OUTPATH:.intermediate1';
store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1');
exec;
e = load ':OUTPATH:.intermediate1';
f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1');
g = join e by ($0,$1), f by ($0,$1) using 'merge-sparse';
store g into ':OUTPATH:';\,
                    'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
b = load ':INPATH:/singlefile/votertab10k';
g = join a by ($0,$1), b by ($0,$1);
store g into ':OUTPATH:';\,
                    'notmq' => 1,
                },
                # Merge-sparse-join with nulls in keys and data.
{
                    # Same merge-sparse pipeline as the earlier MergeSparseJoin
                    # tests, run against the studentnulltab10k / voternulltab10k
                    # inputs.  `register` has no trailing ';', so it must stay on
                    # its own line inside the script.
                    'num' => 5,
                    'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
a = load ':INPATH:/singlefile/studentnulltab10k';
b = load ':INPATH:/singlefile/voternulltab10k';
c = order a by $0;
d = order b by $0;
store c into ':OUTPATH:.intermediate1';
store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
exec;
e = load ':OUTPATH:.intermediate1';
f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
g = join e by $0, f by $0 using 'merge-sparse';
store g into ':OUTPATH:';\,
                    'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
b = load ':INPATH:/singlefile/voternulltab10k';
g = join a by $0, b by $0;
store g into ':OUTPATH:';\,
                    'notmq' => 1,
                },
                # Merge-sparse-join with join on numeric key
                {
                    # Inputs are typed, ordered by age and joined by age.
                    # NOTE(review): the IndexedStorage index argument is '0' while
                    # the order/join key `age` is the second declared field --
                    # confirm this is intended.
                    'num' => 6,
                    'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
c = order a by age;
d = order b by age;
store c into ':OUTPATH:.intermediate1';
store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
exec;
e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0') as (name:chararray, age:int, reg:chararray, contrib:float);
g = join e by age, f by age using 'merge-sparse';
store g into ':OUTPATH:';\,
                    'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
g = join a by age, b by age;
store g into ':OUTPATH:';\,
                    'notmq' => 1,
                }
            ],
        },{
            # Regression tests; each one is tagged with a PIG-nnnn issue id.
            'name' => 'BugFix',
            'tests' => [
                {
                    # PIG-2286
                    # The verify script runs the same COR query after
                    # `set pig.exec.nocombiner true`.  `set` has no ';', so it
                    # must stay on its own line.
                    'num' => 1,
                    'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double, gpa:double);
B = group A all;
C = foreach B generate group, COR(A.age, A.gpa);
store C into ':OUTPATH:';?,
                    'verify_pig_script' => q?set pig.exec.nocombiner true
A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
B = group A all;
C = foreach B generate group, COR(A.age, A.gpa);
store C into ':OUTPATH:';?,
                },
                {
                    # PIG-2286, with 3 inputs to COR
                    'num' => 2,
                    'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
B = foreach A generate age, gpa, gpa*gpa as gpa2;
C = group B all;
D = foreach C generate group, COR(B.age, B.gpa, B.gpa2);
store D into ':OUTPATH:';?,
                    'verify_pig_script' => q?set pig.exec.nocombiner true
A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
B = foreach A generate age, gpa, gpa*gpa as gpa2;
C = group B all;
D = foreach C generate group, COR(B.age, B.gpa, B.gpa2);
store D into ':OUTPATH:';?,
                },
                {
                    # PIG-2385
                    # The script references Z1.avg directly inside a foreach over
                    # A; the verify script spells the same thing out with an
                    # explicit cross of A and Z1.
                    'num' => 3,
                    'pig_params' => ['-M'],
                    'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
Z = group A all;
Z1 = foreach Z generate AVG(A.gpa) as avg;
B = foreach A generate name, age, gpa-Z1.avg as diff;
STORE B INTO ':OUTPATH:.1';
C = DISTINCT B ;
store C into ':OUTPATH:.2';?,
                    'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
Z = group A all;
Z1 = foreach Z generate AVG(A.gpa) as avg;
B = cross A, Z1;
B1 = foreach B generate name, age, gpa-Z1.avg as diff;
STORE B1 INTO ':OUTPATH:.1';
C = DISTINCT B1 ;
store C into ':OUTPATH:.2';?,
                },
                {
                    # PIG-2576
                    # No verify script: the run must exit with rc 0 and its output
                    # is matched against the two regexes below.
                    'num' => 4,
                    'execonly' => 'mapred',
                    'pig' => q?register :FUNCPATH:/testudf.jar;
define printconf org.apache.pig.test.udf.evalfunc.UdfContextFrontend('dummy');
a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
b = limit a 1;
c = foreach b generate printconf(name);
store c into ':OUTPATH:';
fs -ls;
?,
                    'rc' => 0,
                    'not_expected_out_regex' => "checkJobConf: conf is null: false",
                    'expected_out_regex' => "checkJobConf: conf is null: true",
                }
            ],
        },{
            # BuildBloom/Bloom tests: a bloom filter is built from the rows with
            # name == 'alice allen', stored under :HDFSTMP:, and then used to
            # filter a fresh load.  The verify scripts apply the name filter
            # directly.
            'name' => 'Bloom',
            'execonly' => 'mapred', # distributed cache does not work in local mode
            'tests' => [
                {
                    'num' => 1,
                    'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', 'fixed', '128', '3');
A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
B = filter A by name == 'alice allen';
C = group B all;
D = foreach C generate bb(B.name);
store D into ':HDFSTMP:/mybloom_1';
exec;
define bloom Bloom(':HDFSTMP:/mybloom_1');
E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
F = filter E by bloom(name);
store F into ':OUTPATH:';",
                    'notmq' => 1,
                    'verify_pig_script' => "
A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
B = filter A by name == 'alice allen';
store B into ':OUTPATH:';",
                },
                {
                    # MURMUR_HASH variant; the bloom-filtered relation is then
                    # joined with votertab10k by name.
                    'num' => 2,
                    'pig' => "define bb BuildBloom('Hash.MURMUR_HASH', 'fixed', '128', '3');
A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
B = filter A by name == 'alice allen';
C = group B all;
D = foreach C generate bb(B.name);
store D into ':HDFSTMP:/mybloom_2';
exec;
define bloom Bloom(':HDFSTMP:/mybloom_2');
E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
F = filter E by bloom(name);
G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
H = join F by name, G by name;
store H into ':OUTPATH:';",
                    'notmq' => 1,
                    'verify_pig_script' => "
A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
B = filter A by name == 'alice allen';
C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
D = join B by name, C by name;
store D into ':OUTPATH:';",
                },{
                    # Three-argument BuildBloom form; the filtered relation is the
                    # right-hand input of a 'repl' join.
                    'num' => 3,
                    'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', '1', '0.0001');
A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
B = filter A by name == 'alice allen';
C = group B all;
D = foreach C generate bb(B.name);
store D into ':HDFSTMP:/mybloom_3';
exec;
define bloom Bloom(':HDFSTMP:/mybloom_3');
E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
F = filter E by bloom(name);
G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
H = join G by name, F by name using 'repl';
store H into ':OUTPATH:';",
                    'notmq' => 1,
                    'verify_pig_script' => "
A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
B = filter A by name == 'alice allen';
C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
D = join C by name, B by name;
store D into ':OUTPATH:';",
                }
            ],
        },{
            # Round trip through JsonStorage / JsonLoader.  `exec` is written
            # without ';' in these scripts, so it must stay alone on its line;
            # otherwise the following statement would be read as its argument.
            'name' => 'JsonLoaderStorage',
            'tests' => [
                {
                    'num' => 1,
                    'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
store A into ':OUTPATH:.intermediate' using JsonStorage();
exec
A = LOAD ':OUTPATH:.intermediate' using JsonLoader();
store A into ':OUTPATH:';?,
                    'notmq' => 1,
                    'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
store A into ':OUTPATH:';?,
                },
                {
                    # Two relations are round-tripped and then joined by name.
                    'num' => 2,
                    'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
store A into ':OUTPATH:.intermediate1' using JsonStorage();
B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double);
store B into ':OUTPATH:.intermediate2' using JsonStorage();
exec
A = LOAD ':OUTPATH:.intermediate1' using JsonLoader();
B = LOAD ':OUTPATH:.intermediate2' using JsonLoader();
C = JOIN A by name, B by name;
store C into ':OUTPATH:';?,
                    'notmq' => 1,
                    'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double);
C = JOIN A by name, B by name;
store C into ':OUTPATH:';?,
                },
                {
                    # Boolean column round trip; the verify script loads the same
                    # column as chararray.
                    'num' => 3,
                    'ignore' => 1, # PIG-2594
                    'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
store a into ':OUTPATH:.intermediate' using JsonStorage();
exec
B = LOAD ':OUTPATH:.intermediate' using JsonLoader();
store B into ':OUTPATH:';\,
                    'notmq' => 1,
                    'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
store a into ':OUTPATH:';\,
                }
            ],
        },{
            'name' => 'STRSPLIT',
            'tests' => [
                {
                    # See PIG-2311
                    # STRSPLIT is applied to an untyped field; the verify script
                    # casts the field to chararray first.
                    'num' => 1,
                    'pig' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
b= filter a by NOT (a0 is null);
c= foreach b generate STRSPLIT(a0);
store c into ':OUTPATH:';?,
                    'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
b= filter a by NOT (a0 is null);
b= foreach b generate (chararray)a0 as a0 ;
c= foreach b generate STRSPLIT(a0);
store c into ':OUTPATH:';?,
                }
            ],
        }, {
            # Plain alias-to-alias assignment (B = A).
            'name' => 'Realias',
            'tests' => [
                {
                    'num' => 1,
                    'pig' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k';
B = A;
store B into ':OUTPATH:';\,
                    'verify_pig_script' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k';
store A into ':OUTPATH:';\,
                }
            ]
        }, {
            # foreach statements nested inside a foreach block; the verify
            # scripts apply the same operators to the ungrouped relation.
            'name' => 'NestedForEach',
            'tests' => [
                {
                    'num' => 1,
                    'pig' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
B = group A by name;
C = foreach B {
    C1 = foreach A generate UPPER(name), age+1 as age, gpa;
    generate C1;
}
D = foreach C generate flatten(C1);
store D into ':OUTPATH:';\,
                    'verify_pig_script' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
B = foreach A generate UPPER(name), age+1, gpa;
store B into ':OUTPATH:';\,
                },
                {
                    # NOTE(review): the LOAD line below ends in ';;' -- presumably
                    # a harmless stray semicolon; left untouched, confirm.
                    'num' => 2,
                    'pig' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);;
B = group A by name;
C = foreach B {
    C1 = A.age;
    C2 = filter C1 by age>=30;
    C3 = foreach C2 generate age+1 as age;
    C4 = order C3 by age desc;
    generate C4;
}
D = foreach C generate flatten(C4);
store D into ':OUTPATH:';\,
                    'verify_pig_script' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
B = filter A by age>=30;
C = foreach B generate age+1 as age;
D = order C by age desc;
store D into ':OUTPATH:';\,
                }
            ]
        }, {
            # cross inside a foreach block over a cogroup; the verify scripts use
            # a JOIN by name instead.
            'name' => 'NestedCross',
            'tests' => [
                {
                    'num' => 1,
                    'pig' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
C = cogroup A by name, B by name;
D = foreach C {
    C1 = cross A, B;
    generate flatten(C1);
}
store D into ':OUTPATH:';\,
                    'verify_pig_script' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
C = JOIN A by name, B by name;
store C into ':OUTPATH:';\,
                },
                {
                    # Both bags are filtered inside the block before the cross.
                    'num' => 2,
                    'pig' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
C = cogroup A by name, B by name;
D = foreach C {
    C1 = filter A by gpa > 4;
    C2 = filter B by contributions > 500;
    C3 = cross C1, C2;
    C4 = foreach C3 generate CONCAT(CONCAT((chararray)gpa, '_'), (chararray)contributions);
    generate flatten(C4);
}
store D into ':OUTPATH:';\,
                    'verify_pig_script' => q\
A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
C = filter A by gpa > 4;
D = filter B by contributions > 500;
E = JOIN C by name, D by name;
F = foreach E generate CONCAT(CONCAT((chararray)gpa, '_'), (chararray)contributions);
store F into ':OUTPATH:';\,
                },
            ]
        }
    ],
},
;