###############################################################################
# Nightly tests for pig.
#
# Author:  Alan F. Gates (gates@)
# $Header:$
#
# Test definitions for Pig macros (define ... returns ..., import).
#
# Structure of $cfg:
#   driver - name of the test driver ('Pig').
#   groups - list of test groups; each has a 'name' and a list of 'tests'.
#
# Keys used by the tests in this file:
#   num                - test number within its group.
#   pig                - the Pig Latin script under test.
#   verify_pig_script  - macro-free script paired with 'pig'; presumably its
#                        output is compared with the output of 'pig'
#                        (confirm against the driver).
#   expected_out_regex - pattern paired with 'rc' for tests that only
#                        'describe' a relation.
#   expected_err_regex - pattern for tests whose script is expected to fail.
#   rc                 - expected return code.
#   ignore             - marks a test as skipped; the value is a JIRA URL or 1.
#   pig_params         - extra command-line arguments for Pig.
#   floatpostprocess,
#   delimiter          - output post-processing options; semantics are defined
#                        by the driver, not visible here.
#
# :INPATH:, :OUTPATH:, :FUNCPATH: and :SCRIPTHOMEPATH: are placeholders,
# presumably substituted by the harness before a script runs (TODO confirm).
#
# NOTE(review): 'delimiter' is reproduced below as a single space.  The inputs
# are named *tab10k, so confirm it was not meant to be a tab character.
#
# $cfg is intentionally assigned without 'my': the harness is assumed to read
# it as a package global after evaluating this file, so do not add
# 'use strict' or lexical scoping here without checking the loader.
#

#use Yahoo::Miners::Test::PigSetup;

#PigSetup::setup();

#my $me = `whoami`;
#chomp $me;

$cfg = {
    'driver' => 'Pig',

    'groups' => [
        {
            'name'  => 'Macro_DefinitionAndInline',
            'tests' => [
                {
                    # simple macro, no args
                    'num' => 1,
                    'pig' => q#define simple_macro() returns void {
                        a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = foreach a generate age, name;
                        store b into ':OUTPATH:';
                    }

                    simple_macro();#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = foreach a generate age, name;
                        store b into ':OUTPATH:';#,
                },
                {
                    # input args, no return
                    'num' => 2,
                    'pig' => q#define simple_macro(loadfile) returns void {
                        a = load '$loadfile' as (name, age, gpa);
                        b = foreach a generate age, name;
                        store b into ':OUTPATH:';
                    }

                    simple_macro(':INPATH:/singlefile/studenttab10k');#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = foreach a generate age, name;
                        store b into ':OUTPATH:';#,
                },
                {
                    # input args, return value
                    'num' => 3,
                    'pig' => q#define simple_macro(loadfile) returns b {
                        a = load '$loadfile' as (name, age, gpa);
                        $b = foreach a generate age, name;
                    }

                    x = simple_macro(':INPATH:/singlefile/studenttab10k');
                    store x into ':OUTPATH:';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = foreach a generate age, name;
                        store b into ':OUTPATH:';#,
                },
                {
                    # input args, filter on double and int, return value
                    'num' => 4,
                    'pig' => q#define simple_macro(in_relation, min_gpa, max_age) returns c {
                        b = filter $in_relation by gpa >= $min_gpa and age <= $max_age;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = filter a by gpa >= 3.0 and age <= 40;
                        c = foreach b generate age, name;
                        store c into ':OUTPATH:';#,
                },
                {
                    #Definition multiple input, no output, multiple returns value
                    #x = with multiple input, no output, multiple returns value
                    #Query based on FilterEq from nightly.conf
                    'num' => 5,
                    'pig' => q\define test (in1, in2) returns r1, r2 {
                        a = load '$in1' using PigStorage() as (name, age, gpa);
                        $r1 = filter a by (age >= 50 or name > 'fred') and (gpa <= 3.0 or name >= 'bob');
                        b = load '$in2' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
                        $r2 = filter b by name matches 'f.ed' and (chararray)registration matches 'd.m';
                    }

                    x1, x2 = test(':INPATH:/singlefile/studenttab10k', ':INPATH:/singlefile/votertab10k');
                    store x1 into ':OUTPATH:.1' using PigStorage;
                    store x2 into ':OUTPATH:.2' using PigStorage;\,
                    'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        a1 = filter a by (age >= 50 or name > 'fred') and (gpa <= 3.0 or name >= 'bob');
                        store a1 into ':OUTPATH:.1' using PigStorage;
                        b = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
                        b2 = filter b by name matches 'f.ed' and (chararray)registration matches 'd.m';
                        store b2 into ':OUTPATH:.2' using PigStorage;\,
                    'floatpostprocess' => 1,
                    'delimiter' => ' ',
                },
                {
                    # use positional parameters inside macro
                    'num' => 6,
                    'pig' => q#define simple_macro(in_relation, min_gpa, max_age) returns c {
                        b = filter $in_relation by $2 >= $min_gpa and $1 <= $max_age;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = filter a by gpa >= 3.0 and age <= 40;
                        c = foreach b generate age, name;
                        store c into ':OUTPATH:';#,
                },
                {
                    # Test nested macros
                    'num' => 7,
                    'pig' => q\define sum_it(in, relation, scol) returns d {
                        $d = foreach $in generate group, SUM($relation.$scol);
                    }

                    define group_it(in_relation, group_key, sum_col) returns c {
                        b = group $in_relation by $group_key ;
                        $c = sum_it(b, $in_relation, $sum_col);
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = group_it(a, 'name', 'age');
                    store x into ':OUTPATH:';\,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = group a by name;
                        c = foreach b generate group, SUM(a.age);
                        store c into ':OUTPATH:';#,
                },
                {
                    # single macro definition invoked multiple times
                    # NOTE(review): alias 'z' is loaded but never used; the
                    # second call passes 'a' again.  Both load the same file,
                    # so the stored rows should match the verify script either
                    # way -- confirm whether 'z' was the intended argument.
                    'num' => 8,
                    'pig' => q#define simple_macro(in_relation, min_gpa, max_age) returns c {
                        b = filter $in_relation by gpa >= $min_gpa and age <= $max_age;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:.1';

                    z = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    y = simple_macro(a, '2.0', '50');
                    store y into ':OUTPATH:.2';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = filter a by gpa >= 3.0 and age <= 40;
                        c = foreach b generate age, name;
                        store c into ':OUTPATH:.1';
                        d = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        e = filter d by gpa >= 2.0 and age <= 50;
                        f = foreach e generate age, name;
                        store f into ':OUTPATH:.2';#,
                },
                {
                    # macro arg used as function arg
                    'num' => 9,
                    'pig' => q#define simple_macro(loadfile, sep) returns b {
                        a = load '$loadfile' using PigStorage('$sep') as (name, age, gpa);
                        $b = foreach a generate age, name;
                    }

                    x = simple_macro(':INPATH:/singlefile/studentcolon10k', ':');
                    store x into ':OUTPATH:';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa);
                        b = foreach a generate age, name;
                        store b into ':OUTPATH:';#,
                },
                {
                    # Multiple returns via split in the data flow
                    'num' => 10,
                    'pig' => q\define test (in1) returns r1, r2 {
                        a = load '$in1' using PigStorage() as (name, age, gpa);
                        $r1 = filter a by (age >= 50 or name > 'fred') and (gpa <= 3.0 or name >= 'bob');
                        $r2 = filter a by name > 'fred';
                    }

                    x1, x2 = test(':INPATH:/singlefile/studenttab10k');
                    store x1 into ':OUTPATH:.1' using PigStorage;
                    store x2 into ':OUTPATH:.2' using PigStorage;\,
                    'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        a1 = filter a by (age >= 50 or name > 'fred') and (gpa <= 3.0 or name >= 'bob');
                        store a1 into ':OUTPATH:.1' using PigStorage;
                        a2 = filter a by name > 'fred';
                        store a2 into ':OUTPATH:.2' using PigStorage;\,
                    'floatpostprocess' => 1,
                    'delimiter' => ' ',
                },
                {
                    # parameter substitution at the top level
                    'num' => 11,
                    'pig_params' => ['-p', qq(loadfile='singlefile/studenttab10k')],
                    'pig' => q#define simple_macro(in_relation, min_gpa, max_age) returns c {
                        b = filter $in_relation by gpa >= $min_gpa and age <= $max_age;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/$loadfile' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = filter a by gpa >= 3.0 and age <= 40;
                        c = foreach b generate age, name;
                        store c into ':OUTPATH:';#,
                }
            ]
        },
        {
            'name'  => 'Macro_Scope',
            'tests' => [
                {
                    # re-use of variable in macro and global scope
                    'num' => 1,
                    'pig' => q#define simple_macro(in_relation, min_gpa, max_age) returns b {
                        $b = filter $in_relation by $2 >= $min_gpa and $1 <= $max_age;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    b = foreach x generate age, name;
                    store b into ':OUTPATH:';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = filter a by gpa >= 3.0 and age <= 40;
                        c = foreach b generate age, name;
                        store c into ':OUTPATH:';#,
                },
                {
                    #Definition where there is a name collision between parameters in parent/child macro
                    'num' => 2,
                    'pig' => q\define sum_it(in_relation, relation, sum_col) returns c {
                        b = foreach $in_relation generate group, SUM($relation.$sum_col);
                        $c = order b by $1;
                    }

                    define group_it(in_relation, group_key, sum_col) returns c {
                        b = group $in_relation by $group_key ;
                        $c = sum_it(b, $in_relation, $sum_col);
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = group_it(a, name, age);
                    store x into ':OUTPATH:';\,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = group a by name;
                        c = foreach b generate group, SUM(a.age);
                        store c into ':OUTPATH:';#,
                },
                {
                    #Definition where there is a name collision between macro and returns value in main pig script
                    'num' => 3,
                    'pig' => q#define simple_macro(in_relation, min_gpa, max_age) returns c {
                        b = filter $in_relation by gpa >= $min_gpa and age <= $max_age;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    c = simple_macro(a, '3.0', '40');
                    store c into ':OUTPATH:';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = filter a by gpa >= 3.0 and age <= 40;
                        c = foreach b generate age, name;
                        store c into ':OUTPATH:';#,
                }
            ]
        },
        {
            'name'  => 'Macro_Schema',
            'tests' => [
                {
                    # macro that does not change the schema
                    'num' => 1,
                    'pig' => q\define test(in) returns a {
                        $a = filter $in by age > 30;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
                    b = test(a);
                    describe b;\,
                    'rc' => 0,
                    # Braces are escaped so they are unambiguously literal
                    # when this string is used as a pattern.
                    'expected_out_regex' => "b: \\{name: chararray,age: int,gpa: double\\}"
                },
                {
                    # macro that does change the schema
                    'num' => 2,
                    'pig' => q\define test(in) returns a {
                        $a = foreach $in generate name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
                    b = test(a);
                    describe b;\,
                    'rc' => 0,
                    # Braces escaped for the same reason as in test 1.
                    'expected_out_regex' => "b: \\{name: chararray\\}"
                }
            ]
        },
        {
            'name'  => 'Macro_Misc',
            'tests' => [
                {
                    #Comments in macro
                    # The Pig '--' comment runs to end of line, so the closing
                    # brace of the macro must stay on the following line.
                    'num' => 1,
                    'pig' => q#define simple_macro(in_relation, min_gpa, max_age) returns c {
                        b = filter $in_relation by gpa >= $min_gpa and age <= $max_age;
                        $c = foreach b generate age, name;
                        -- add a comment
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = filter a by gpa >= 3.0 and age <= 40;
                        c = foreach b generate age, name;
                        store c into ':OUTPATH:';#,
                },
                {
                    #register
                    'num' => 2,
                    'pig' => q\define test (in) returns b {
                        $b = foreach $in generate name, org.apache.pig.test.udf.evalfunc.Fred() as fred;
                    }

                    register :FUNCPATH:/testudf.jar;
                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = test(a);
                    store x into ':OUTPATH:';\,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        register :FUNCPATH:/testudf.jar;
                        b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Fred() as fred;
                        store b into ':OUTPATH:';#,
                },
                {
                    #define for streaming combines with define for macros
                    'num' => 3,
                    'pig' => q#define CMD `perl -ne 'print $_;'`;
                    define test(in) returns B {
                        $B = stream $in through CMD as (name, age, gpa);
                    }

                    A = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = test(A);
                    store x into ':OUTPATH:';#,
                    'verify_pig_script' => q#A = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        define CMD `perl -ne 'print $_;'`;
                        B = stream A through CMD as (name, age, gpa);
                        store B into ':OUTPATH:';#,
                    'floatpostprocess' => 1,
                    'delimiter' => ' '
                }
            ]
        },
        {
            'name'  => 'Macro_Import',
            'tests' => [
                {
                    'num' => 1,
                    'pig' => q#import ':SCRIPTHOMEPATH:/macro1.pig';
                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'verify_pig_script' => q#a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                        b = filter a by gpa >= 3.0 and age <= 40;
                        c = foreach b generate age, name;
                        store c into ':OUTPATH:';#,
                }
            ]
        },
        {
            'name'  => 'Macro_Error',
            'tests' => [
                {
                    # parameter names repeated
                    'num' => 1,
                    'ignore' => 'https://issues.apache.org/jira/browse/PIG-2247',
                    'pig' => q#define simple_macro(in_relation, min_gpa, min_gpa) returns c {
                        b = filter $in_relation by gpa >= $min_gpa;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "Multiple arguments min_gpa found"
                },
                {
                    # undefined parameter in macro
                    'num' => 2,
                    'pig' => q#define simple_macro(in_relation, min_gpa) returns c {
                        b = filter $in_relation by gpa >= $min_gpa and age <= $max_age;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "Macro inline failed for macro 'simple_macro'. Reason: Undefined parameter : max_age"
                },
                {
                    # name collision between arg and return value
                    'num' => 3,
                    'pig' => q#define simple_macro(in_relation, min_gpa, c) returns c {
                        b = filter $in_relation by gpa >= $min_gpa and age <= $c;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "Multiple values found for c"
                },
                {
                    # keyword as macro name
                    'num' => 4,
                    'pig' => q#define foreach(in_relation, min_gpa) returns c {
                        b = filter $in_relation by gpa >= $min_gpa;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "mismatched input 'foreach' expecting IDENTIFIER_L"
                },
                {
                    # UDF as macro name
                    'num' => 5,
                    'ignore' => 'https://issues.apache.org/jira/browse/PIG-2248',
                    'pig' => q#define COUNT(in_relation, min_gpa) returns c {
                        b = filter $in_relation by gpa >= $min_gpa;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = COUNT(a, '3.0');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "macro name hides UDF COUNT"
                },
                {
                    # redefine a macro
                    'num' => 6,
                    'pig' => q#define simple_macro(in_relation, min_gpa) returns c {
                        b = filter $in_relation by gpa >= $min_gpa;
                        $c = foreach b generate age, name;
                    }

                    define simple_macro(in, min_age) returns d {
                        b = filter $in by age >= $min_age;
                        $d = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "Duplicated macro name 'simple_macro'"
                },
                {
                    # invoke non-existent macro
                    'num' => 7,
                    'pig' => q#
                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = nosuch_macro('a', '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "Cannot expand macro 'nosuch_macro'. Reason: Macro must be defined before expansion."
                },
                {
                    # Specifies two returns, but only actually returns one
                    'num' => 8,
                    'pig' => q#define simple(in_relation, min_gpa) returns c,d {
                        b = filter $in_relation by gpa >= $min_gpa;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x, y = simple(a, '3.0');
                    store x into ':OUTPATH:.1';
                    store y into ':OUTPATH:.2';#,
                    'expected_err_regex' => "Invalid macro definition: . Reason: Macro 'simple' missing return alias: d"
                },
                {
                    # syntax error in a macro, check for correct line number
                    # The misspelled 'fiter' must stay on the second line of
                    # the script: the expected error names line 2.
                    'num' => 9,
                    'pig' => q#define simple(in_relation, min_gpa) returns c {
                        b = fiter $in_relation by gpa >= $min_gpa;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple(a, '3.0');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "line 2"
                },
                {
                    # too many args passed to macro
                    'num' => 10,
                    'pig' => q#define simple_macro(in_relation, min_gpa) returns c {
                        b = filter $in_relation by gpa >= $min_gpa;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "Failed to expand macro 'simple_macro'. Reason: Expected number of parameters: 2 actual number of inputs: 3"
                },
                {
                    # return two values, but script only accepts 1
                    'num' => 11,
                    'pig' => q#define simple(in_relation, min_gpa) returns c,d {
                        b = filter $in_relation by gpa >= $min_gpa;
                        $c = foreach b generate age, name;
                        $d = foreach b generate name, age;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple(a, '3.0');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "Failed to expand macro 'simple'. Reason: Expected number of return aliases: 2 actual number of return values: 1"
                },
                {
                    # return 1 value, but script expects 2
                    'num' => 12,
                    'pig' => q#define simple(in_relation, min_gpa) returns c {
                        b = filter $in_relation by gpa >= $min_gpa;
                        $c = foreach b generate age, name;
                    }

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x, y = simple(a, '3.0');
                    store x into ':OUTPATH:.1';
                    store y into ':OUTPATH:.2';#,
                    'expected_err_regex' => "Failed to expand macro 'simple'. Reason: Expected number of return aliases: 1 actual number of return values: 2"
                }
            ]
        },
        {
            'name'  => 'Macro_Import_Error',
            'tests' => [
                {
                    # import non-existent file
                    'num' => 1,
                    'ignore' => 1, # different error message for different version of hadoop
                    'pig' => q#import 'nosuchfile';

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro('a', '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "Failed to import file 'nosuchfile'. Reason: Can't find the Specified file nosuchfile"
                },
                {
                    # import a macro with a syntax error
                    'num' => 2,
                    'pig' => q#import ':SCRIPTHOMEPATH:/macro_bad1.pig';

                    a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
                    x = simple_macro(a, '3.0', '40');
                    store x into ':OUTPATH:';#,
                    'expected_err_regex' => "Invalid macro definition"
                }
            ]
        }
    ],
};

# import non-existent file, import script with error