/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Parser file for Pig Parser
 *
 * NOTE: THIS FILE IS THE BASE FOR A FEW TREE PARSER FILES, such as AstValidator.g,
 *       SO IF YOU CHANGE THIS FILE, YOU WILL PROBABLY NEED TO MAKE CORRESPONDING CHANGES TO
 *       THOSE FILES AS WELL.
 */

parser grammar QueryParser;

options {
    tokenVocab=QueryLexer;
    output=AST;
    backtrack=false; // greatly slows down parsing!
}

// Imaginary tokens used as AST node types by the rewrite rules below.
tokens {
    QUERY;
    STATEMENT;
    FUNC;
    FUNC_REF;
    FUNC_EVAL;
    INVOKE;
    INVOKER_FUNC_EVAL;
    IN_LHS;
    IN_RHS;
    CASE_COND;
    CASE_EXPR;
    CASE_EXPR_LHS;
    CASE_EXPR_RHS;
    CAST_EXPR;
    COL_RANGE;
    BIN_EXPR;
    TUPLE_VAL;
    MAP_VAL;
    BAG_VAL;
    KEY_VAL_PAIR;
    FIELD_DEF;
    FIELD_DEF_WITHOUT_IDENTIFIER;
    NESTED_CMD_ASSI;
    NESTED_CMD;
    NESTED_PROJ;
    SPLIT_BRANCH;
    FOREACH_PLAN_SIMPLE;
    FOREACH_PLAN_COMPLEX;
    MAP_TYPE;
    TUPLE_TYPE;
    BAG_TYPE;
    NEG;
    EXPR_IN_PAREN;
    JOIN_ITEM;
    TUPLE_TYPE_CAST;
    BAG_TYPE_CAST;
    PARAMS;
    RETURN_VAL;
    MACRO_DEF;
    MACRO_BODY;
    MACRO_INLINE;
    NULL;
    TRUE;
    FALSE;
    IDENTIFIER;
    ANY;
    TOBAG;
    TOMAP;
    TOTUPLE;
    FAT_ARROW;
}

@header {
package org.apache.pig.parser;

import java.util.Set;
import java.util.HashSet;
import java.util.Collections;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.parser.PigMacro;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.base.Joiner;
}

@members {
private static final Log log = LogFactory.getLog( QueryParser.class );

// NOTE(review): not referenced anywhere in this grammar file - confirm before removing.
private Set memory = new HashSet();

// Make a deep copy of the given node
private static Tree deepCopy(Tree tree) {
    Tree copy = tree.dupNode();
    for (int i = 0; i < tree.getChildCount(); i++) {
        Tree child = deepCopy(tree.getChild(i));
        child.setParent(copy);
        copy.addChild(child);
    }
    return copy;
}

// Never attempts recovery: a mismatched token is always reported by throwing.
@Override
protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow)
throws RecognitionException {
    throw new MismatchedTokenException( ttype, input );
}

// Never attempts recovery: the incoming exception is rethrown unchanged.
@Override
public Object recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow)
throws RecognitionException {
    throw e;
}

// Builds the error text. With debug logging off, a NoViableAltException gets a short
// message and everything else uses the inherited message. With debug logging on, the
// rule invocation stack is prepended and NoViableAltException details are spelled out.
@Override
public String getErrorMessage(RecognitionException e, String[] tokenNames ) {
    if( !log.isDebugEnabled() ) {
        if( e instanceof NoViableAltException ) {
            return "Syntax error, unexpected symbol at or near " + getTokenErrorDisplay( e.token );
        } else {
            return super.getErrorMessage( e, tokenNames );
        }
    }

    List stack = getRuleInvocationStack( e, this.getClass().getName() );
    String msg = null;
    if( e instanceof NoViableAltException ) {
        NoViableAltException nvae = (NoViableAltException)e;
        msg = " no viable alt; token = " + e.token + " (decision=" + nvae.decisionNumber + " state " + nvae.stateNumber + ")" +
            " decision=<<" + nvae.grammarDecisionDescription + ">>";
    } else {
        msg = super.getErrorMessage( e, tokenNames );
    }
    return stack + " " + msg;
}

// Displays a token as its text wrapped in single quotes.
@Override
public String getTokenErrorDisplay(Token t) {
    return "'" + t.getText() + "'";
}

// Delegates the error header to QueryParserUtils, passing this parser's source name.
@Override
public String getErrorHeader(RecognitionException ex) {
    return QueryParserUtils.generateErrorHeader( ex, this.getSourceName() );
}

// Maps the TOBAG / TOMAP / TOTUPLE function tokens to the matching literal node type.
// The type arguments are required: cast_expr assigns the looked-up value to an Integer.
private static final Map<Integer, Integer> FUNC_TO_LITERAL = ImmutableMap.of(
    TOBAG, BAG_VAL,
    TOMAP, MAP_VAL,
    TOTUPLE, TUPLE_VAL);

// Token types treated as the root of a boolean expression by unary_cond and paren_expr.
private static final Set<Integer> BOOLEAN_TOKENS = ImmutableSet.of(
    STR_OP_EQ,
    STR_OP_NE,
    STR_OP_GT,
    STR_OP_LT,
    STR_OP_GTE,
    STR_OP_LTE,
    STR_OP_MATCHES,
    AND,
    OR,
    NOT,
    NULL,
    NUM_OP_EQ,
    NUM_OP_NE,
    NUM_OP_GT,
    NUM_OP_GTE,
    NUM_OP_LT,
    NUM_OP_LTE);

// Token types treated as literals by the cast_expr post-processing.
private static final Set<Integer> LITERAL_TOKENS = ImmutableSet.of(
    INTEGER,
    LONGINTEGER,
    FLOATNUMBER,
    DOUBLENUMBER,
    QUOTEDSTRING,
    NULL,
    TRUE,
    FALSE,
    MAP_VAL,
    BAG_VAL,
    TUPLE_VAL,
    PERIOD,
    POUND);

} // End of @members

// Every rule rethrows recognition errors instead of recovering inside the rule.
@rulecatch {
catch(RecognitionException re) {
    throw re;
}
}

// Entry point: zero or more statements up to EOF, collected under a QUERY root.
query : statement* EOF -> ^( QUERY statement* )
;

// Entry point for a stand-alone field definition list.
schema: field_def_list EOF
;

// STATEMENTS

statement : SEMI_COLON!
          | general_statement SEMI_COLON!
          | split_clause SEMI_COLON!
          | inline_clause SEMI_COLON!
          | import_clause SEMI_COLON!
          | realias_clause SEMI_COLON!
          | register_clause SEMI_COLON!
          | assert_clause SEMI_COLON!
          // semicolons after foreach_complex_statement are optional for backwards compatibility, but to keep
          // the grammar unambiguous if there is one then we'll parse it as a single, standalone semicolon
          // (which matches the first statement rule)
          | foreach_statement
;

nested_op_clause : LEFT_PAREN! op_clause parallel_clause? RIGHT_PAREN!
                 | LEFT_PAREN FOREACH rel ( foreach_plan_complex | ( foreach_plan_simple parallel_clause? ) ) RIGHT_PAREN
                    -> ^( FOREACH rel foreach_plan_complex? foreach_plan_simple? ) parallel_clause?
;

general_statement : FAT_ARROW ( ( op_clause parallel_clause? ) | nested_op_clause )
                    -> ^( STATEMENT IDENTIFIER["____RESERVED____"] op_clause? parallel_clause? nested_op_clause? )
                  | ( identifier_plus EQUAL )? ( ( op_clause parallel_clause? ) | nested_op_clause )
                    -> ^( STATEMENT identifier_plus? op_clause? parallel_clause? nested_op_clause? )
;

// Statement represented by a foreach operator with a nested block. Simple foreach statement
// is covered by general_statement.
// We need to handle foreach specifically because of the ending ';', which is not required
// if there is a nested block. This is ugly, but it gets the job done.
foreach_statement : FAT_ARROW FOREACH rel ( foreach_plan_complex | ( foreach_plan_simple parallel_clause? SEMI_COLON ) )
                    -> ^( STATEMENT IDENTIFIER["____RESERVED____"] ^( FOREACH rel foreach_plan_complex? foreach_plan_simple? ) parallel_clause? )
                  | ( identifier_plus EQUAL )? FOREACH rel ( foreach_plan_complex | ( foreach_plan_simple parallel_clause? SEMI_COLON ) )
                    -> ^( STATEMENT identifier_plus? ^( FOREACH rel foreach_plan_complex? foreach_plan_simple? ) parallel_clause? )
;

foreach_plan_complex : LEFT_CURLY nested_blk RIGHT_CURLY -> ^( FOREACH_PLAN_COMPLEX nested_blk )
;

foreach_plan_simple : GENERATE flatten_generated_item ( COMMA flatten_generated_item )*
                      -> ^( FOREACH_PLAN_SIMPLE ^( GENERATE flatten_generated_item+ ) )
;

// MACRO grammar

// Matches a balanced curly-brace block without interpreting the tokens inside it.
macro_content : LEFT_CURLY ( macro_content | ~(LEFT_CURLY | RIGHT_CURLY) )* RIGHT_CURLY
;

macro_param_clause : LEFT_PAREN ( identifier_plus (COMMA identifier_plus)* )? RIGHT_PAREN
                     -> ^(PARAMS identifier_plus*)
;

macro_return_clause : RETURNS ((identifier_plus (COMMA identifier_plus)*) | VOID)
                      -> ^(RETURN_VAL identifier_plus*)
;

// The macro body is kept as the raw matched text inside a single PigParserNode.
macro_body_clause : macro_content
                    -> ^(MACRO_BODY { new PigParserNode(new CommonToken(1, $macro_content.text), this.getSourceName(), $macro_content.start) } )
;

macro_clause : macro_param_clause macro_return_clause macro_body_clause
               -> ^(MACRO_DEF macro_param_clause macro_return_clause macro_body_clause)
;

inline_return_clause : identifier_plus EQUAL -> ^(RETURN_VAL identifier_plus)
                     | identifier_plus (COMMA identifier_plus)+ EQUAL -> ^(RETURN_VAL identifier_plus+)
                     | -> ^(RETURN_VAL)
;

parameter : IDENTIFIER
          | INTEGER
          | DOUBLENUMBER
          | BIGDECIMALNUMBER
          | BIGINTEGERNUMBER
          | QUOTEDSTRING
          | DOLLARVAR
;

inline_param_clause : LEFT_PAREN ( parameter (COMMA parameter)* )? RIGHT_PAREN
                      -> ^(PARAMS parameter*)
;

inline_clause : inline_return_clause identifier_plus inline_param_clause
                -> ^(MACRO_INLINE identifier_plus inline_return_clause inline_param_clause)
;

// TYPES

simple_type : BOOLEAN
            | INT
            | LONG
            | FLOAT
            | DOUBLE
            | DATETIME
            | BIGINTEGER
            | BIGDECIMAL
            | CHARARRAY
            | BYTEARRAY
;

implicit_tuple_type : LEFT_PAREN field_def_list? RIGHT_PAREN -> ^( TUPLE_TYPE field_def_list? )
;

explicit_tuple_type : TUPLE! implicit_tuple_type
;

explicit_tuple_type_cast : TUPLE LEFT_PAREN ( explicit_type_cast ( COMMA explicit_type_cast )* )? RIGHT_PAREN
                           -> ^( TUPLE_TYPE_CAST explicit_type_cast* )
;

tuple_type : implicit_tuple_type
           | explicit_tuple_type
;

implicit_bag_type : LEFT_CURLY NULL COLON tuple_type? RIGHT_CURLY -> ^( BAG_TYPE tuple_type? )
                  | LEFT_CURLY ( ( identifier_plus COLON )? tuple_type )? RIGHT_CURLY -> ^( BAG_TYPE identifier_plus? tuple_type? )
;

explicit_bag_type : BAG! implicit_bag_type
;

explicit_bag_type_cast : BAG LEFT_CURLY explicit_tuple_type_cast? RIGHT_CURLY -> ^( BAG_TYPE_CAST explicit_tuple_type_cast? )
;

implicit_map_type : LEFT_BRACKET ( ( identifier_plus COLON )? type )? RIGHT_BRACKET -> ^( MAP_TYPE identifier_plus? type? )
;

explicit_map_type : MAP! implicit_map_type
;

map_type : implicit_map_type
         | explicit_map_type
;

explicit_type : simple_type
              | explicit_tuple_type
              | explicit_bag_type
              | explicit_map_type
;

implicit_type : implicit_tuple_type
              | implicit_bag_type
              | implicit_map_type
;

type : explicit_type
     | implicit_type
;

explicit_type_cast : simple_type
                   | explicit_map_type
                   | explicit_tuple_type_cast
                   | explicit_bag_type_cast
;

// CLAUSES

import_clause : IMPORT^ QUOTEDSTRING
;

register_clause : REGISTER^ QUOTEDSTRING (USING identifier_plus AS identifier_plus)?
;

define_clause : DEFINE^ IDENTIFIER ( cmd | func_clause | macro_clause)
;

realias_clause : identifier_plus EQUAL identifier_plus -> ^(REALIAS identifier_plus identifier_plus)
;

parallel_clause : PARALLEL^ INTEGER
;

op_clause : define_clause
          | load_clause
          | group_clause
          | cube_clause
          | store_clause
          | filter_clause
          | distinct_clause
          | limit_clause
          | sample_clause
          | order_clause
          | rank_clause
          | cross_clause
          | join_clause
          | union_clause
          | stream_clause
          | mr_clause
;

ship_clause : SHIP^ LEFT_PAREN! path_list? RIGHT_PAREN!
;

path_list : QUOTEDSTRING ( COMMA QUOTEDSTRING )* -> QUOTEDSTRING+
;

cache_clause : CACHE^ LEFT_PAREN! path_list RIGHT_PAREN!
;

input_clause : INPUT^ LEFT_PAREN! stream_cmd_list RIGHT_PAREN!
;

output_clause : OUTPUT^ LEFT_PAREN! stream_cmd_list RIGHT_PAREN!
;

error_clause : STDERROR^ LEFT_PAREN! ( QUOTEDSTRING ( LIMIT! INTEGER )? )? RIGHT_PAREN!
;

load_clause : LOAD^ QUOTEDSTRING ( USING! func_clause )? as_clause?
;

func_clause : func_name -> ^( FUNC_REF func_name )
            | func_name LEFT_PAREN func_args? RIGHT_PAREN -> ^( FUNC func_name func_args? )
;

// needed for disambiguation when parsing expressions...see below
func_name_without_columns : eid_without_columns ( ( PERIOD | DOLLAR ) eid )*
;

func_name : eid ( ( PERIOD | DOLLAR ) eid )*
;

func_args_string : QUOTEDSTRING
                 | MULTILINE_QUOTEDSTRING
;

func_args : func_args_string ( COMMA func_args_string )* -> func_args_string+
;

group_clause : ( GROUP | COGROUP )^ group_item_list ( USING! QUOTEDSTRING )? partition_clause?
;

group_item_list : group_item ( COMMA group_item )* -> group_item+
;

group_item : rel ( join_group_by_clause | ALL | ANY ) ( INNER | OUTER )?
;

// "AS" CLAUSES

// An IDENTIFIER, or a whitelisted reserved word re-typed as an IDENTIFIER with the same text.
identifier_plus : IDENTIFIER
                | reserved_identifier_whitelist -> IDENTIFIER[$reserved_identifier_whitelist.text]
;

explicit_field_def : identifier_plus ( COLON type )? -> ^( FIELD_DEF identifier_plus type? )
                   | explicit_type -> ^( FIELD_DEF_WITHOUT_IDENTIFIER explicit_type )
;

field_def : explicit_field_def
          | implicit_type -> ^( FIELD_DEF_WITHOUT_IDENTIFIER implicit_type )
;

field_def_list : field_def ( COMMA! field_def )*
;

// we have two tuple types as implicit_tuple_types can be confused with parentheses around
// a field_def - so to remove this ambiguity we'll decide brackets around a single field_def
// type is *not* a tuple
as_clause : AS^ ( explicit_field_def | ( LEFT_PAREN! field_def_list? RIGHT_PAREN! ) )
;

// OTHERS

stream_cmd_list : stream_cmd ( COMMA stream_cmd )* -> stream_cmd+
;

stream_cmd : ( STDIN | STDOUT | QUOTEDSTRING )^ ( USING! func_clause )?
;

cmd : EXECCOMMAND^ ( ship_clause | cache_clause | input_clause | output_clause | error_clause )*
;

rel : identifier_plus
    | previous_rel
    | nested_op_clause
;

previous_rel : ARROBA
;

store_clause : STORE^ rel INTO! QUOTEDSTRING ( USING! func_clause )?
;

assert_clause : ASSERT^ rel BY! cond ( COMMA! QUOTEDSTRING )?
;

filter_clause : FILTER^ rel BY! cond
;

stream_clause : STREAM^ rel THROUGH! ( EXECCOMMAND | identifier_plus ) as_clause?
;

mr_clause : MAPREDUCE^ QUOTEDSTRING ( LEFT_PAREN! path_list RIGHT_PAREN! )? store_clause load_clause EXECCOMMAND?
;

split_clause : SPLIT^ rel INTO! split_branch split_branches
;

split_branch : identifier_plus IF cond -> ^( SPLIT_BRANCH identifier_plus cond )
;

split_otherwise : identifier_plus OTHERWISE ALL? -> ^( OTHERWISE identifier_plus ALL? )
;

split_branches : COMMA! split_branch split_branches?
               | COMMA! split_otherwise
;

limit_clause : LIMIT^ rel expr
;

sample_clause : SAMPLE^ rel expr
;

rank_clause : RANK^ rel ( rank_by_statement )?
;

rank_by_statement : BY^ rank_by_clause DENSE?
;

rank_by_clause : STAR ( ASC | DESC )?
               | rank_list
;

rank_list : rank_col ( COMMA rank_col )* -> rank_col+
;

rank_col : col_range ( ASC | DESC )?
         | col_ref ( ASC | DESC )?
;

order_clause : ORDER^ rel BY! order_by_clause ( USING! func_clause )?
;

order_by_clause : STAR ( ASC | DESC )?
                | order_col_list
;

order_col_list : order_col ( COMMA order_col )* -> order_col+
;

order_col : col_range (ASC | DESC)?
          | col_ref ( ASC | DESC )?
          | LEFT_PAREN! col_ref ( ASC | DESC )? RIGHT_PAREN!
;

distinct_clause : DISTINCT^ rel partition_clause?
;

partition_clause : PARTITION^ BY! func_name
;

rel_list : rel ( COMMA rel )* -> rel+
;

cross_clause : CROSS^ rel_list partition_clause?
;

join_clause : JOIN^ join_sub_clause ( USING! join_type )? partition_clause?
;

join_type : QUOTEDSTRING
;

join_sub_clause : join_item ( ( ( LEFT | RIGHT | FULL ) OUTER? COMMA! join_item ) | ( ( COMMA! join_item )+ ) )
;

join_item : rel join_group_by_clause -> ^( JOIN_ITEM rel join_group_by_clause )
;

// this can either be a single arg or something like (a,b) - which is
// indistinguishable from a tuple. We'll therefore parse a single argument
// (which can be a tuple of several real_args) and expand it:
join_group_by_clause
@after {
    Tree by = (Tree) retval.getTree();
    Tree realArg = by.getChild(0);
    if(realArg.getType() == TUPLE_VAL
    || (realArg.getType() == FUNC_EVAL && realArg.getChild(0).getType() == TOTUPLE)) {
        retval.tree = adaptor.create(by.getType(), by.getText());
        for(int i = 0; i < realArg.getChildCount(); ++i) {
            if(realArg.getChild(i).getType()!=TOTUPLE)
                ((Tree)retval.tree).addChild(realArg.getChild(i));
        }
        adaptor.setTokenBoundaries(retval.tree, retval.start, retval.stop);
    }
}
    : BY^ real_arg
;

union_clause : UNION^ ONSCHEMA? rel_list
;

cube_clause : CUBE rel BY cube_rollup_list ( COMMA cube_rollup_list )* -> ^( CUBE rel ^( BY cube_rollup_list+ ) )
;

cube_rollup_list : ( CUBE | ROLLUP )^ LEFT_PAREN! real_arg ( COMMA! real_arg )* RIGHT_PAREN!
;

flatten_clause : FLATTEN^ LEFT_PAREN! expr RIGHT_PAREN!
;

// unlike loading and streaming, we want the as_clause (if present) in a different format (i.e.
// we drop the AS token itself).
generate_as_clause : AS! ( ( LEFT_PAREN! field_def_list RIGHT_PAREN! ) | explicit_field_def )
;

flatten_generated_item : flatten_clause generate_as_clause?
                       | real_arg generate_as_clause?
;

// EXPRESSIONS

// conditional precedence is OR weakest, then AND, then NOT, then IS NOT NULL and the comparison operators equally
// by design the boolean operator hierarchy is entirely below the expression hierarchy

real_arg : expr
         | STAR
         | col_range
;

cond : and_cond ( OR^ and_cond )*
;

and_cond : not_cond ( AND^ not_cond )*
;

not_cond : NOT^? unary_cond
;

unary_cond
@after {
    // Expressions in parentheses are a little tricky to match as
    // they could contain either "cond" rules or "expr" rules. If
    // they are "expr" rules then they're put under a BOOL_COND node
    // in the tree, but "cond" rules put no extra tokens in the tree.
    // As we're matching non-recursively we'll parse whatever's in the
    // brackets, and if the AST has a boolean expression at its root
    // then we'll assume we've just got a "cond" expression in
    // brackets, and otherwise we'll assume it's an "expr" (and so
    // we'll have to strip off the BOOL_COND token the "cast_expr"
    // rule added)
    BaseTree tree = (BaseTree) retval.getTree();
    if(tree.getType() == BOOL_COND
    && tree.getChild(0).getType() == EXPR_IN_PAREN
    && BOOLEAN_TOKENS.contains(tree.getChild(0).getChild(0).getType())) {
        retval.tree = tree.getChild(0).getChild(0);
        adaptor.setTokenBoundaries(retval.tree, retval.start, retval.stop);
    }

    // For IN expression, we clone the lhs expression (1st child of the
    // returned tree) and insert it before every rhs expression. For example,
    //
    //   lhs IN (rhs1, rhs2, rhs3)
    // =>
    //   ^( IN lhs, rhs1, lhs, rhs2, lhs, rhs3 )
    //
    // Note that lhs appears three times at index 0, 2 and 4.
    //
    // This is needed because in LogicalPlanGenerator.g, we translate this
    // tree to nested or expressions, and we need to construct a new
    // LogicalExpression object per rhs expression.
    if(tree.getType() == IN) {
        Tree lhs = tree.getChild(0);
        for(int i = 2; i < tree.getChildCount(); i = i + 2) {
            tree.insertChild(i, deepCopy(lhs));
        }
    }
}
    : exp1 = expr
        ( ( IS NOT? NULL -> ^( NULL $exp1 NOT? ) )
        | ( IN LEFT_PAREN ( rhs_operand ( COMMA rhs_operand )* ) RIGHT_PAREN -> ^( IN ^( IN_LHS expr ) ^( IN_RHS rhs_operand )+ ) )
        | ( rel_op exp2 = expr -> ^( rel_op $exp1 $exp2 ) )
        | ( -> ^(BOOL_COND expr) ) )
;

rhs_operand : expr
;

expr : multi_expr ( ( PLUS | MINUS )^ multi_expr )*
;

multi_expr : cast_expr ( ( STAR | DIV | PERCENT )^ cast_expr )*
;

func_name_suffix : ( ( DOLLAR | PERIOD ) eid )+
;

cast_expr
@after {
    BaseTree tree = (BaseTree) retval.getTree();

    // the parser does an initial optimisation step: it removes TOTUPLE / TOMAP / TOBAG
    // function calls if it knows they'll just return the input (i.e. because the function's
    // argument is a literal). We'll do this here by post-processing the result:
    if(tree.getType() == FUNC_EVAL) {
        Integer func = FUNC_TO_LITERAL.get(tree.getChild(0).getType());
        if(func != null) {
            boolean canBeOptimised = true;
            for(int arg = 1; arg < tree.getChildCount() && canBeOptimised; ++arg) {
                canBeOptimised &= LITERAL_TOKENS.contains(tree.getChild(arg).getType());
            }
            if(canBeOptimised) {
                retval.tree = adaptor.create(func, func.toString());
                ((BaseTree)retval.tree).addChildren(tree.getChildren());
                ((BaseTree)retval.tree).deleteChild(0); // the (e.g.) TOBAG token
                adaptor.setTokenBoundaries(retval.tree, retval.start, retval.stop);
            }
        }
    }

    // a minor correction to the token text for formatting -
    // we want NEG's text to be the same as MINUSes
    if(tree.getType() == NEG) {
        ((CommonTree)tree).token.setText("-");
    }

    // As noted below, brackets around a single literal mean a tuple
    // of that literal, not a nested expression which evaluates to
    // that literal. Remember that a NULL with children is a boolean
    // expression, not a literal!
    if(tree.getType() == EXPR_IN_PAREN
    && LITERAL_TOKENS.contains(tree.getChild(0).getType())
    && (tree.getChild(0).getType() != NULL || tree.getChild(0).getChildCount() == 0)) {
        ((CommonTree)tree).token.setType(TUPLE_VAL);
    }

    // For CASE statement, we clone the case expression (1st child of the
    // returned tree) and insert it before every when expression. For example,
    //
    //   CASE e1
    //     WHEN e2 THEN e3
    //     WHEN e4 THEN e5
    //     ELSE e6
    //   END
    // =>
    //   ^( CASE e1, e2, e3, e1, e4, e5, e6 )
    //
    // Note that e1 appears twice at index 0 and 3.
    //
    // This is needed because in LogicalPlanGenerator.g, we translate this
    // tree to nested bincond expressions, and we need to construct a new
    // LogicalExpression object per when branch.
    if(tree.getType() == CASE_EXPR) {
        Tree caseExpr = tree.getChild(0);
        int childCount = tree.getChildCount();
        boolean hasElse = childCount \% 2 == 0;
        int whenBranchCount = ( childCount - (hasElse ? 2 : 1) ) / 2;
        for(int i = 1; i < whenBranchCount; i++) {
            tree.insertChild(3*i, deepCopy(caseExpr));
        }
    }
}
    : scalar
    | MINUS cast_expr -> ^( NEG cast_expr )
    // single columns and functions (both of which can start with an identifier). Note that we have to be
    // careful with periods straight after the identifier, as we want those to be projections, not function
    // calls
    | col_ref_without_identifier projection*
    | invoker_func projection*
    | identifier_plus projection*
    | identifier_plus func_name_suffix? LEFT_PAREN ( real_arg ( COMMA real_arg )* )? RIGHT_PAREN projection*
      -> ^( FUNC_EVAL identifier_plus func_name_suffix? real_arg* ) projection*
    | func_name_without_columns LEFT_PAREN ( real_arg ( COMMA real_arg )* )? RIGHT_PAREN projection*
      -> ^( FUNC_EVAL func_name_without_columns real_arg* ) projection*
    | CASE ( (WHEN)=> WHEN cond THEN expr ( WHEN cond THEN expr )* ( ELSE expr )? END projection*
             -> ^( CASE_COND ^(WHEN cond+) ^(THEN expr+) ) projection*
           | expr WHEN rhs_operand THEN rhs_operand ( WHEN rhs_operand THEN rhs_operand )* ( ELSE rhs_operand )? END projection*
             -> ^( CASE_EXPR ^(CASE_EXPR_LHS expr) ^(CASE_EXPR_RHS rhs_operand)+ ) projection*
           )
    | paren_expr
    | curly_expr
    | bracket_expr
;

invoker_func
@init {
    String staticStr = "true";
    List<String> packageStr = Lists.newArrayList();
    String methodStr = null;
}
    : INVOKE ( AMPERSAND | LEFT_PAREN real_arg { staticStr = "false"; } RIGHT_PAREN )
      ( packageName=identifier_plus PERIOD { packageStr.add($packageName.text); } )*
      methodName=identifier_plus { methodStr=$methodName.text; }
      LEFT_PAREN ( real_arg ( COMMA real_arg )* )? RIGHT_PAREN
      -> ^( INVOKER_FUNC_EVAL IDENTIFIER[Joiner.on(".").join(packageStr)] IDENTIFIER[methodStr] IDENTIFIER[staticStr] real_arg* )
;

// now we have to deal with parentheses: in an expr, '(' can be the
// start of a cast, the start of a nested expression or the start of
// a tuple. We'll ensure parsing is unambiguous by assuming a single
// expression in parentheses is a nested expression, whereas two or
// more nested expressions are a tuple (unless that single expression
// is a literal, in which case we assume tuple with a single element
// - that literal).
paren_expr
@after {
    BaseTree tree = (BaseTree)retval.getTree();

    // the other side of the @after block in unary_cond: if we've
    // matched an EXPR_IN_PAREN we expect the nested expression to
    // be an "expr", not a "cond", so we should strip off the
    // BOOL_COND token.
    if(tree.getType() == EXPR_IN_PAREN
    && tree.getChild(0).getType() == BOOL_COND) {
        int type = tree.getChild(0).getChild(0).getType();
        // NULL is a special case - if it has children it's a boolean
        // expression, and if not it's a literal NULL. Note that we
        // replace *all* children
        if(!BOOLEAN_TOKENS.contains(type)
        || (type == NULL && tree.getChild(0).getChild(0).getChildCount() == 0)) {
            Tree addChildrenOf = tree.getChild(0);
            // delete from the end: removing by ascending index while the
            // child list shrinks would skip every other child
            for(int i = tree.getChildCount() - 1; i >= 0; --i)
                tree.deleteChild(i);
            for(int i = 0; i < addChildrenOf.getChildCount(); ++i)
                tree.addChild(addChildrenOf.getChild(i));
        }
    }

    // A function call to TOTUPLE is inserted into the AST for
    // some tuple literals - but as we assume the first expression
    // after an open bracket is a "cond" rule, and as "cond" rules
    // nest "expr" rules under a BOOL_COND token we get an invalid
    // AST. We'll remove this BOOL_COND here:
    if(tree.getType() == FUNC_EVAL
    && tree.getChild(0).getType() == TOTUPLE
    && tree.getChildCount() > 1
    && tree.getChild(1).getType() == BOOL_COND) {
        Tree insertChildrenOf = tree.getChild(1);
        tree.deleteChild(1);
        for(int i = insertChildrenOf.getChildCount() - 1; i >= 0; --i)
            tree.insertChild(1, insertChildrenOf.getChild(i));
    }
}
    : LEFT_PAREN! try_implicit_map_cast
;

try_implicit_map_cast
    // we'll also allow implicit map casts (for backwards compatibility only -
    // bag and tuple casts have to be explicit and it makes the grammar more
    // simple). Unfortunately we'll have to turn on back-tracking for this rule,
    // as LEFT_PAREN LEFT_BRACKET could be a literal map in a EXPR_IN_PAREN.
    // It'd be much better if we could remove this from the Pig language (and
    // just rely on explicit map casts) - then we'd have no backtracking at all!
    : ( implicit_map_type RIGHT_PAREN cast_expr) => implicit_map_type RIGHT_PAREN cast_expr
      -> ^( CAST_EXPR implicit_map_type cast_expr )
    | after_left_paren
;

after_left_paren : explicit_type_cast RIGHT_PAREN cast_expr -> ^( CAST_EXPR explicit_type_cast cast_expr )
                 // tuples
                 | RIGHT_PAREN projection* -> ^( TUPLE_VAL ) projection*
                 | STAR ( COMMA real_arg )* RIGHT_PAREN projection* -> ^( FUNC_EVAL TOTUPLE STAR real_arg* ) projection*
                 | col_range ( COMMA real_arg )* RIGHT_PAREN projection* -> ^( FUNC_EVAL TOTUPLE col_range real_arg* ) projection*
                 // Tuples begin with '(' expr, but shorthand-booleans begin with '(' cond. As cond
                 // and expr are indistinguishable, we'll parse as a cond (i.e. the most lenient) and
                 // for exprs, strip off the BOOL_COND trees. You can have both nested conds and nested
                 // exprs, so we'll just assume cond.
                 | cond
                   ( ( ( COMMA real_arg )+ RIGHT_PAREN projection* -> ^( FUNC_EVAL TOTUPLE cond real_arg+ ) projection* )
                   | ( RIGHT_PAREN -> ^( EXPR_IN_PAREN cond ) )
                   | ( QMARK exp1 = expr COLON exp2 = expr RIGHT_PAREN -> ^( BIN_EXPR cond $exp1 $exp2 ) ) )
;

curly_expr : LEFT_CURLY real_arg ( COMMA real_arg )* RIGHT_CURLY projection* -> ^( FUNC_EVAL TOBAG real_arg+ ) projection*
           | LEFT_CURLY RIGHT_CURLY projection* -> ^( BAG_VAL ) projection*
;

bracket_expr : LEFT_BRACKET real_arg ( COMMA real_arg )* RIGHT_BRACKET projection* -> ^( FUNC_EVAL TOMAP real_arg+ ) projection*
             | LEFT_BRACKET keyvalue ( COMMA keyvalue )* RIGHT_BRACKET projection* -> ^( MAP_VAL keyvalue+ ) projection*
             | LEFT_BRACKET RIGHT_BRACKET projection* -> ^( MAP_VAL ) projection*
;

projection : PERIOD ( col_ref | LEFT_PAREN col_ref ( COMMA col_ref )* RIGHT_PAREN ) -> ^( PERIOD col_ref+ )
           | POUND^ ( QUOTEDSTRING | NULL )
;

// ATOMS

// for disambiguation with func_names
col_ref_without_identifier : GROUP
                           | DOLLARVAR
;

col_ref : col_ref_without_identifier
        | identifier_plus
;

col_range : c1 = col_ref DOUBLE_PERIOD c2 = col_ref? -> ^(COL_RANGE $c1 DOUBLE_PERIOD $c2?)
          | DOUBLE_PERIOD col_ref -> ^(COL_RANGE DOUBLE_PERIOD col_ref)
;

scalar : INTEGER
       | LONGINTEGER
       | FLOATNUMBER
       | DOUBLENUMBER
       | BIGINTEGERNUMBER
       | BIGDECIMALNUMBER
       | QUOTEDSTRING
       | NULL
       | TRUE
       | FALSE
;

keyvalue : QUOTEDSTRING POUND literal -> ^( KEY_VAL_PAIR QUOTEDSTRING literal )
;

literal_map : LEFT_BRACKET keyvalue ( COMMA keyvalue )* RIGHT_BRACKET -> ^( MAP_VAL keyvalue+ )
            | LEFT_BRACKET RIGHT_BRACKET -> ^( MAP_VAL )
;

literal_bag : LEFT_CURLY literal_tuple ( COMMA literal_tuple )* RIGHT_CURLY -> ^( BAG_VAL literal_tuple+ )
            | LEFT_CURLY RIGHT_CURLY -> ^( BAG_VAL )
;

literal_tuple : LEFT_PAREN literal ( COMMA literal )* RIGHT_PAREN -> ^( TUPLE_VAL literal+ )
              | LEFT_PAREN RIGHT_PAREN -> ^( TUPLE_VAL )
;

literal : scalar
        | literal_map
        | literal_bag
        | literal_tuple
;

// NESTING

nested_blk : ( nested_command SEMI_COLON )* GENERATE flatten_generated_item ( COMMA flatten_generated_item )* SEMI_COLON
             -> nested_command* ^( GENERATE flatten_generated_item+ )
;

nested_command : ( identifier_plus EQUAL col_ref PERIOD col_ref_list { input.LA( 1 ) == SEMI_COLON }? ) => ( identifier_plus EQUAL nested_proj )
                 -> ^( NESTED_CMD identifier_plus nested_proj )
               | identifier_plus EQUAL expr
                 -> ^( NESTED_CMD_ASSI identifier_plus expr )
               | identifier_plus EQUAL nested_op
                 -> ^( NESTED_CMD identifier_plus nested_op )
;

nested_op : nested_filter
          | nested_sort
          | nested_distinct
          | nested_limit
          | nested_cross
          | nested_foreach
;

nested_proj : col_ref PERIOD col_ref_list
              -> ^( NESTED_PROJ col_ref col_ref_list )
;

col_ref_list : ( col_ref | ( LEFT_PAREN col_ref ( COMMA col_ref )* RIGHT_PAREN ) )
               -> col_ref+
;

nested_filter : FILTER^ nested_op_input BY! cond
;

nested_sort : ORDER^ nested_op_input BY! order_by_clause ( USING! func_clause )?
;

nested_distinct : DISTINCT^ nested_op_input
;

nested_limit : LIMIT^ nested_op_input ( (INTEGER SEMI_COLON) => INTEGER | expr )
;

nested_cross : CROSS^ nested_op_input_list
;

nested_foreach: FOREACH nested_op_input GENERATE flatten_generated_item ( COMMA flatten_generated_item )*
                -> ^( FOREACH nested_op_input ^( GENERATE flatten_generated_item+ ) )
;

nested_op_input : col_ref
                | nested_proj
;

nested_op_input_list : nested_op_input ( COMMA nested_op_input )*
                       -> nested_op_input+
;

// IDENTIFIERS

// extended identifier, handling the keyword and identifier conflicts. Ugly but there is no other choice.
eid_without_columns : rel_str_op
    | IMPORT
    | REGISTER
    | RETURNS
    | DEFINE
    | LOAD
    | FILTER
    | FOREACH
    | ROLLUP
    | ORDER
    | DISTINCT
    | COGROUP
    | JOIN
    | CROSS
    | UNION
    | SPLIT
    | INTO
    | IF
    | ALL
    | AS
    | BY
    | USING
    | INNER
    | OUTER
    | PARALLEL
    | PARTITION
    | AND
    | OR
    | GENERATE
    | ASC
    | DESC
    | BOOL
    | BIGINTEGER
    | BIGDECIMAL
    | DATETIME
    | CHARARRAY
    | BYTEARRAY
    | IS
    | STREAM
    | THROUGH
    | STORE
    | MAPREDUCE
    | SHIP
    | CACHE
    | INPUT
    | OUTPUT
    | STDERROR
    | STDIN
    | STDOUT
    | LIMIT
    | SAMPLE
    | LEFT
    | RIGHT
    | FULL
    | REALIAS
    | BOOL_COND
    | ASSERT
;

eid : eid_without_columns
    | IDENTIFIER
    | GROUP
    | CUBE
    | TRUE
    | FALSE
    | INT
    | LONG
    | FLOAT
    | DOUBLE
    | NULL
    | NOT
    | FLATTEN
    | BAG
    | TUPLE
    | MAP
;

// relational operator
rel_op : rel_str_op
       | NUM_OP_EQ
       | NUM_OP_NE
       | NUM_OP_GT
       | NUM_OP_GTE
       | NUM_OP_LT
       | NUM_OP_LTE
;

rel_str_op : STR_OP_EQ
           | STR_OP_NE
           | STR_OP_GT
           | STR_OP_LT
           | STR_OP_GTE
           | STR_OP_LTE
           | STR_OP_MATCHES
;

reserved_identifier_whitelist : RANK
                              | CUBE
                              | IN
                              | WHEN
                              | THEN
                              | ELSE
                              | END
;