/*
 * NOTE(review): this copy of the grammar appears to have been damaged by text extraction.
 *   - Line breaks have been collapsed, so every "//" comment now runs to the end of a very
 *     long physical line and hides the code that originally followed it.
 *   - Token references and definitions written in angle brackets appear to be missing in
 *     many places (for example "SPECIAL_TOKEN : { : DEFAULT }", "t = { jjtThis.setName(t.image) ; }"
 *     and the empty "()?" in CommaOpt).
 *   Restore the file from the original source before regenerating the parser. The code below
 *   is deliberately left untouched; only comments have been added or corrected.
 *
 * Overview (from what is visible here): a JavaCC/JJTree grammar for RDQL.
 *   - options: non-static parser (STATIC = false), one node class per production (MULTI = true),
 *     node classes prefixed "Q_" in package com.hp.hpl.jena.query.parser.rdql.
 *   - PARSER_BEGIN/PARSER_END: class RDQLParser, whose top() returns the JJTree root node.
 *   - Lexical section: whitespace SKIP rules, comment handling, literal, keyword, separator
 *     and operator tokens.
 *   - Productions: Query and its clauses, followed by the constraint expression productions,
 *     which build two-child nodes through "#Name(2)" annotations, then the literal, URI and
 *     QName productions.
 */
/** * RDQL for ARQ * http://jena.hpl.hp.com/~afs/BRQL/BRQL-description.html * * Author: Andy Seaborne andy.seaborne@hp.com * Date: June 2004 * * (c) Copyright 2001, 2002, 2003, 2004, 2005 Hewlett-Packard Development Company, LP * All rights reserved. * See end of file for details. * Derived from RDQL grammar for Jena. * * Constraint expression is derived from Java : * example java1.2-a.jj grammar in JavaCC distribution */ options { // Unicode characters outside of 0-0x00FF must be entered as \u1234 // Javacc does not provide mixed width character streams. JAVA_UNICODE_ESCAPE = true; UNICODE_INPUT = false ; STATIC = false ; // DEBUG_PARSER = true ; // DEBUG_TOKEN_MANAGER = true ; // JJTree options MULTI = true ; // NODE_DEFAULT_VOID = false ; // BUILD_NODE_FILES = false ; NODE_PREFIX = "Q_" ; NODE_PACKAGE = "com.hp.hpl.jena.query.parser.rdql" ; NODE_USES_PARSER = false ; } PARSER_BEGIN(RDQLParser) /* * (c) Copyright 2001, 2002, 2003, 2004 Hewlett-Packard Development Company, LP */ package com.hp.hpl.jena.query.parser.rdql ; public class RDQLParser { public SimpleNode top() { return (SimpleNode)jjtree.rootNode() ; } } PARSER_END(RDQLParser) /* WHITE SPACE */ SKIP : { " " | "\t" | "\n" | "\r" | "\f" } // Need this because we may not be at the start of the identifier // when the mode is entered. For URIs, we enter that mode because // a suitable < is seen, similarly we exit on > so no skip necessary SKIP : { " " | "\t" | "\n" | "\r" | "\f" } /* COMMENTS */ MORE : { "//" : IN_SINGLE_LINE_COMMENT | "#" : IN_SINGLE_LINE_COMMENT | "/*" : IN_MULTI_LINE_COMMENT } SPECIAL_TOKEN : { : DEFAULT } SPECIAL_TOKEN : { : DEFAULT } MORE : { < ~[] > } /* LITERALS */ TOKEN : { < INTEGER_LITERAL: (["l","L"])? | (["l","L"])? // | (["l","L"])? > | // If octal permitted, should be ["1"-"9"] (["0"-"9"])* < #DECIMAL_LITERAL: > | < #HEX_LITERAL: "0" ["x","X"] (["0"-"9","a"-"f","A"-"F"])+ > | < FLOATING_POINT_LITERAL: (["0"-"9"])+ "." (["0"-"9"])* ()? (["f","F","d","D"])? | "." 
(["0"-"9"])+ ()? (["f","F","d","D"])? | (["0"-"9"])+ (["f","F","d","D"])? | (["0"-"9"])+ ()? ["f","F","d","D"] > | < #EXPONENT: ["e","E"] (["+","-"])? (["0"-"9"])+ > | /**** * No character literals * And we allow single quoted strings */ < STRING_LITERAL1: "'" ( (~["'","\\","\n","\r"]) | ("\\" ~["\n","\r"]) )* "'" > | < STRING_LITERAL2: "\"" ( (~["\"","\\","\n","\r"]) | ("\\" ~["\n","\r"]) )* "\"" > } // Modes to read things that might be keywords as well. TOKEN : { ", "\t"] )+ > } TOKEN : { ":" > | | } /* To go ... TOKEN : { ||"_"|"$"|".")+ > | < LETTER: (["a"-"z"] | ["A"-"Z"])> | < DIGIT: ["0"-"9"]> } */ /* Keywords : includes operators that are words and should be * before general things like IDENTIFIER which swallow almost anything */ TOKEN [IGNORE_CASE] : { < SELECT: "select" > | < SOURCE: "source" > | < FROM: "from" > | < WHERE: "where" > | < SUCHTHAT: "and" > | < PREFIXES: "using" > | < OPTIONAL: "optional" > | < FOR: "for" > | < STR_EQ: "eq" > | < STR_NE: "ne" > } TOKEN : { < BOOLEAN_LITERAL: "true" | "false" > | < NULL_LITERAL: "null"> | < DIGITS: (["0"-"9"])+ > } // Notes: // XML 1.1 http://www.w3.org/TR/xml11/ // XML Namespaces 1.1 http://www.w3.org/TR/xml-names11/ // Prefix ':' LocalPart // Prefix is an NCName // LocalPart is an NCName // // // An XML Name, minus the ":" // NCName ::= NCNameStartChar NCNameChar* // NCNameChar ::= NameChar - ':' // NCNameStartChar ::= NameStartChar - ':' // NameChar and NameStartChar defined in XML 1.1 // NameStartChar := ":" | [A-Z] | "_" | [a-z] | // [#xC0-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | // [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | // [#x3001-#xD7FF] | [#xF900-#xEFFFF] // NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 | // [#x0300-#x036F] | [#x203F-#x2040] TOKEN: { | > } /* SEPARATORS */ TOKEN : { < LPAREN: "(" > | < RPAREN: ")" > | < LBRACE: "{" > | < RBRACE: "}" > | < LBRACKET: "[" > | < RBRACKET: "]" > | < SEMICOLON: ";" > | < COMMA: "," > | < DOT: "." 
> } /* OPERATORS */ TOKEN : { // ASSIGN is here so that the tokenizer will process it and // the parser will flag an error. Otherwise, the tokenizer // gives a less helpful message. < ASSIGN: "=" > | < GT: ">" > | < LT: "<" > | < BANG: "!" > | < TILDE: "~" > | < HOOK: "?" > | < COLON: ":" > | < EQ: "==" > | < NEQ: "!=" > | < LE: "<=" > // Maybe: | "=>" > | < GE: ">=" > // Maybe: | "=<" > | < SC_OR: "||" > | < SC_AND: "&&" > //| < SC_XOR: "^^" > | < INCR: "++" > | < DECR: "--" > | < PLUS: "+" > | < MINUS: "-" > | < STAR: "*" > | < SLASH: "/" > | < BIT_AND: "&" > | < BIT_OR: "|" > | < BIT_XOR: "^" > | < REM: "%" > | < LSHIFT: "<<" > | < RSIGNEDSHIFT: ">>" > | < RUNSIGNEDSHIFT: ">>>" > // The tokens for string EQ and string NE are done before IDENTIFIER // to ensure that they are recognized as reserved words. | < STR_MATCH: ("=~"|"~~") > | < STR_NMATCH: "!~"> | < DATATYPE: "^^"> | < AT: "@"> } // **** Debug point void CompilationUnit() #void : {} { // The tests for trailing junk // but does not work for "//..." Query() } // Optional comma void CommaOpt() #void : {} { ()? } // **** Debug point /* Query: a SelectClause followed, in this order, by an optional SourceClause, TriplePatternClause, ConstraintClause and PrefixesClause. */ void Query() : {} { SelectClause() ( SourceClause() )? ( TriplePatternClause() ) ? ( ConstraintClause() )? ( PrefixesClause() ) ? } void SelectClause() : {} { LOOKAHEAD(2) "*" } void SourceClause() : {} { ( | ) SourceSelector() (CommaOpt() SourceSelector() )* } void SourceSelector() : {} { // Must be quoted, must be a URL - no qnames at this point. URL() } void TriplePatternClause() : {} { TriplePattern() ( CommaOpt() TriplePattern() )* } void ConstraintClause() : {} { // This comma is not optional - must have comma or "AND" Expression() ( ( | ) Expression() )* } void TriplePattern() : {} { VarOrURI() CommaOpt() VarOrURI() CommaOpt() VarOrLiteral() } void VarOrURI() #void : {} { Var() | URI() } void VarOrLiteral() #void : {} { Var() | Literal() } void Var() : { Token t ;} { t = { jjtThis.setName(t.image) ; } // OLD // "?" 
Identifier() } void PrefixesClause() : { } { // Broken: comma should be optional here. But ... it isn't in practice. // PrefixDecl starts with an IDENTIFIER, read in READ_IDENTIFER // token context but CommaOpt may swallow a comma in DEFAULT tokenizing mode // Generated parser seems not to work. PrefixDecl() ( CommaOpt() PrefixDecl() )* } void PrefixDecl() : {} { Identifier() QuotedURI() } /******************************************************************/ // Constraint syntax follows. // **** Debug point void Expression() #void : {} { ConditionalOrExpression() } void ConditionalOrExpression() #void : {} { ConditionalXorExpression() ( ConditionalXorExpression() #LogicalOr(2) )* } void ConditionalXorExpression() #void : {} { ConditionalAndExpression() // Skip this //( ConditionalAndExpression() #LogicalXor(2) )* } void ConditionalAndExpression() #void : {} { ValueLogical() ( ValueLogical() #LogicalAnd(2) )* } // End of boolean expressions /******************************************************************/ // Things that are not operations on boolean terms. void ValueLogical() #void : {} { StringEqualityExpression() } void StringEqualityExpression() #void : {} { NumericalLogical() ( NumericalLogical() #StringEqual(2) | NumericalLogical() #StringNotEqual(2) | PatternLiteral() #StringMatch(2) | PatternLiteral() #StringNoMatch(2) )* } // Expressions that involve comparing numbers. void NumericalLogical() #void : {} { InclusiveOrExpression() } void InclusiveOrExpression() #void : {} { ExclusiveOrExpression() ( ExclusiveOrExpression() #BitOr(2) )* } void ExclusiveOrExpression() #void : {} { AndExpression() ( AndExpression() #BitXor(2) )* } void AndExpression() #void : {} { ArithmeticCondition() ( ArithmeticCondition() #BitAnd(2) )* } void ArithmeticCondition() #void : {} { EqualityExpression() } void EqualityExpression() #void : {} { RelationalExpression() ( RelationalExpression() #Equal(2) | RelationalExpression() #NotEqual(2) )? 
} void RelationalExpression() #void : {} { NumericExpression() ( NumericExpression() #LessThan(2) | NumericExpression() #GreaterThan(2) | NumericExpression() #LessThanOrEqual(2) | NumericExpression() #GreaterThanOrEqual(2) )? } /******************************************************************/ // **** Debug point void NumericExpression () #void : {} { ShiftExpression() } void ShiftExpression() #void : {} { AdditiveExpression() ( AdditiveExpression() #LeftShift(2) | AdditiveExpression() #RightSignedShift(2) | AdditiveExpression() #RightUnsignedShift(2) )* } void AdditiveExpression() #void : {} { MultiplicativeExpression() ( MultiplicativeExpression() #Add(2) | MultiplicativeExpression() #Subtract(2) )* } void MultiplicativeExpression() #void : {} { UnaryExpression() ( UnaryExpression() #Multiply(2) | UnaryExpression() #Divide(2) | UnaryExpression() #Modulus(2) )* } void UnaryExpression() #void : {} { UnaryExpressionNotPlusMinus() | ( UnaryExpression() #UnaryPlus(1) | UnaryExpression() #UnaryMinus(1) ) } void UnaryExpressionNotPlusMinus() #void : {} { ( | ) UnaryExpression() #UnaryNot(1) | PrimaryExpression() } void PrimaryExpression() #void : {} { Var() | Literal() | FunctionCall() | // And this is why expressions are not typed by the parser! // Arbitrary lookahead of chars to see is the expression // is numeric or boolean. Expression() } void FunctionCall() : {} { "&" Identifier() ArgList() } void ArgList() : {} { VarOrLiteral() ( VarOrLiteral() ) * } /******************************************************************/ // Literals (as in query literals - any value in the query // Not "RDF literals". void Literal() #void : {} { URI() | NumericLiteral() | TextLiteral() | BooleanLiteral() | NullLiteral() } void NumericLiteral() : { Token t; } { t = { jjtThis.set(true, t.image) ; } | t = { jjtThis.set(false, t.image) ; } } void TextLiteral() : { Token t ; } { ( t = { jjtThis.set(t.image) ; } | t = { jjtThis.set(t.image) ; } ) // Optional lang tag and datatype. 
( Identifier() ) ? ( URI() )? } TOKEN : { // This just creates a token symbol to read a character- we fill it in Java below. // Doing it like this means we use the parser token routines, not the tokenizer directly. } // SKIP : { " " | "\t" | "\n" | "\r" | "\f" } /* PatternLiteral: saves the current lexical state, switches the token manager to READ_REGEX and reads the regular expression in Java, one token at a time, using the first character of each token image. It collects the pattern text up to the closing marker and then any i/m/s/x modifier characters, restores the saved lexical state and stores pattern and modifiers on the node. NOTE(review): every "throw new Error" below is reached before token_source.SwitchTo(state), so the saved lexical state is not restored on those paths. */ void PatternLiteral() : { Token t ; Token mtoken ; char marker ; int state = 0 ; } { // Skip whitespace { state = token_source.curLexState ; token_source.SwitchTo(READ_REGEX) ; } // Pattern language is: [m]/pattern/[i][m][s][x] // Note the leading "m" is optional because // is // often in conflict with URIs so the convenience // of, say "!", as a leading marker is good. // We do check that the RE isn't "....", i.e. markers, with // no "m", that might be a plain string. // Having a variable marker is tricky because we can't // define the pattern in fixed tokens. But the target is simple // so we just do it in java. // Skip to marker char (processing escapes) { while(true) { t = getNextToken() ; if ( t.kind == EOF ) throw new Error("End of file: expecting the start of a regular expression") ; marker = t.image.charAt(0) ; // Skip whitespace if ( marker != ' ' && marker != '\n' && marker != '\t' && marker != '\r' && marker != '\f' ) break ; } if ( marker != '/' ) { if ( marker == 'm' ) { //marker = jj_input_stream.readChar() ; t = getNextToken() ; marker = t.image.charAt(0) ; } else { if ( marker == '"' || marker == '\'' ) // Does not start m, and does start with " or ' throw new Error("Invalid regular expression (starts with ["+marker+"]) at line " + t.beginLine + " column " + t.beginColumn + "."); } // Sanity check - delimiter isn't an alphanumeric if ( Character.isLetterOrDigit(marker) ) throw new Error("Invalid start to regular expression at line " + t.beginLine + " column " + t.beginColumn + "."); } String patternString = "" ; boolean inEscape = false ; while(true) { char ch ; t = getNextToken() ; if ( t.kind == EOF ) throw new Error("End of file during regular expression") ; ch = t.image.charAt(0) ; if ( 
ch == '\n' || ch == '\r' || ch == '\f' ) throw new Error("Invalid regular expression at line " + t.beginLine + " column " + t.beginColumn + "."); if ( inEscape ) { if ( ch == 'n' ) ch = '\n' ; if ( ch == 't' ) ch = '\t' ; if ( ch == 'r' ) ch = '\r' ; if ( ch == 'b' ) ch = '\b' ; // But if we are escaping a character that is regex significant, // leave in the escape. if ( ch != marker ) patternString = patternString + '\\' ; inEscape = false ; } else { // Escape? if ( ch == '\\' ) { inEscape = true ; continue ; } if ( ch == marker ) break ; } patternString = patternString + ch ; } // Read modifiers String modifiers = "" ; while(true) { char ch ; // End of file possible. t = getNextToken() ; if ( t.kind == EOF ) break ; ch = t.image.charAt(0) ; if ( ch == 'i' || ch == 'm' || ch == 's' || ch == 'x' ) modifiers = modifiers + ch ; else { jj_input_stream.backup(1) ; break ; } } token_source.SwitchTo(state) ; jjtThis.setPattern(patternString) ; if ( modifiers.length() > 0 ) jjtThis.setModifiers(modifiers) ; } } void BooleanLiteral() : { Token t ; } { t = { jjtThis.set(t.image) ; } } void NullLiteral() : {} { } void URL() : {} { QuotedURI() } void URI() #void : { } { QuotedURI() | QName() } /* QName: a prefix Identifier, then ":", then an optional local-part Identifier; the node value is set to the prefix token image + ":" + the image of the most recent token. NOTE(review): when the optional local part is absent, t2 is presumably the ":" token itself, which would give "prefix::" - confirm against the generated parser. */ void QName() : { Token t1, t2 ; } { // ":" () { jjtThis.set(token.image) ; } Identifier() { t1 = token ; } ":" (LOOKAHEAD(2) Identifier())? { t2 = token ; jjtThis.set(t1.image+":"+t2.image) ; } } // Should move this into tokens section // as part of getting the ordering right. void QuotedURI() : { Token tt = null ; int state = 0 ; } { "<" { state = token_source.curLexState ; token_source.SwitchTo(READ_URI) ; } tt = { jjtThis.set(tt.image) ; token_source.SwitchTo(state) ; } ">" } // Need to generalise this to include leading digits // That is, the second part of an NCName expressions void Identifier() : {} { ( // // And all keywords |