SPARQL Grammar

#!/usr/bin/perl # Grammar into HTML # Read in sparql.txt and the tokens.txt file ## ToDo: ## Check tokens exist and are used ## Validate if ( $#ARGV != 1 ) { print STDERR "Usage: grammar.txt tokens.txt\n" ; exit 1 ; } $/ = undef ; # Just table or full page. $TABLE = 1 ; $grammarFile = $ARGV[0] ; $tokensFile = $ARGV[1] ; $grammar = &readFile($grammarFile) ; $tokens = &readFile($tokensFile) ; $grammar =~ s!DOCUMENT START!! ; # $grammar =~ s!NON-TERMINALS!! ; $grammar =~ s!DOCUMENT END!! ; $grammar =~ s!TOKENS.*NON-TERMINALS!!s ; $grammar =~ s!//.*!!g ; $grammar =~ s!\r!!g ; # remove leading whitespace $grammar =~ s!^[\n\s]*!\n! ; # Merge alts $grammar =~ s!\n\s*\|!\ |!g ; $tokens =~ s!//.*!!g ; $tokens =~ s!\r!!g ; ## Grammar #print "GRAMMAR\n" ; @g = split(/\n\s*/, $grammar) ; @rules = () ; %ruleMap = () ; %tokenMap = () ; %inline = () ; # Grammar rules # Direct from "jjdoc -TEXT=true" for $g (@g) { ($rulename, $rulebody) = split(/:=/,$g) ; $rulename =~ s!^\s*!! ; $rulename =~ s!\s*$!! ; $rulebody =~ s!^\s*!! ; $rulebody =~ s!\s*$!! ; # Remove outer brackets # $rulebody =~ s!^\((.*)\)$!$1! ; # Remove <> around tokens in grammar. ## Now done very late (as <>) in fixups. ## $rulebody =~ s/\<(\w+)\>/$1/g ; # Leave in - so tokens distinguished from rules next if $rulename eq '' ; #next if $rulebody eq '' ; # Skip the root rule. next if ( $rulename eq 'CompilationUnit' ) ; $rulebody = 'Perl 5 regular expression' if ( $rulename eq 'PatternLiteral' ) ; push @rules, $rulename ; warn "Duplicate rule (grammar): $rulename\n" if defined($ruleMap{$rulename}) ; $ruleMap{$rulename} = $rulebody ; ## print "----------\n" ; ## print $rulename,"\n" ; ## print $rulebody,"\n" ; } # Tokens # Produced by "jj2tokens" # Hand edited to indicate the inlines $tokens =~ s/\n+/\n/g ; $tokens =~ s/^\n// ; @t = split(/\n(?=\<|\[)/, $tokens) ; for $t (@t) { ($tokenname,$tokenbody) = split(/::=/, $t) ; $tokenname =~ s!^\s*!! ; $tokenname =~ s!\s*$!! ; ## # remove <> around tokens ## Do very late as a formatting fix up. ## $tokenname =~ s/^\$// ; $tokenname =~ s/#// ; $tokenbody =~ s!^\s*!! ; $tokenbody =~ s!\s*$!! ; # <> round tokens # Remove at last minute. # Remove outer () # $tokenbody =~ s!^\((.*)\)$!$1! ; # Inline? if ( $tokenname =~ /^\[\<\w*\>\]/ ) { warn "Duplicate inline (token): $tokenname\n" if defined($inline{$tokenname}) ; $tokenname =~ s/^\[//g ; $tokenname =~ s/\]$//g ; $tokenbody =~ s/"/'/g ; # '" -- But not literal " -- how? $tokenbody =~ s/\<\>\'\{\}/\<\>\"\{\}/ ; # '" IRI fixup $inline{$tokenname} = $tokenbody ; #print "INLINE: ",$tokenname," => ",$tokenbody,"\n" ; } else { push @rules, $tokenname ; warn "Duplicate rule (token): $tokenname\n" if defined($tokenMap{$tokenname}) ; $ruleMap{$tokenname} = $tokenbody ; } } # Table if ( ! $TABLE ) { print "\n" ; print "\n" ; print "\n" ; print "\n"; print "\n"; print "SPARQL Grammar\n" ; print "\n" ; print "\n"; print "\n"; print "\n" ; } print "

\n" ; print " \n" ; $ruleNum = 0 ; for $r (@rules) { $DEBUG = 0 ; $ruleNum++ ; $rulename = $r ; $rulebody = $ruleMap{$rulename} ; ## $DEBUG = 1 if ( $rulename =~ /Prolog/ ) ; $rb = $rulebody ; if ( $DEBUG ) { print STDERR "\n" ; print STDERR "Rule: $rulename\n" ; print STDERR "Body: $rulebody\n" ; } ## Do before '||' substitution # Not perfect - some fixups later. #$rb =~ s%\|%\
\|%g ; # Escape HTML chars before adding markup. $rb = esc($rb) ; # Inlines for $k (keys %inline) { $s = span('token', $inline{$k}) ; $k = esc($k) ; # Assumes escaped <> round tokens. $k = quotemeta $k ; $rb =~ s/$k/$s/g ; } if ( $DEBUG ) { print STDERR "After inlining\n" ; print STDERR $rb,"\n" ; ; } # Add hrefs - issue if one is a substring of another \W helps. for $k (keys %ruleMap) { $s = href("r-".$k,$k) ; $k = esc($k) ; $k = quotemeta $k ; ## if ( $DEBUG ) ## { ## print STDERR "K:$k\n" ; ## } $rb =~ s/(?=\W)(\s*)$k(\s*)(?=\W)/$1$s$2/g ; $rb =~ s/^$k(\s*)(?=\W)/$s$1/g ; $rb =~ s/(?=\W)(\s*)$k$/$1$s/g ; $rb =~ s/^$k$/$s/g ; } if ( $DEBUG ) { print STDERR "After hrefs\n" ; print STDERR $rb,"\n" ; ; } #exit if $ruleNum > 2 ; $rn = anchor("r-".$rulename, $rulename) ; $rn = fixupHead($rn) ; if($rulename eq 'IRIREF') { print "

\n" ; print "

\n" ; print "

Productions for terminals:

\n" ; print "

\n" ; print " \n" ; } print "\n" ; print "\n" ; $rlabel = '[' . $ruleNum . '] ' ; print " \n" ; #print " \n" ; print " \n" ; print " \n" ; $rb = fixupRule($rulename, $rb) ; print " \n" ; print "\n" ; # $rule{$rulename, $rulebody) ; # print $rulename , "\n" ; } print "

",code('gRuleLabel', $rlabel),"

",span('gRuleHead', $rn),"

",code('gRuleHead',$rn),"

::=

",code('gRuleBody',$rb),"

\n" ; print "

\n" ; if ( !$TABLE ) { print "\n" ; print "\n" ; print "\n" ; } sub readFile { my $f = $_[0] ; open(F, "$f") || die "$!"; my $s = ; return $s ; } sub esc { my $s = $_[0] ; $s =~ s/&/&/g ; $s =~ s//>/g ; return $s ; } sub span { my $c = $_[0] ; my $t = $_[1] ; $t = esc($t) ; my $s = '' . $t . '' ; return $s ; } sub href { my $a = $_[0] ; my $t = $_[1] ; $a = sane($a) ; $t = esc($t) ; my $s = '' . $t . '' ; return $s ; } sub anchor { my $a = $_[0] ; my $t = $_[1] ; $a = sane($a) ; $t = esc($t) ; my $s = '' . $t . '' ; return $s ; } sub sane { my $a = $_[0] ; $a =~ s/\W//g ; return $a ; } sub code { my $c = $_[0] ; my $t = $_[1] ; return '' . $t . '' ; } sub fixupHead { my $head = $_[0] ; # Remove <> around tokens. $head =~ s/<(\w+)>/$1/g ; return $head ; } sub fixupRule { my $head = $_[0] ; my $body = $_[1] ; # Remove unnecessary () $body =~ s/\(\s*([^()| ]*) \)/$1/g ; ## if ( $body =~ m!\(\s+(\]*\>[^<>]*\)\s+\)! ) ## { ## $b = $body ; ## print "================================\n" ; ## print STDERR "$b\n" ; ## print STDERR "--------\n" ; ## $b =~ s!\(\s+(\]*\>[^<>]*\)\s+\)!$1!g ; ## $b =~ s!\(\s+(\]*\>[^<>]*\)\s+\)!$1!g ; ## print STDERR "$b\n" ; ## print STDERR "=====\n" ; ## print STDERR "\n" ; ## } # Remove outer matching () where there are no inner () $body =~ s/^\(\s+([^\(]*)\s+\)$/$1/ ; # ( A )* => A* and for + and ? where A is a linked or spanned object $body =~ s!\(\s+(\]*\>[^<>]*\)\s+\)!$1!g ; $body =~ s!\(\s+(\]*\>[^<>]*\)\s+\)!$1!g ; # There aren't any of these ## $body =~ s!\(\s+(\S*)\s+\)!$1!g ; # Remove <> around tokens. $body =~ s/<(\w+)>/$1/g ; # Specials # Split long bodies if ( $head eq "CallExpression" || $head eq "UnaryExpression" || $head eq "" || $head eq "PatternElement" || $head eq "BuiltInCall" ) { $body =~ s%\|%\
\|%g ; $body =~ s/^\s+// ; $body = " ".$body ; } if ( $head eq "Aggregate" ) { # Strip outer () $body =~ s/^\(\s*(.*)\s*\)$/$1/ ; ## Be careful of nested | in COUNT $body =~ s%(\| \$1%g ; $body =~ s/^\s+// ; $body = " ".$body ; } if ( $head eq "BuiltInCall" ) { # Undo
for BNODE, RAND #
| NIL ) $body =~ s%\
\| *\NIL\%\| \NIL\%g ; } if ( $head eq "RelationalExpression" || $head eq "AdditiveExpression" || $head eq "MultiplicativeExpression" || $head eq "ConditionalOrExpression") { $body =~ s%\*\(%
\(% ; } # These failed the outer () test because they have nested () in them if ( $head eq "QueryPattern" || $head eq "OrderCondition" ) { # Remove outer () $body =~ s/^\((.*)\)$/$1/ ; } if ( $head eq "Query" ) { $body =~ s! \(!
\(! ; $body =~ s!\) !\)
! ; } if ( $head =~ m/(Select|Construct|Describe|Ask)Query/ ) { # Put a line break before the DatasetClause # DatasetClause $c = 'DatasetClause' ; $c = quotemeta $c ; # Expects the dataset clause to be unbracketted $body =~ s!(\(\s*$c)!
$1! ; } if ( $head eq "OrderCondition" ) { $body =~ s!\)\s*\|\s*\(!\)
\| \(! ; $body = " ".$body ; } #Rules where an outer () is unnecessary. if ( $head eq "GroupCondition" || $head eq "LimitOffsetClauses" || $head eq "GraphOrDefault" || $head eq "ArgList" || $head eq "ExpressionList" || $head eq "PathPrimary" || $head eq "PathMod" || $head eq "PathPrimary" || $head eq "PathNegatedPropertySet" || $head eq "PathOneInPropertySet") { $body =~ s/^\(\s*(.*)\s*\)$/$1/ ; } return $body ; }