#!/usr/bin/perl # Grammar into HTML # Read in sparql.txt and the tokens.txt file ## ToDo: ## Check tokens exist and are used ## Validate if ( $#ARGV != 1 ) { print STDERR "Usage: grammar.txt tokens.txt\n" ; exit 1 ; } $/ = undef ; # Just table or full page. $TABLE = 1 ; $grammarFile = $ARGV[0] ; $tokensFile = $ARGV[1] ; ## $grammarFile = 's.txt' ; ## $tokensFile = 't.txt' ; $grammar = &readFile($grammarFile) ; $tokens = &readFile($tokensFile) ; $grammar =~ s!DOCUMENT START!! ; # $grammar =~ s!NON-TERMINALS!! ; $grammar =~ s!DOCUMENT END!! ; $grammar =~ s!TOKENS.*NON-TERMINALS!!s ; $grammar =~ s!//.*!!g ; $grammar =~ s!\r!!g ; # remove leading whitespace $grammar =~ s!^[\n\s]*!\n! ; # Merge alts $grammar =~ s!\n\s*\|!\ |!g ; $tokens =~ s!//.*!!g ; $tokens =~ s!\r!!g ; ## Grammar #print "GRAMMAR\n" ; @g = split(/\n\s*/, $grammar) ; @rules = () ; %ruleMap = () ; %tokenMap = () ; %inline = () ; # Grammar rules # Direct from "jjdoc -TEXT=true" for $g (@g) { ($rulename, $rulebody) = split(/:=/,$g) ; $rulename =~ s!^\s*!! ; $rulename =~ s!\s*$!! ; $rulebody =~ s!^\s*!! ; $rulebody =~ s!\s*$!! ; # Remove outer brackets # $rulebody =~ s!^\((.*)\)$!$1! ; # Remove <> around tokens in grammar. ## Now done very late (as <>) in fixups. ## $rulebody =~ s/\<(\w+)\>/$1/g ; # Leave in - so tokens distinguished from rules next if $rulename eq '' ; #next if $rulebody eq '' ; # Skip the root rule. next if ( $rulename eq 'CompilationUnit' ) ; $rulebody = 'Perl 5 regular expression' if ( $rulename eq 'PatternLiteral' ) ; push @rules, $rulename ; warn "Duplicate rule (grammar): $rulename\n" if defined($ruleMap{$rulename}) ; $ruleMap{$rulename} = $rulebody ; ## print "----------\n" ; ## print $rulename,"\n" ; ## print $rulebody,"\n" ; } # Tokens # Produced by "jj2tokens" # Hand edited to indicate the inlines $tokens =~ s/\n+/\n/g ; $tokens =~ s/^\n// ; @t = split(/\n(?=\<|\[)/, $tokens) ; for $t (@t) { ($tokenname,$tokenbody) = split(/::=/, $t) ; $tokenname =~ s!^\s*!! ; $tokenname =~ s!\s*$!! ; ## # remove <> around tokens ## Do very late as a formatting fix up. ## $tokenname =~ s/^\/ ; ## $tokenname =~ s/\>$// ; $tokenname =~ s/#// ; $tokenbody =~ s!^\s*!! ; $tokenbody =~ s!\s*$!! ; # <> round tokens # Remove at last minute. # Remove outer () # $tokenbody =~ s!^\((.*)\)$!$1! ; # Inline? if ( $tokenname =~ /^\[\<\w*\>\]/ ) { warn "Duplicate inline (token): $tokenname\n" if defined($inline{$tokenname}) ; $tokenname =~ s/^\[//g ; $tokenname =~ s/\]$//g ; $tokenbody =~ s/"/'/g ; # '" -- But not literal " -- how? $tokenbody =~ s/\<\>\'\{\}/\<\>\"\{\}/ ; # '" IRI fixup $inline{$tokenname} = $tokenbody ; #print "INLINE: ",$tokenname," => ",$tokenbody,"\n" ; } else { push @rules, $tokenname ; warn "Duplicate rule (token): $tokenname\n" if defined($tokenMap{$tokenname}) ; $ruleMap{$tokenname} = $tokenbody ; } } # Table if ( ! $TABLE ) { print "\n" ; print "\n" ; print "\n" ; print "\n"; print "
\n"; print "Productions for terminals:
\n" ; print "",code('gRuleLabel', $rlabel)," | \n" ; #print "",span('gRuleHead', $rn)," | \n" ; print "",code('gRuleHead',$rn)," | \n" ; print "::= | \n" ; $rb = fixupRule($rulename, $rb) ; print "",code('gRuleBody',$rb)," | \n" ; print "
' . $t . '
' ;
}
sub fixupHead
{
my $head = $_[0] ;
# Remove <> around tokens.
$head =~ s/<(\w+)>/$1/g ;
return $head ;
}
sub fixupRule
{
my $head = $_[0] ;
my $body = $_[1] ;
# Remove unnecessary ()
$body =~ s/\(\s*([^()| ]*) \)/$1/g ;
## if ( $body =~ m!\(\s+(\]*\>[^<>]*\)\s+\)! )
## {
## $b = $body ;
## print "================================\n" ;
## print STDERR "$b\n" ;
## print STDERR "--------\n" ;
## $b =~ s!\(\s+(\]*\>[^<>]*\)\s+\)!$1!g ;
## $b =~ s!\(\s+(\]*\>[^<>]*\)\s+\)!$1!g ;
## print STDERR "$b\n" ;
## print STDERR "=====\n" ;
## print STDERR "\n" ;
## }
# Remove outer matching () where there are no inner ()
$body =~ s/^\(\s+([^\(]*)\s+\)$/$1/ ;
# ( A )* => A* and for + and ? where A is a linked or spanned object
$body =~ s!\(\s+(\]*\>[^<>]*\)\s+\)!$1!g ;
$body =~ s!\(\s+(\]*\>[^<>]*\)\s+\)!$1!g ;
# There aren't any of these
## $body =~ s!\(\s+(\S*)\s+\)!$1!g ;
# Remove <> around tokens.
$body =~ s/<(\w+)>/$1/g ;
# Specials
# Split long bodies
if ( $head eq "CallExpression" ||
$head eq "UnaryExpression" ||
$head eq "