//-------------------------------------------------------------------------------
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
//-------------------------------------------------------------------------------

options {
  JAVA_UNICODE_ESCAPE=false;
  STATIC=false;
  DEBUG_PARSER=false;
  DEBUG_TOKEN_MANAGER=false;
}

PARSER_BEGIN(PDFParser)

package org.apache.padaf.preflight.javacc;

import java.io.IOException;
import java.io.InputStream;

import org.apache.padaf.preflight.HeaderParseException;
import org.apache.padaf.preflight.BodyParseException;
import org.apache.padaf.preflight.CrossRefParseException;
import org.apache.padaf.preflight.TrailerParseException;
import org.apache.padaf.preflight.PdfParseException;
import org.apache.padaf.preflight.AbstractValidator;

import static org.apache.padaf.preflight.ValidationConstants.*;

/**
 * Parser generated from this grammar. It walks the header, body, cross
 * reference table and trailer productions declared below and reports
 * violations through {@link ParseException} subclasses.
 */
public class PDFParser {

  /** Image of the header token, stored by the PDF_header production. */
  public String pdfHeader = "";

  /**
   * Runs the {@code PDF()} start production on the given stream.
   * The stream is not closed by this method.
   *
   * @param is stream to parse
   * @return {@code true} when the start production completes; failures are
   *         reported by exception, never by a {@code false} result
   * @throws IOException declared for callers; kept for interface compatibility
   * @throws ParseException if the content does not match the grammar
   */
  public static boolean parse (InputStream is) throws IOException,ParseException {
    PDFParser parser = new PDFParser (is);
    parser.PDF();
    return true;
  }

  /**
   * Command line entry point: parses standard input when called without
   * argument, or the file named by the single argument, and prints timings.
   *
   * @param args empty, or one element holding the path of the file to parse
   */
  public static void main (String [] args) {
    PDFParser parser;
    // Only set when this method opened a file itself; System.in is never closed here.
    java.io.FileInputStream fileStream = null;
    String sourceName;
    long initTime = 0;
    long parseTime = 0;
    long startTime = 0;

    if (args.length == 0) {
      // Named explicitly so the success message does not print "null".
      sourceName = "standard input";
      System.out.println("PDF Parser . . .");
      parser = new PDFParser(System.in);
    } else if (args.length == 1) {
      sourceName = args[0];
      System.out.println("PDF Parser : Reading from file " + sourceName + " . . .");
      try {
        startTime = System.currentTimeMillis();
        fileStream = new java.io.FileInputStream(sourceName);
        parser = new PDFParser(fileStream);
        initTime = System.currentTimeMillis() - startTime;
      } catch (java.io.FileNotFoundException e) {
        System.out.println("PDF Parser : File " + sourceName + " not found.");
        return;
      }
    } else {
      System.out.println("PDF Parser : Usage is one of:");
      System.out.println(" java PDFParser < inputfile");
      System.out.println("OR");
      System.out.println(" java PDFParser inputfile");
      return;
    }

    try {
      startTime = System.currentTimeMillis();
      parser.PDF();
      parseTime = System.currentTimeMillis() - startTime;
      System.out.println("PDF Parser ");
      System.out.print(" PDF Parser parsed " + sourceName + " successfully in " + (initTime + parseTime) + " ms.");
      System.out.println(" Init. : " + initTime + " ms / parse time : " + parseTime + " ms");
    } catch (ParseException e) {
      e.printStackTrace(System.out);
      System.out.println("PDF Parser : Encountered errors during parse.");
    } finally {
      // Release the file handle whatever the outcome of the parse.
      if (fileStream != null) {
        try {
          fileStream.close();
        } catch (IOException ignored) {
          // Parsing is over and the stream was only read; nothing to recover.
        }
      }
    }
  }
}

PARSER_END(PDFParser)

// --------------------------------------------------
// ---- COMMON TOKENS
// ---- OTHER_WHITE_SPACE : "\u0000"|"\u0009"|"\u000C" == NULL, HORIZONTAL TAB, FORM FEED
// --------------------------------------------------
TOKEN :
{
  < SPACE : " " >
|
  < OTHER_WHITE_SPACE : "\u0000"|"\u0009"|"\u000C" >
|
  < EOL : "\n"|"\r"|"\r\n">
}

// --------------------------------------------------
// ---- HEADER TOKENS
// ---- Even if the "ISO 19005 App Notes" say that only PDF-1.[1-4] should be allowed,
//      "ISO 19005-1:2005" says : "Neither the version number in the header of a PDF file nor
//      the value of the Version key in the document catalog dictionary shall be used in
//      determining whether a file is in accordance with this part of ISO 19005"
// --------------------------------------------------
// ==================================================================
// Token definitions and grammar productions of the PDF parser.
//
// NOTE(review): this span looks damaged by extraction. Several token
// references of the form < NAME > and lexical-state prefixes appear to be
// missing (e.g. "(||)*", "TOKEN : { : DEFAULT }"), and the original line
// breaks are gone, so each "//" comment now swallows the code that follows
// it on the same physical line. Restore from the pristine grammar before
// regenerating the parser.
//
// What is still readable on the next physical line:
//  - PERCENT, PDFA_HEADER ("PDF-1." followed by one digit from 1 to 6) and
//    BINARY_TAG (two or more characters in the \u0080-\u00FF range).
//  - Body/object tokens; two of them switch the lexer to the WithinStream
//    and WithinLIT states.
//  - XREF_TAG switches to CrossRefTable, TRAILER_TAG switches back to DEFAULT.
//  - STARTXREF_TAG switches to WithinTrailer, EOF_TRAILER_TAG ("%%EOF")
//    switches back to DEFAULT.
//  - Productions indirect_object() and the beginning of object_content(),
//    which calls checkNumericLength(), checkStringHexLength() and
//    checkNameLength() from actions placed after some alternatives.
// ==================================================================
TOKEN : { < PERCENT: "%" > | < PDFA_HEADER: "PDF-1."["1"-"6"] > | < BINARY_TAG : (["\u0080"-"\u00FF"]){2,} > } // -------------------------------------------------- // ---- BODY / OBJECT TOKENS // -------------------------------------------------- TOKEN : { |)+">"> | |)+">"> | )+"endobj"( < EOL > )> | : WithinStream } TOKEN : { | )+ ("."()*)? ) | ("."()+)) > | |["a"-"f"]|["A"-"F"]){2})+">"> | : WithinLIT | | | " , "%" , "\t" , "\n" , "\r"])+ > | | )*(|)("0"|["1"-"9"]()+)(|)"R" ) > | )*(|)("0"|["1"-"9"]()+)((|)"obj")( < EOL > )> | <#DIGITS : ["0"-"9"] > | <#LOWERLETTER : ["a"-"z"] > | <#UPPERLETTER : ["A"-"Z"] > } MORE : { <~["(", ")"]> } SKIP : { | | } TOKEN : { < END_LITERAL : ")" > | < INNER_START_LIT : "("(~[")","("])*> } // -- Content of Stream isn't check by the JavaCC Parser // -- Will be done by the PDFBox API MORE : { <~[]> } // -- End of Stream, return to the OBJECT Lexical State TOKEN : { : DEFAULT } // -------------------------------------------------- // ---- CROSS REFERENCE TABLE TOKENS // -------------------------------------------------- TOKEN : { < XREF_TAG : "xref" > : CrossRefTable } TOKEN : { < FULL_LINE : (){10} " " (){5} " " ["f"-"u"] > | < SUBSECTION_START : ( ) > | < #SUBSECTION_ENTRIES : ["1"-"9"]()* > | < #FIRST_OBJECT_NUMBER : ()+ > | < TRAILER_TAG: "trailer" > : DEFAULT } // -------------------------------------------------- // ---- TRAILER / Dictionary TOKENS // -------------------------------------------------- TOKEN : { < START_DICTONNARY : "<<" > | < END_DICTONNARY : ">>" > } TOKEN : { < STARTXREF_TAG : "startxref" > : WithinTrailer } TOKEN : { < OBJ_NUMBER: ()+ > | < EOF_TRAILER_TAG : "%%EOF" > : DEFAULT } void indirect_object() : {} { object_content() } void object_content() : {} { (||)* ( ( | {checkNumericLength();} | {checkStringHexLength();} | start_literal() | array_of_object() | {checkNameLength();} | )(|)* | ( dictionary_object() (|)* ( () * { int i = 
// ------------------------------------------------------------------
// object_content() (continued): the action looks up the END_STREAM literal
// (its tokenImage entry without the surrounding quotes) inside the current
// token image and throws a PdfParseException carrying
// ERROR_SYNTAX_STREAM_DELIMITER unless the character just before it is
// 0x0a or 0x0d.
// NOTE(review): if indexOf returns 0 or -1, charAt(i-1) is called with a
// negative index -- confirm that the token definition makes this impossible.
//
// array_of_object(): counts every non-whitespace element, nested arrays and
// dictionaries counting for one, and throws ERROR_SYNTAX_ARRAY_TOO_LONG when
// the count exceeds MAX_ARRAY_ELEMENTS. The check runs after the repetition.
//
// literal(): hand-written JAVACODE production entered through
// start_literal(). It starts with a nesting level of 1, adds one for each
// OBJ_STRING_LIT / INNER_START_LIT token and removes one for each
// END_LITERAL token, in each case only when the last byte of the previous
// token image is not a backslash. When the level reaches 0 the token
// manager is put back in the DEFAULT state and the loop stops. The byte
// length of every token seen is summed and compared with MAX_STRING_LENGTH
// once the loop is over. A token of kind 0 raises a ParseException.
// NOTE(review): getBytes() is called without a charset, so the measured
// lengths depend on the platform default encoding; an empty previous image
// would also index the byte array at -1 -- verify that this cannot happen.
// ------------------------------------------------------------------
token.image.indexOf(tokenImage[END_STREAM].substring(1,tokenImage[END_STREAM].length()-1)); if (!(token.image.charAt(i-1) == 0x0a || token.image.charAt(i-1) == 0x0d)) { throw new PdfParseException("Expected EOL before \"endstream\"", ERROR_SYNTAX_STREAM_DELIMITER); } } (|)* )? ) ) } void array_of_object () : {int counter = 0;} { ( {++counter;} | {++counter; checkNumericLength();} | {++counter;checkStringHexLength();} | array_of_object() {++counter;} | dictionary_object () {++counter;} | {++counter; checkNameLength();} | {++counter;} | {++counter;} | start_literal () {++counter;} | || {/* space isn't an element */ } ) * {if(counter > MAX_ARRAY_ELEMENTS) throw new PdfParseException("Array too long : " + counter, ERROR_SYNTAX_ARRAY_TOO_LONG);} } void start_literal () : {} { literal() } JAVACODE void literal() { Token currentToken = null; int nesting = 1; int literalLength = 0; while(true) { Token previous = getToken(0); currentToken = getToken(1); if (currentToken.kind == 0 ){ throw new ParseException("EOF reach before the end of the literal string."); } literalLength += currentToken.image.getBytes().length; if ( currentToken.kind == OBJ_STRING_LIT ) { jj_consume_token(OBJ_STRING_LIT); if (previous != null && previous.image.getBytes()[previous.image.getBytes().length-1]!='\\') { ++nesting; } } else if ( currentToken.kind == INNER_START_LIT ) { jj_consume_token(INNER_START_LIT); if (previous != null && previous.image.getBytes()[previous.image.getBytes().length-1]!='\\') { ++nesting; } } else if ( currentToken.kind == END_LITERAL ) { if (previous != null && previous.image.getBytes()[previous.image.getBytes().length-1]!='\\') { --nesting; } jj_consume_token(END_LITERAL); if (nesting == 0) { this.token_source.curLexState = PDFParserConstants.DEFAULT; break; } } else { currentToken = getNextToken(); } } if (literalLength > MAX_STRING_LENGTH) { throw new PdfParseException("Literal String too long", ERROR_SYNTAX_LITERAL_TOO_LONG); } } JAVACODE void checkNameLength() throws 
// ------------------------------------------------------------------
// JAVACODE helpers working on the last consumed token ("token"); each one
// does nothing when token is null.
//  - checkNameLength(): throws ERROR_SYNTAX_NAME_TOO_LONG when the image is
//    longer than MAX_NAME_SIZE bytes.
//  - checkMagicNumberLength(): throws ERROR_SYNTAX_HEADER when the image is
//    shorter than 4 bytes.
//  - checkStringHexLength(): throws ERROR_SYNTAX_HEXA_STRING_TOO_LONG when
//    (image length - 2) / 2 exceeds MAX_STRING_LENGTH.
//  - checkNumericLength(): parses the image as a long and requires it to fit
//    in the int range; when it is not a long, parses it as a double and
//    compares it with MAX_POSITIVE_FLOAT / MAX_NEGATIVE_FLOAT. Both failures
//    raise ERROR_SYNTAX_NUMERIC_RANGE.
// NOTE(review): the byte-length checks use getBytes() with the platform
// default charset.
//
// dictionary_object(): tokenNumber is incremented once for a key and once
// for its value, hence the division by 2 before the comparison with
// MAX_DICT_ENTRIES.
// ------------------------------------------------------------------
ParseException { if (token != null && token.image.getBytes().length > MAX_NAME_SIZE) { throw new PdfParseException("Object Name is too long : " + token.image.getBytes().length, ERROR_SYNTAX_NAME_TOO_LONG); } else { // Nothing to do } } JAVACODE void checkMagicNumberLength() throws ParseException { if (token != null && token.image.getBytes().length < 4) { throw new PdfParseException("Not enough bytes after the Header (at least 4 bytes should be present with a value bigger than 127) : " + token.image, ERROR_SYNTAX_HEADER); } else { // Nothing to do } } JAVACODE void checkStringHexLength() throws ParseException { if (token != null && ((token.image.length()-2)/2) > MAX_STRING_LENGTH) { throw new PdfParseException("Object String Hexa is toot long", ERROR_SYNTAX_HEXA_STRING_TOO_LONG); } else { // Nothing to do } } JAVACODE void checkNumericLength() throws ParseException { if (token != null) { String num = token.image; try { long numAsLong = Long.parseLong(num); if (numAsLong > Integer.MAX_VALUE || numAsLong < Integer.MIN_VALUE) { throw new PdfParseException("Numeric is too long or too small: " + num, ERROR_SYNTAX_NUMERIC_RANGE); } } catch (NumberFormatException e) { // may be a real, go to the next check try { Double real = Double.parseDouble(num); if (real > MAX_POSITIVE_FLOAT || real < MAX_NEGATIVE_FLOAT) { throw new PdfParseException("Float is too long or too small: " + num, ERROR_SYNTAX_NUMERIC_RANGE); } } catch (NumberFormatException e2) { // should never happen throw new PdfParseException("Numeric has invalid format " + num, ERROR_SYNTAX_NUMERIC_RANGE); } } } else { // Nothing to do } } void dictionary_object () : {int tokenNumber = 0;} { ( || )* ( ( {++tokenNumber; checkNameLength();} ( || )* ( | {checkNameLength();} | {checkNumericLength();} | {checkStringHexLength();} | start_literal() | array_of_object() | dictionary_object () | | ) {++tokenNumber;} ) ( || )* )* { int entries = (int)(tokenNumber / 2); if (entries > MAX_DICT_ENTRIES) { throw new 
// ------------------------------------------------------------------
// Section productions. Each one wraps its body in try/catch and converts
// ParseException and TokenMgrError into the exception type of its section:
//  - PDF_header()              -> HeaderParseException; it also stores the
//                                 image of the current token in pdfHeader
//                                 and calls checkMagicNumberLength().
//  - PDF_body()                -> BodyParseException; one or more
//                                 indirect_object().
//  - PDF_cross_ref_table()     -> CrossRefParseException.
//  - PDF_trailer_dictionnary() -> TrailerParseException; contains a
//                                 dictionary_object().
//  - PDF_Trailer_XRefOffset()  -> TrailerParseException.
// NOTE(review): the TokenMgrError handlers forward only e.getMessage(), so
// the original error is not kept as the cause.
// ------------------------------------------------------------------
PdfParseException("Too Many Entries In Dictionary : " + entries, ERROR_SYNTAX_TOO_MANY_ENTRIES); } } } void PDF_header() throws HeaderParseException : {} { try { { pdfHeader = token.image;} ( < EOL > ) checkMagicNumberLength() ( < EOL > ) } catch (ParseException e) { throw new HeaderParseException (e); } catch (TokenMgrError e) { throw new HeaderParseException (e.getMessage()); } } void PDF_body() throws BodyParseException : {} { try { ( (|)+ () )? ( indirect_object() (|)* ()? ) + } catch (ParseException e) { throw new BodyParseException (e); } catch (TokenMgrError e) { throw new BodyParseException (e.getMessage()); } } void PDF_cross_ref_table() throws CrossRefParseException : {} { try { ( < EOL > ) ( (( ) * < EOL > ) ( ( ) * ( < EOL > ) ) + )+ } catch (ParseException e) { throw new CrossRefParseException (e); } catch (TokenMgrError e) { throw new CrossRefParseException (e.getMessage()); } } void PDF_trailer_dictionnary() throws TrailerParseException : {} { try { ( ) dictionary_object() ()* } catch (ParseException e) { throw new TrailerParseException (e); } catch (TokenMgrError e) { throw new TrailerParseException (e.getMessage()); } } void PDF_Trailer_XRefOffset() throws TrailerParseException : {} { try { ( ) ( ) ( ) ? } catch (ParseException e) { throw new TrailerParseException (e); } catch (TokenMgrError e) { throw new TrailerParseException (e.getMessage()); } } void PDF_linearized_modified() throws PdfParseException : { int foundXref=0; int foundTrailer=0; } { try { ( PDF_body() ( PDF_cross_ref_table() {foundXref++;} PDF_trailer_dictionnary() {foundTrailer++;} )? 
// ------------------------------------------------------------------
// PDF_linearized_modified() (continued): repeats body / optional xref table
// and trailer dictionary / xref offset one or more times. After the
// repetition, a header matching "PDF-1\.[1-4]" requires at least one xref
// table and one trailer dictionary to have been seen; otherwise a
// TrailerParseException is thrown. A PdfParseException is rethrown
// unchanged; any other ParseException or TokenMgrError is wrapped in a
// TrailerParseException.
//
// PDF(): start production, PDF_header() followed by
// PDF_linearized_modified().
// ------------------------------------------------------------------
PDF_Trailer_XRefOffset() )+ { boolean expectedXRefAndTrailer = pdfHeader.matches("PDF-1\\.[1-4]"); if (expectedXRefAndTrailer && (foundXref <= 0 || foundTrailer <= 0)) { throw new TrailerParseException ("Missing Xref table or Trailer keyword in the given PDF."); } } } catch (PdfParseException e) { throw e; } catch (ParseException e) { throw new TrailerParseException (e); } catch (TokenMgrError e) { throw new TrailerParseException (e.getMessage()); } } // ------------------------------------------- // ---- The PDF grammar productions start here // ------------------------------------------- void PDF() throws PdfParseException : {} { PDF_header() PDF_linearized_modified() }