QuickParser
presented in
* PDFBOX-1104 by Jeremy Villalobos.
*/
public class COSParser extends BaseParser implements ICOSParser
{
// Marker strings and version strings for PDF and FDF files.
// NOTE(review): their use is outside this excerpt; presumably the defaults apply when a header
// carries no version - confirm against the header parsing code.
private static final String PDF_HEADER = "%PDF-";
private static final String FDF_HEADER = "%FDF-";
private static final String PDF_DEFAULT_VERSION = "1.4";
private static final String FDF_DEFAULT_VERSION = "1.0";
// 'startxref' keyword; searched for backwards in the trailing bytes of the file.
private static final char[] STARTXREF = { 's','t','a','r','t','x','r','e','f' };
// Keywords spelled with the single-byte constants E, N, D, ... which are declared outside this excerpt.
private static final byte[] ENDSTREAM = { E, N, D, S, T, R, E, A, M };
private static final byte[] ENDOBJ = { E, N, D, O, B, J };
// NOTE(review): not referenced in this excerpt; meaning to be confirmed at the usage site.
protected static final long MINIMUM_SEARCH_OFFSET = 6;
// Size in bytes of the per-parser buffer strmBuf below.
private static final int STRMBUFLEN = 2048;
private final byte[] strmBuf = new byte[ STRMBUFLEN ];
private AccessPermission accessPermission;
// Decryption related settings; null/empty until set (setters are outside this excerpt).
private InputStream keyStoreInputStream = null;
// squid:S2068 looks like the Sonar "hard-coded credentials" rule; the default here is only the empty password.
@SuppressWarnings({"squid:S2068"})
private String password = "";
private String keyAlias = null;
/**
* Name of the system property which defines how many trailing bytes of the file are searched
* for the %%EOF marker.
* Useful if there are additional characters after %%EOF within the PDF.
*/
public static final String SYSPROP_EOFLOOKUPRANGE =
"org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";
/**
* Default number of trailing bytes to read when looking for the EOF marker.
*/
private static final int DEFAULT_TRAIL_BYTECOUNT = 2048;
/**
* EOF-marker ('%%EOF').
*/
private static final char[] EOF_MARKER = { '%', '%', 'E', 'O', 'F' };
/**
* obj-marker ('obj').
*/
private static final char[] OBJ_MARKER = { 'o', 'b', 'j' };
/**
* Length of the file; used to locate the trailing bytes that are searched for 'startxref'.
*/
private final long fileLen;
/**
* Whether the parser uses its auto healing capacity (lenient mode). Enabled by default.
*/
private boolean isLenient = true;
// Once true, setLenient(boolean) refuses to change the leniency flag.
protected boolean initialParseDone = false;
// Set to true when the trailer had to be rebuilt by the brute force parser.
private boolean trailerWasRebuild = false;
// Stays null until getBruteForceParser() is called; that getter creates the instance on demand.
private BruteForceParser bruteForceParser = null;
private PDEncryption encryption = null;
private final MapWe check that new value is at least 16. However for practical use cases this value should not be lower than * 1000; even 2000 was found to not be enough in some cases where some trailing garbage like HTML snippets followed * the EOF marker.
* ** In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined this value will be set on initialization but * can be overwritten later. *
* * @param byteCount number of trailing bytes */ public void setEOFLookupRange(int byteCount) { if (byteCount > 15) { readTrailBytes = byteCount; } } /** * Read the trailer information and provide a COSDictionary containing the trailer information. * * @return a COSDictionary containing the trailer information * @throws IOException if something went wrong */ protected COSDictionary retrieveTrailer() throws IOException { COSDictionary trailer = null; boolean rebuildTrailer = false; try { // parse startxref // TODO FDF files don't have a startxref value, so that rebuildTrailer is triggered long startXRefOffset = getStartxrefOffset(); if (startXRefOffset > -1) { XrefParser xrefParser = new XrefParser(this); trailer = xrefParser.parseXref(document, startXRefOffset); xrefTable.putAll(xrefParser.getXrefTable()); } else { rebuildTrailer = isLenient(); } } catch (IOException exception) { if (isLenient()) { rebuildTrailer = true; } else { throw exception; } } // check if the trailer contains a Root object if (trailer != null && trailer.getItem(COSName.ROOT) == null) { rebuildTrailer = isLenient(); } if (rebuildTrailer) { // reset cross reference table xrefTable.clear(); trailer = getBruteForceParser().rebuildTrailer(xrefTable); trailerWasRebuild = true; } else { // prepare decryption if necessary prepareDecryption(); // don't use the getter as it creates an instance of BruteForceParser if (bruteForceParser != null && bruteForceParser.bfSearchTriggered()) { getBruteForceParser().bfSearchForObjStreams(xrefTable); } } return trailer; } /** * Looks for and parses startxref. We first look for last '%%EOF' marker (within last * {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via {@link #setEOFLookupRange(int)}) and go back to find *startxref
.
*
* @return the offset of StartXref
* @throws IOException If something went wrong.
*/
private long getStartxrefOffset() throws IOException
{
    // never try to read more than the file contains
    final int tailLength;
    if (fileLen < readTrailBytes)
    {
        tailLength = (int) fileLen;
    }
    else
    {
        tailLength = readTrailBytes;
    }
    final long tailStart = fileLen - tailLength;

    byte[] tail;
    try
    {
        // load the last tailLength bytes of the file
        tail = new byte[tailLength];
        source.seek(tailStart);
        int filled = 0;
        while (filled < tailLength)
        {
            final int chunk = source.read(tail, filled, tailLength - filled);
            // a read that delivers nothing would loop forever (this should never happen)
            if (chunk < 1)
            {
                throw new IOException("No more bytes to read for trailing buffer, but expected: "
                        + (tailLength - filled));
            }
            filled += chunk;
        }
    }
    finally
    {
        // always go back to the start of the file, even if reading failed
        source.seek(0);
    }

    // locate the last '%%EOF'
    int searchEnd = lastIndexOf(EOF_MARKER, tail, tail.length);
    if (searchEnd < 0)
    {
        if (!isLenient)
        {
            throw new IOException("Missing end of file marker '" + new String(EOF_MARKER) + "'");
        }
        // in lenient mode the '%%EOF' isn't needed: search the whole buffer
        searchEnd = tail.length;
        LOG.debug("Missing end of file marker '{}'", new String(EOF_MARKER));
    }

    // locate the last startxref in front of the EOF marker
    final int startxrefPos = lastIndexOf(STARTXREF, tail, searchEnd);
    if (startxrefPos < 0)
    {
        throw new IOException("Missing 'startxref' marker.");
    }
    // translate the buffer offset into a file offset
    return tailStart + startxrefPos;
}
/**
 * Searches the last appearance of a pattern within a buffer. Only the bytes in front of
 * {@code endOff} are considered; the search goes back until offset 0.
 *
 * <p>Every candidate position is verified completely, so an occurrence is also found if it is
 * directly followed by bytes which look like the end of the pattern (e.g. '%%EOF' within
 * '%%EOFEOF'). Bytes are compared as unsigned values.</p>
 *
 * @param pattern pattern to search for
 * @param buf buffer to search pattern in
 * @param endOff offset (exclusive) where lookup starts at
 *
 * @return start offset of pattern within buffer or <code>-1</code> if pattern could not be found
 */
private int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff)
{
    final int patternLength = pattern.length;
    // the last candidate is the one whose final character sits right in front of endOff
    for (int start = endOff - patternLength; start >= 0; start--)
    {
        int matched = 0;
        // mask the byte: a plain byte is sign-extended and would never equal a char above 127
        while (matched < patternLength && (buf[start + matched] & 0xFF) == pattern[matched])
        {
            matched++;
        }
        if (matched == patternLength)
        {
            // whole pattern matched
            return start;
        }
    }
    return -1;
}
/**
 * Tells whether the parser runs in lenient mode, i.e. whether its auto healing capacities
 * are used.
 *
 * @return true if parser is lenient
 */
public boolean isLenient()
{
    return this.isLenient;
}
/**
 * Switches the lenient mode of the parser on or off.
 *
 * The flag may only be changed as long as the initial parsing has not been done; afterwards
 * an IllegalArgumentException is thrown.
 *
 * @param lenient try to handle malformed PDFs.
 */
protected void setLenient(boolean lenient)
{
    if (!initialParseDone)
    {
        this.isLenient = lenient;
        return;
    }
    throw new IllegalArgumentException("Cannot change leniency after parsing");
}
/**
 * Parses the object referenced by the given COSObject and restores the read position of the
 * source afterwards (only if that position was greater than 0).
 *
 * @param obj the reference to be resolved
 * @return the parsed object, marked as indirect and tagged with its key, or null
 * @throws IOException if parsing the object fails
 */
@Override
public COSBase dereferenceCOSObject(COSObject obj) throws IOException
{
    // remember where the caller was reading
    final long positionBefore = source.getPosition();
    final COSObjectKey objectKey = obj.getKey();
    final COSBase resolved = parseObjectDynamically(objectKey, false);
    if (resolved != null)
    {
        resolved.setDirect(false);
        resolved.setKey(objectKey);
    }
    if (positionBefore > 0)
    {
        source.seek(positionBefore);
    }
    return resolved;
}
/**
 * Creates a view on the underlying source by delegating to its createView method.
 *
 * @param startPosition start position passed on to the source
 * @param streamLength length passed on to the source
 * @return the view created by the source
 * @throws IOException if the source fails to create the view
 */
@Override
public RandomAccessReadView createRandomAccessReadView(long startPosition, long streamLength)
        throws IOException
{
    final RandomAccessReadView view = source.createView(startPosition, streamLength);
    return view;
}
/**
 * Parses the object which belongs to the given key, unless the document pool already holds a
 * parsed object for it.
 *
 * @param objKey key of object to be parsed
 * @param requireExistingNotCompressedObj if <code>true</code> the object to be parsed must be
 * defined in xref (comment: null objects may be missing from xref) and it must not be a
 * compressed object within object stream (this is used to circumvent being stuck in a loop
 * in a malicious PDF)
 *
 * @return the parsed object (which is also added to document object)
 *
 * @throws IOException If an IO error occurs.
 */
private synchronized COSBase parseObjectDynamically(COSObjectKey objKey,
        boolean requireExistingNotCompressedObj) throws IOException
{
    final COSObject pooledObject = document.getObjectFromPool(objKey);
    if (!pooledObject.isObjectNull())
    {
        // nothing to parse, the pool already holds the object
        return pooledObject.getObject();
    }

    final Long location = getObjectOffset(objKey, requireExistingNotCompressedObj);
    COSBase resolved = null;
    if (location != null)
    {
        // a positive value is a file offset; otherwise the negated value is the object number
        // of the object stream containing the object, which was not parsed so far
        resolved = location > 0
                ? parseFileObject(location, objKey)
                : parseObjectStreamObject(-location, objKey);
    }

    if (resolved == null || resolved instanceof COSNull)
    {
        // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
        // or some other issue with dereferencing
        // remove parser to avoid endless recursion
        pooledObject.setToNull();
    }
    return resolved;
}
/**
 * Looks up where the object with the given key is stored.
 *
 * @param objKey key of the object to look up
 * @param requireExistingNotCompressedObj if true, a missing entry or a value which is not
 * greater than 0 leads to an IOException
 * @return the value from the xref table (or, in lenient mode, from the brute force search),
 * may be null
 * @throws IOException if the object is required to be an existing, not compressed object
 * but isn't
 */
private Long getObjectOffset(COSObjectKey objKey, boolean requireExistingNotCompressedObj)
        throws IOException
{
    // the xref table holds the offset or the object number of the containing object stream
    Long location = document.getXrefTable().get(objKey);

    if (location == null && isLenient)
    {
        // maybe something is wrong with the xref table -> fall back to the brute force search
        final Long recovered = getBruteForceParser().getBFCOSObjectOffsets().get(objKey);
        if (recovered != null)
        {
            LOG.debug("Set missing offset {} for object {}", recovered, objKey);
            document.getXrefTable().put(objKey, recovered);
        }
        location = recovered;
    }

    // test to circumvent loops with broken documents
    final boolean plainFileOffset = location != null && location > 0;
    if (requireExistingNotCompressedObj && !plainFileOffset)
    {
        throw new IOException("Object must be defined and must not be compressed object: "
                + objKey.getNumber() + ":" + objKey.getGeneration());
    }
    return location;
}
/**
 * Parses the indirect object stored at the given file offset.
 *
 * @param objOffset offset within the source where the object starts
 * @param objKey the key the object at that offset is expected to have
 * @return the parsed object; a COSStream if the dictionary is followed by a stream
 * @throws IOException if the object found at the offset has another key, if a stream is not
 * preceded by a dictionary, or - in non lenient mode - if the object isn't terminated by
 * 'endobj'
 */
private COSBase parseFileObject(Long objOffset, final COSObjectKey objKey)
        throws IOException
{
    // position the source at the start of the object
    source.seek(objOffset);

    // header of an indirect object: object number, generation number, 'obj'
    final long actualNumber = readObjectNumber();
    final int actualGeneration = readGenerationNumber();
    readObjectMarker();

    // the header must describe the object we are looking for
    final boolean keyMatches = actualNumber == objKey.getNumber()
            && actualGeneration == objKey.getGeneration();
    if (!keyMatches)
    {
        throw new IOException("XREF for " + objKey.getNumber() + ":"
                + objKey.getGeneration() + " points to wrong object: " + actualNumber
                + ":" + actualGeneration + " at offset " + objOffset);
    }

    skipSpaces();
    COSBase result = parseDirObject();
    if (result != null)
    {
        result.setDirect(false);
        result.setKey(objKey);
    }

    String closingToken = readString();
    if (closingToken.equals(STREAM_STRING))
    {
        // step back in front of the keyword so that the stream parser sees it again
        source.rewind(closingToken.getBytes(StandardCharsets.ISO_8859_1).length);
        if (!(result instanceof COSDictionary))
        {
            // this is not legal
            // the combination of a dict and the stream/endstream
            // forms a complete stream object
            throw new IOException("Stream not preceded by dictionary (offset: "
                    + objOffset + ").");
        }
        final COSStream stream = parseCOSStream((COSDictionary) result);
        if (securityHandler != null)
        {
            securityHandler.decryptStream(stream, objKey.getNumber(), objKey.getGeneration());
        }
        result = stream;

        skipSpaces();
        closingToken = readLine();
        // some files carry a second 'endstream' in front of endobj
        if (closingToken.startsWith(ENDSTREAM_STRING) && !closingToken.startsWith(ENDOBJ_STRING))
        {
            // 9 is presumably the length of the 'endstream' keyword; keep whatever follows it
            closingToken = closingToken.substring(9).trim();
            if (closingToken.isEmpty())
            {
                // the extra endstream line held nothing else, continue with the next line
                closingToken = readLine();
            }
        }
    }
    else if (securityHandler != null)
    {
        securityHandler.decrypt(result, objKey.getNumber(), objKey.getGeneration());
    }

    if (closingToken.startsWith(ENDOBJ_STRING))
    {
        return result;
    }
    if (!isLenient)
    {
        throw new IOException("Object (" + actualNumber + ":" + actualGeneration
                + ") at offset " + objOffset
                + " does not end with 'endobj' but with '" + closingToken + "'");
    }
    // lenient mode: tolerate the missing 'endobj' but leave a trace in the log
    LOG.warn("Object ({}:{}) at offset {} does not end with 'endobj' but with '{}'",
            actualNumber, actualGeneration, objOffset, closingToken);
    return result;
}
/**
* Parse the object with the given key from the object stream with the given number.
*
* @param objstmObjNr the number of the offset stream
* @param key the key of the object to be parsed
* @return the parsed object
* @throws IOException if something went wrong when parsing the object
*/
protected COSBase parseObjectStreamObject(long objstmObjNr, COSObjectKey key) throws IOException
{
Map