public class PDFParserConfig extends Object implements Serializable
Modifier and Type | Class and Description |
---|---|
static class |
PDFParserConfig.OCR_RENDERING_STRATEGY |
static class |
PDFParserConfig.OCR_STRATEGY |
Constructor and Description |
---|
PDFParserConfig() |
Modifier and Type | Method and Description |
---|---|
PDFParserConfig |
cloneAndUpdate(PDFParserConfig updates) |
void |
configure(org.apache.tika.parser.pdf.PDF2XHTML pdf2XHTML)
Configures the given pdf2XHTML.
|
boolean |
equals(Object o) |
AccessChecker |
getAccessChecker() |
Float |
getAverageCharTolerance() |
Float |
getDropThreshold() |
long |
getMaxMainMemoryBytes()
The maximum amount of memory to use when loading a pdf into a PDDocument.
|
int |
getOcrDPI()
Dots per inch used to render the page image for OCR
|
String |
getOcrImageFormatName()
String representation of the image format used to render
the page image for OCR (examples: png, tiff, jpeg)
|
float |
getOcrImageQuality()
Image quality used to render the page image for OCR.
|
org.apache.pdfbox.rendering.ImageType |
getOcrImageType()
Image type used to render the page image for OCR.
|
PDFParserConfig.OCR_RENDERING_STRATEGY |
getOcrRenderingStrategy() |
PDFParserConfig.OCR_STRATEGY |
getOcrStrategy() |
Float |
getSpacingTolerance() |
int |
hashCode() |
boolean |
isCatchIntermediateIOExceptions()
|
boolean |
isDetectAngles() |
boolean |
isEnableAutoSpace() |
boolean |
isExtractAcroFormContent() |
boolean |
isExtractActions() |
boolean |
isExtractAnnotationText() |
boolean |
isExtractBookmarksText() |
boolean |
isExtractFontNames() |
boolean |
isExtractInlineImages() |
boolean |
isExtractMarkedContent() |
boolean |
isExtractUniqueInlineImagesOnly() |
boolean |
isIfXFAExtractOnlyXFA() |
boolean |
isSetKCMS() |
boolean |
isSortByPosition() |
boolean |
isSuppressDuplicateOverlappingText() |
void |
setAccessChecker(AccessChecker accessChecker) |
void |
setAverageCharTolerance(Float averageCharTolerance)
See
PDFTextStripper.setAverageCharTolerance(float) |
void |
setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions)
The PDFBox parser will throw an IOException if there is
a problem with a stream.
|
void |
setDetectAngles(boolean detectAngles) |
void |
setDropThreshold(Float dropThreshold)
See
PDFTextStripper.setDropThreshold(float) |
void |
setEnableAutoSpace(boolean enableAutoSpace)
If true (the default), the parser should estimate
where spaces should be inserted between words.
|
void |
setExtractAcroFormContent(boolean extractAcroFormContent)
If true (the default), extract content from AcroForms
at the end of the document.
|
void |
setExtractActions(boolean v)
Whether or not to extract PDActions from the file.
|
void |
setExtractAnnotationText(boolean extractAnnotationText)
If true (the default), text in annotations will be
extracted.
|
void |
setExtractBookmarksText(boolean extractBookmarksText)
If true, extract bookmarks (document outline) text.
|
void |
setExtractFontNames(boolean extractFontNames)
Extract font names into a metadata field
|
void |
setExtractInlineImages(boolean extractInlineImages)
If true, extract the literal inline embedded OBXImages. |
void |
setExtractMarkedContent(boolean extractMarkedContent)
If the PDF contains marked content, try to extract text and its marked structure.
|
void |
setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly)
Multiple pages within a PDF file might refer to the same underlying image.
|
void |
setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA)
If false (the default), extract content from the full PDF
as well as the XFA form.
|
void |
setMaxMainMemoryBytes(long maxMainMemoryBytes) |
void |
setOcrDPI(int ocrDPI)
Dots per inch used to render the page image for OCR.
|
void |
setOcrImageFormatName(String ocrImageFormatName) |
void |
setOcrImageQuality(float ocrImageQuality)
Image quality used to render the page image for OCR.
|
void |
setOcrImageType(org.apache.pdfbox.rendering.ImageType ocrImageType)
Image type used to render the page image for OCR.
|
void |
setOcrImageType(String ocrImageTypeString)
Image type used to render the page image for OCR.
|
void |
setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY ocrRenderingStrategy)
When rendering the page for OCR, do you want to include the rendering of the electronic text,
ALL, or do you only want to run OCR on the images and vector graphics (NO_TEXT)?
|
void |
setOcrRenderingStrategy(String ocrRenderingStrategyString) |
void |
setOcrStrategy(PDFParserConfig.OCR_STRATEGY ocrStrategy)
Which strategy to use for OCR
|
void |
setOcrStrategy(String ocrStrategyString)
Which strategy to use for OCR
|
void |
setSetKCMS(boolean setKCMS)
Whether to call
System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider"). |
void |
setSortByPosition(boolean sortByPosition)
If true, sort text tokens by their x/y position
before extracting text.
|
void |
setSpacingTolerance(Float spacingTolerance)
See
PDFTextStripper.setSpacingTolerance(float) |
void |
setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText)
If true, the parser should try to remove duplicated
text over the same region.
|
String |
toString() |
public boolean isExtractMarkedContent()
public void setExtractMarkedContent(boolean extractMarkedContent)
extractMarkedContent
- public void configure(org.apache.tika.parser.pdf.PDF2XHTML pdf2XHTML)
pdf2XHTML
- public boolean isExtractAcroFormContent()
setExtractAcroFormContent(boolean)
public void setExtractAcroFormContent(boolean extractAcroFormContent)
extractAcroFormContent
- public boolean isIfXFAExtractOnlyXFA()
setIfXFAExtractOnlyXFA(boolean)
public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA)
ifXFAExtractOnlyXFA
- public boolean isExtractBookmarksText()
setExtractBookmarksText(boolean)
public void setExtractBookmarksText(boolean extractBookmarksText)
If true, extract bookmarks (document outline) text.
extractBookmarksText
- public boolean isExtractFontNames()
public void setExtractFontNames(boolean extractFontNames)
extractFontNames
- public boolean isExtractInlineImages()
setExtractInlineImages(boolean)
public void setExtractInlineImages(boolean extractInlineImages)
If true, extract the literal inline embedded OBXImages.
Beware: some PDF documents of modest size (~4MB) can contain
thousands of embedded images totaling > 2.5 GB. Also, at least as of PDFBox 1.8.5,
there can be surprisingly large memory consumption and/or out of memory errors.
Along the same lines, note that this does not extract "logical" images. Some PDF writers
break up a single logical image into hundreds of little images. With this option set to
true, you might get those hundreds of little images.
NOTE ALSO: this extracts the raw images without clipping, rotation, masks, color
inversion, etc. The images that this extracts may look nothing like what a human
would expect given the appearance of the PDF.
Set to true only with the greatest caution.
The default is false.
extractInlineImages
- setExtractUniqueInlineImagesOnly(boolean)
public boolean isExtractUniqueInlineImagesOnly()
public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly)
If extractUniqueInlineImagesOnly is set to false, the
parser will call the EmbeddedExtractor each time the image appears on a page.
This might be desired for some use cases. However, to avoid duplication of
extracted images, set this to true. The default is true.
Note that uniqueness is determined only by the underlying PDF COSObject id, not by
file hash or similar equality metric.
If the PDF actually contains multiple copies of the same image
-- all with different object ids -- then all images will be extracted.
For this parameter to have any effect, extractInlineImages must be
set to true.
Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting of this parameter, the extractor will only pull out one copy of each image per page. This parameter tries to capture uniqueness across the entire document.
extractUniqueInlineImagesOnly
- public boolean isEnableAutoSpace()
setEnableAutoSpace(boolean)
public void setEnableAutoSpace(boolean enableAutoSpace)
public boolean isSuppressDuplicateOverlappingText()
public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText)
public boolean isExtractAnnotationText()
setExtractAnnotationText(boolean)
public void setExtractAnnotationText(boolean extractAnnotationText)
public boolean isSortByPosition()
setSortByPosition(boolean)
public void setSortByPosition(boolean sortByPosition)
public Float getAverageCharTolerance()
setAverageCharTolerance(Float)
public void setAverageCharTolerance(Float averageCharTolerance)
PDFTextStripper.setAverageCharTolerance(float)
public Float getSpacingTolerance()
setSpacingTolerance(Float)
public void setSpacingTolerance(Float spacingTolerance)
PDFTextStripper.setSpacingTolerance(float)
public Float getDropThreshold()
setDropThreshold(Float)
public void setDropThreshold(Float dropThreshold)
PDFTextStripper.setDropThreshold(float)
public AccessChecker getAccessChecker()
public void setAccessChecker(AccessChecker accessChecker)
public boolean isCatchIntermediateIOExceptions()
public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions)
If set to true,
Tika's PDFParser will catch these exceptions and try to parse
the rest of the document. After the parse is completed,
Tika's PDFParser will throw the first caught exception.
catchIntermediateIOExceptions
- public PDFParserConfig.OCR_STRATEGY getOcrStrategy()
public void setOcrStrategy(PDFParserConfig.OCR_STRATEGY ocrStrategy)
ocrStrategy
- public void setOcrStrategy(String ocrStrategyString)
ocrStrategyString
- public PDFParserConfig.OCR_RENDERING_STRATEGY getOcrRenderingStrategy()
public void setOcrRenderingStrategy(String ocrRenderingStrategyString)
public void setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY ocrRenderingStrategy)
ocrRenderingStrategy
- public String getOcrImageFormatName()
public void setOcrImageFormatName(String ocrImageFormatName)
ocrImageFormatName
- name of image format used to render
page image
getOcrImageFormatName()
public org.apache.pdfbox.rendering.ImageType getOcrImageType()
setOcrImageType(ImageType)
public void setOcrImageType(org.apache.pdfbox.rendering.ImageType ocrImageType)
ocrImageType
- public void setOcrImageType(String ocrImageTypeString)
setOcrImageType(ImageType)
public int getOcrDPI()
public void setOcrDPI(int ocrDPI)
ocrDPI
- public float getOcrImageQuality()
public void setOcrImageQuality(float ocrImageQuality)
public boolean isExtractActions()
setExtractActions(boolean)
public void setExtractActions(boolean v)
v
- public long getMaxMainMemoryBytes()
public void setMaxMainMemoryBytes(long maxMainMemoryBytes)
public boolean isSetKCMS()
public void setSetKCMS(boolean setKCMS)
Whether to call System.setProperty("sun.java2d.cmm",
"sun.java2d.cmm.kcms.KcmsServiceProvider").
KCMS is the unmaintained, legacy provider and is far faster than the newer replacement.
However, there are stability and security risks with using the unmaintained legacy provider.
Note, of course, that this is not thread safe. If the value is false
in your first thread, and the second thread changes this to true,
the system property in the first thread will now be true.
Default is false.
setKCMS
- whether or not to set KCMS
public boolean isDetectAngles()
public void setDetectAngles(boolean detectAngles)
public PDFParserConfig cloneAndUpdate(PDFParserConfig updates) throws TikaException
TikaException
Copyright © 2007–2021 The Apache Software Foundation. All rights reserved.