Package org.apache.tika.eval
Class AbstractProfiler
- java.lang.Object
-
- org.apache.tika.batch.FileResourceConsumer
-
- org.apache.tika.eval.AbstractProfiler
-
- All Implemented Interfaces:
Callable<IFileProcessorFutureResult>
- Direct Known Subclasses:
ExtractComparer, ExtractProfiler
public abstract class AbstractProfiler extends FileResourceConsumer
-
-
Nested Class Summary
Nested Classes Modifier and Type Class Description static class
AbstractProfiler.EXCEPTION_TYPE
static class
AbstractProfiler.PARSE_ERROR_TYPE
If information was gathered from the log file about a parse error
-
Field Summary
Fields Modifier and Type Field Description static String
FALSE
protected static AtomicInteger
ID
static TableInfo
MIME_TABLE
static TableInfo
REF_EXTRACT_EXCEPTION_TYPES
static TableInfo
REF_PARSE_ERROR_TYPES
static TableInfo
REF_PARSE_EXCEPTION_TYPES
static String
TRUE
protected IDBWriter
writer
-
Fields inherited from class org.apache.tika.batch.FileResourceConsumer
ELAPSED_MILLIS, IO_IS, IO_OS, OOM, PARSE_ERR, PARSE_EX, TIMED_OUT
-
-
Constructor Summary
Constructors Constructor Description AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue, IDBWriter writer)
-
Method Summary
All Methods Static Methods Instance Methods Concrete Methods Modifier and Type Method Description protected Map<Class,Object>
calcTextStats(ContentTags contentTags)
void
closeWriter()
protected static ContentTags
getContent(org.apache.tika.eval.EvalFilePaths evalFilePaths, Metadata metadata)
protected long
getFileLength(Path p)
protected org.apache.tika.eval.EvalFilePaths
getPathsFromExtractCrawl(Metadata metadata, Path extracts)
protected org.apache.tika.eval.EvalFilePaths
getPathsFromSrcCrawl(Metadata metadata, Path srcDir, Path extracts)
protected long
getSourceFileLength(org.apache.tika.eval.EvalFilePaths fps, List<Metadata> metadataList)
static void
loadCommonTokens(Path p, String defaultLangCode)
void
setMaxContentLength(int maxContentLength)
Truncate the content string to this length if it is longer than this length.
void
setMaxContentLengthForLangId(int maxContentLengthForLangId)
Truncate the content string to this length, for language identification, if it is longer than this length.
void
setMaxTokens(int maxTokens)
Add a LimitTokenCountFilterFactory if maxTokens > -1.
protected static String
truncateContent(ContentTags contentTags, int maxLength, Map<Cols,String> data)
Get the content and record in the data map, under Cols.CONTENT_TRUNCATED_AT_MAX_LEN, whether the string was truncated.
protected void
writeContentData(String fileId, Map<Class,Object> textStats, TableInfo contentsTable)
Checks to see if metadata is null or content is empty (null or only whitespace).
protected void
writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable)
protected void
writeExtractException(TableInfo extractExceptionTable, String containerId, String filePath, ExtractReaderException.TYPE type)
protected void
writeProfileData(org.apache.tika.eval.EvalFilePaths fps, int i, ContentTags contentTags, Metadata m, String fileId, String containerId, List<Integer> numAttachments, TableInfo profileTable)
-
Methods inherited from class org.apache.tika.batch.FileResourceConsumer
call, checkForTimedOutMillis, close, flushAndClose, getCurrentFile, getNumHandledExceptions, getNumResourcesConsumed, getXMLifiedLogMsg, getXMLifiedLogMsg, incrementHandledExceptions, isStillActive, parse, pleaseShutdown, processFileResource
-
-
-
-
Field Detail
-
REF_EXTRACT_EXCEPTION_TYPES
public static TableInfo REF_EXTRACT_EXCEPTION_TYPES
-
REF_PARSE_ERROR_TYPES
public static TableInfo REF_PARSE_ERROR_TYPES
-
REF_PARSE_EXCEPTION_TYPES
public static TableInfo REF_PARSE_EXCEPTION_TYPES
-
TRUE
public static final String TRUE
-
FALSE
public static final String FALSE
-
ID
protected static final AtomicInteger ID
-
MIME_TABLE
public static TableInfo MIME_TABLE
-
writer
protected IDBWriter writer
-
-
Constructor Detail
-
AbstractProfiler
public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue, IDBWriter writer)
-
-
Method Detail
-
loadCommonTokens
public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException
- Parameters:
p
- path to the common_tokens directory. If this is null, try to load from the classpath.
defaultLangCode
- this is the language code to use if a common_words list doesn't exist for the detected language; can be null
- Throws:
IOException
-
setMaxContentLength
public void setMaxContentLength(int maxContentLength)
Truncate the content string to this length if it is longer than this length.
- Parameters:
maxContentLength
-
-
setMaxContentLengthForLangId
public void setMaxContentLengthForLangId(int maxContentLengthForLangId)
Truncate the content string to this length, for language identification, if it is longer than this length.
- Parameters:
maxContentLengthForLangId
-
-
setMaxTokens
public void setMaxTokens(int maxTokens)
Add a LimitTokenCountFilterFactory if maxTokens > -1.
- Parameters:
maxTokens
-
-
writeExtractException
protected void writeExtractException(TableInfo extractExceptionTable, String containerId, String filePath, ExtractReaderException.TYPE type) throws IOException
- Throws:
IOException
-
writeProfileData
protected void writeProfileData(org.apache.tika.eval.EvalFilePaths fps, int i, ContentTags contentTags, Metadata m, String fileId, String containerId, List<Integer> numAttachments, TableInfo profileTable)
-
writeExceptionData
protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable)
-
calcTextStats
protected Map<Class,Object> calcTextStats(ContentTags contentTags)
-
writeContentData
protected void writeContentData(String fileId, Map<Class,Object> textStats, TableInfo contentsTable) throws IOException
Checks to see if metadata is null or content is empty (null or only whitespace). If any of these is true, then this does no processing, and the fileId is not entered into the content table.
- Parameters:
fileId -
textStats -
contentsTable -
- Throws:
IOException
-
truncateContent
protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols,String> data)
Get the content and record in the data map, under Cols.CONTENT_TRUNCATED_AT_MAX_LEN, whether the string was truncated.
- Parameters:
contentTags -
maxLength -
data -
- Returns:
-
getContent
protected static ContentTags getContent(org.apache.tika.eval.EvalFilePaths evalFilePaths, Metadata metadata)
-
closeWriter
public void closeWriter() throws IOException
- Throws:
IOException
-
getPathsFromExtractCrawl
protected org.apache.tika.eval.EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, Path extracts)
- Parameters:
metadata
-
extracts -
- Returns:
- evalfilepaths for files if crawling an extract directory
-
getPathsFromSrcCrawl
protected org.apache.tika.eval.EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, Path extracts)
-
getSourceFileLength
protected long getSourceFileLength(org.apache.tika.eval.EvalFilePaths fps, List<Metadata> metadataList)
-
getFileLength
protected long getFileLength(Path p)
-
-