Class AbstractProfiler

java.lang.Object
org.apache.tika.batch.FileResourceConsumer
org.apache.tika.eval.app.AbstractProfiler
All Implemented Interfaces:
Callable<IFileProcessorFutureResult>
Direct Known Subclasses:
ExtractComparer, ExtractProfiler, FileProfiler

public abstract class AbstractProfiler extends FileResourceConsumer
  • Field Details

    • TRUE

      public static final String TRUE
    • FALSE

      public static final String FALSE
    • ID

      protected static final AtomicInteger ID
    • REF_EXTRACT_EXCEPTION_TYPES

      public static TableInfo REF_EXTRACT_EXCEPTION_TYPES
    • REF_PARSE_ERROR_TYPES

      public static TableInfo REF_PARSE_ERROR_TYPES
    • REF_PARSE_EXCEPTION_TYPES

      public static TableInfo REF_PARSE_EXCEPTION_TYPES
    • MIME_TABLE

      public static TableInfo MIME_TABLE
    • writer

      protected IDBWriter writer
  • Constructor Details

  • Method Details

    • loadCommonTokens

      public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException
      Parameters:
      p - path to the common_tokens directory. If this is null, this tries to load from the classpath
      defaultLangCode - this is the language code to use if a common_words list doesn't exist for the detected language; can be null
      Throws:
      IOException
    • truncateContent

      protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols,String> data)
      Gets the content and records in data, under Cols.CONTENT_TRUNCATED_AT_MAX_LEN, whether the string was truncated
      Parameters:
      contentTags -
      maxLength -
      data -
      Returns:
    • getContent

      protected static ContentTags getContent(org.apache.tika.eval.app.EvalFilePaths evalFilePaths, Metadata metadata)
    • setMaxContentLength

      public void setMaxContentLength(int maxContentLength)
      Truncate the content string to this length if it is longer than this length
      Parameters:
      maxContentLength -
    • setMaxContentLengthForLangId

      public void setMaxContentLengthForLangId(int maxContentLengthForLangId)
      For lang id, truncate the content string to this length if it is longer than this length
      Parameters:
      maxContentLengthForLangId -
    • setMaxTokens

      public void setMaxTokens(int maxTokens)
      Add a LimitTokenCountFilterFactory if maxTokens > -1
      Parameters:
      maxTokens -
    • writeExtractException

      protected void writeExtractException(TableInfo extractExceptionTable, String containerId, String filePath, ExtractReaderException.TYPE type) throws IOException
      Throws:
      IOException
    • writeProfileData

      protected void writeProfileData(org.apache.tika.eval.app.EvalFilePaths fps, int i, ContentTags contentTags, Metadata m, String fileId, String containerId, List<Integer> numAttachments, TableInfo profileTable)
    • writeExceptionData

      protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable)
    • calcTextStats

      protected Map<Class,Object> calcTextStats(ContentTags contentTags)
    • writeContentData

      protected void writeContentData(String fileId, Map<Class,Object> textStats, TableInfo contentsTable) throws IOException
      Checks to see if metadata is null or content is empty (null or only whitespace). If either is the case, this does no processing, and the fileId is not entered into the content table.
      Parameters:
      fileId -
      textStats -
      contentsTable -
      Throws:
      IOException
    • closeWriter

      public void closeWriter() throws IOException
      Throws:
      IOException
    • getPathsFromExtractCrawl

      protected org.apache.tika.eval.app.EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, Path extracts)
      Parameters:
      metadata -
      extracts -
      Returns:
      EvalFilePaths for files if crawling an extract directory
    • getPathsFromSrcCrawl

      protected org.apache.tika.eval.app.EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, Path extracts)
    • getSourceFileLength

      protected long getSourceFileLength(org.apache.tika.eval.app.EvalFilePaths fps, List<Metadata> metadataList)
    • getFileLength

      protected long getFileLength(Path p)