Class AbstractProfiler

    • Field Detail

      • REF_EXTRACT_EXCEPTION_TYPES

        public static TableInfo REF_EXTRACT_EXCEPTION_TYPES
      • REF_PARSE_ERROR_TYPES

        public static TableInfo REF_PARSE_ERROR_TYPES
      • REF_PARSE_EXCEPTION_TYPES

        public static TableInfo REF_PARSE_EXCEPTION_TYPES
      • TRUE

        public static final String TRUE
      • FALSE

        public static final String FALSE
      • MIME_TABLE

        public static TableInfo MIME_TABLE
    • Method Detail

      • loadCommonTokens

        public static void loadCommonTokens​(Path p,
                                            String defaultLangCode)
                                     throws IOException
        Parameters:
        p - path to the common_tokens directory. If this is null, tries to load from the classpath
        defaultLangCode - the language code to use if a common_words list doesn't exist for the detected language; can be null
        Throws:
        IOException
      • setMaxContentLength

        public void setMaxContentLength​(int maxContentLength)
        Truncate the content string to this length if it is longer than this length
        Parameters:
        maxContentLength -
      • setMaxContentLengthForLangId

        public void setMaxContentLengthForLangId​(int maxContentLengthForLangId)
        Truncate the content string used for language identification (lang id) to this length if it is longer than this length
        Parameters:
        maxContentLengthForLangId -
      • setMaxTokens

        public void setMaxTokens​(int maxTokens)
        Add a LimitTokenCountFilterFactory if maxTokens > -1
        Parameters:
        maxTokens -
      • writeExceptionData

        protected void writeExceptionData​(String fileId,
                                          Metadata m,
                                          TableInfo exceptionTable)
      • writeContentData

        protected void writeContentData​(String fileId,
                                        Map<Class,​Object> textStats,
                                        TableInfo contentsTable)
                                 throws IOException
        Checks whether metadata is null or content is empty (null or only whitespace). If either is the case, this does no processing, and the fileId is not entered into the content table.
        Parameters:
        fileId -
        textStats -
        contentsTable -
        Throws:
        IOException
      • getContent

        protected static ContentTags getContent​(org.apache.tika.eval.EvalFilePaths evalFilePaths,
                                                Metadata metadata)
      • getPathsFromExtractCrawl

        protected org.apache.tika.eval.EvalFilePaths getPathsFromExtractCrawl​(Metadata metadata,
                                                                              Path extracts)
        Parameters:
        metadata -
        extracts -
        Returns:
        EvalFilePaths for files if crawling an extract directory
      • getPathsFromSrcCrawl

        protected org.apache.tika.eval.EvalFilePaths getPathsFromSrcCrawl​(Metadata metadata,
                                                                          Path srcDir,
                                                                          Path extracts)
      • getSourceFileLength

        protected long getSourceFileLength​(org.apache.tika.eval.EvalFilePaths fps,
                                           List<Metadata> metadataList)
      • getFileLength

        protected long getFileLength​(Path p)