The following document contains the results of RAT (Release Audit Tool).
***************************************************** Summary ------- Notes: 4 Binaries: 13 Archives: 1 Standards: 118 Apache Licensed: 108 Generated Documents: 0 JavaDocs are generated and so license header is optional Generated files do not require license headers 10 Unknown Licenses ******************************* Archives (+ indicates readable, $ unreadable): + src/test/resources/test-documents/test-documents.zip ***************************************************** Files with AL headers will be marked AL Binary files (which do not require AL headers) will be marked B Compressed archives will be marked A Notices, licenses etc will be marked N !????? .checkstyle !????? .externalToolBuilders/Maven_Ant_Builder.launch !????? CHANGES.txt AL HEADER.txt N KEYS N LICENSE.txt !????? maven-eclipse.xml N NOTICE.txt AL pom.xml N README.txt AL src/main/assembly/standalone.xml AL src/main/java/org/apache/tika/cli/TikaCLI.java AL src/main/java/org/apache/tika/config/TikaConfig.java AL src/main/java/org/apache/tika/exception/TikaException.java AL src/main/java/org/apache/tika/gui/ParsingTransferHandler.java AL src/main/java/org/apache/tika/gui/TikaGUI.java AL src/main/java/org/apache/tika/metadata/CreativeCommons.java AL src/main/java/org/apache/tika/metadata/DublinCore.java AL src/main/java/org/apache/tika/metadata/HttpHeaders.java AL src/main/java/org/apache/tika/metadata/Metadata.java AL src/main/java/org/apache/tika/metadata/MSOffice.java AL src/main/java/org/apache/tika/metadata/package.html AL src/main/java/org/apache/tika/metadata/SpellCheckedMetadata.java AL src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java AL src/main/java/org/apache/tika/metadata/TikaMimeKeys.java AL src/main/java/org/apache/tika/mime/Clause.java AL src/main/java/org/apache/tika/mime/HexCoDec.java AL src/main/java/org/apache/tika/mime/Magic.java AL src/main/java/org/apache/tika/mime/MagicClause.java AL src/main/java/org/apache/tika/mime/MagicMatch.java AL 
src/main/java/org/apache/tika/mime/MediaType.java AL src/main/java/org/apache/tika/mime/MediaTypeRegistry.java AL src/main/java/org/apache/tika/mime/MimeType.java AL src/main/java/org/apache/tika/mime/MimeTypeException.java AL src/main/java/org/apache/tika/mime/MimeTypes.java AL src/main/java/org/apache/tika/mime/MimeTypesFactory.java AL src/main/java/org/apache/tika/mime/MimeTypesReader.java AL src/main/java/org/apache/tika/mime/Operator.java AL src/main/java/org/apache/tika/mime/Patterns.java AL src/main/java/org/apache/tika/parser/AbstractParser.java AL src/main/java/org/apache/tika/parser/AutoDetectParser.java AL src/main/java/org/apache/tika/parser/CompositeParser.java AL src/main/java/org/apache/tika/parser/EmptyParser.java AL src/main/java/org/apache/tika/parser/ErrorParser.java AL src/main/java/org/apache/tika/parser/html/HtmlParser.java !????? src/main/java/org/apache/tika/parser/image/ImageParser.java AL src/main/java/org/apache/tika/parser/microsoft/Cell.java AL src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java AL src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java AL src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java AL src/main/java/org/apache/tika/parser/microsoft/NumberCell.java AL src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java AL src/main/java/org/apache/tika/parser/microsoft/TextCell.java AL src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java AL src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java AL src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java AL src/main/java/org/apache/tika/parser/Parser.java AL src/main/java/org/apache/tika/parser/ParserDecorator.java AL src/main/java/org/apache/tika/parser/ParserPostProcessor.java AL src/main/java/org/apache/tika/parser/ParsingReader.java AL src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java AL src/main/java/org/apache/tika/parser/pdf/PDFParser.java AL 
src/main/java/org/apache/tika/parser/rtf/RTFParser.java AL src/main/java/org/apache/tika/parser/txt/TXTParser.java AL src/main/java/org/apache/tika/parser/xml/DcXMLParser.java AL src/main/java/org/apache/tika/parser/xml/MetadataHandler.java AL src/main/java/org/apache/tika/parser/xml/XMLParser.java AL src/main/java/org/apache/tika/sax/BodyContentHandler.java AL src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java AL src/main/java/org/apache/tika/sax/TeeContentHandler.java AL src/main/java/org/apache/tika/sax/TextContentHandler.java AL src/main/java/org/apache/tika/sax/WriteOutContentHandler.java AL src/main/java/org/apache/tika/sax/XHTMLContentHandler.java AL src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java AL src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java AL src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java AL src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java AL src/main/java/org/apache/tika/sax/xpath/Matcher.java AL src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java AL src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java AL src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java AL src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java AL src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java AL src/main/java/org/apache/tika/sax/xpath/TextMatcher.java AL src/main/java/org/apache/tika/sax/xpath/XPathParser.java AL src/main/java/org/apache/tika/utils/ParseUtils.java AL src/main/java/org/apache/tika/utils/RegexUtils.java AL src/main/java/org/apache/tika/utils/RereadableInputStream.java AL src/main/java/org/apache/tika/utils/StringUtil.java AL src/main/java/org/apache/tika/utils/Utils.java AL src/main/resources/mime/tika-mimetypes.xml AL src/main/resources/tika-config.xml AL src/site/apt/download.apt AL src/site/apt/index.apt B src/site/resources/tika.png B src/site/resources/tika.xcf AL src/site/site.xml AL src/test/java/org/apache/tika/metadata/TestMetadata.java 
AL src/test/java/org/apache/tika/metadata/TestSpellCheckedMetadata.java AL src/test/java/org/apache/tika/mime/MediaTypeTest.java AL src/test/java/org/apache/tika/mime/MimeTypesTest.java AL src/test/java/org/apache/tika/mime/MimeTypeTest.java AL src/test/java/org/apache/tika/mime/PatternsTest.java AL src/test/java/org/apache/tika/mime/TestMimeTypes.java AL src/test/java/org/apache/tika/parser/AutoDetectParserTest.java AL src/test/java/org/apache/tika/parser/html/HtmlParserTest.java AL src/test/java/org/apache/tika/parser/image/ImageParserTest.java AL src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java AL src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java AL src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java AL src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java AL src/test/java/org/apache/tika/parser/ParsingReaderTest.java AL src/test/java/org/apache/tika/parser/txt/TXTParserTest.java AL src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java AL src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java AL src/test/java/org/apache/tika/TestParsers.java AL src/test/java/org/apache/tika/TestRereadableInputStream.java AL src/test/java/org/apache/tika/utils/RegexUtilsTest.java AL src/test/resources/log4j.properties A src/test/resources/test-documents/test-documents.zip B src/test/resources/test-documents/testBMP.bmp B src/test/resources/test-documents/testEXCEL-formats.xls B src/test/resources/test-documents/testEXCEL.xls B src/test/resources/test-documents/testGIF.gif !????? src/test/resources/test-documents/testHTML.html !????? src/test/resources/test-documents/testHTML_utf8.html B src/test/resources/test-documents/testJPEG.jpg B src/test/resources/test-documents/testOpenOffice2.odt B src/test/resources/test-documents/testPDF.pdf B src/test/resources/test-documents/testPNG.png B src/test/resources/test-documents/testPPT.ppt !????? 
src/test/resources/test-documents/testRTF.rtf B src/test/resources/test-documents/testTIFF.tif !????? src/test/resources/test-documents/testTXT.txt B src/test/resources/test-documents/testWORD.doc !????? src/test/resources/test-documents/testXML.xml ***************************************************** Printing headers for files without AL header... ======================================================================= ==.checkstyle ======================================================================= <?xml version="1.0" encoding="UTF-8"?> <fileset-config file-format-version="1.2.0" simple-config="true"> <fileset name="all" enabled="true" check-config-name="Sun Checks" local="false"> <file-match-pattern match-pattern="." include-pattern="true"/> </fileset> </fileset-config> ======================================================================= ==.externalToolBuilders/Maven_Ant_Builder.launch ======================================================================= <launchConfiguration type="org.eclipse.ant.AntBuilderLaunchConfigurationType"> <booleanAttribute key="org.eclipse.debug.ui.ATTR_LAUNCH_IN_BACKGROUND" value="false"/> <stringAttribute key="org.eclipse.ui.externaltools.ATTR_RUN_BUILD_KINDS" value="full,incremental,auto,clean"/> <booleanAttribute key="org.eclipse.ui.externaltools.ATTR_TRIGGERS_CONFIGURED" value="true"/> <booleanAttribute key="org.eclipse.debug.core.appendEnvironmentVariables" value="true"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="tika"/> <booleanAttribute key="org.eclipse.jdt.launching.DEFAULT_CLASSPATH" value="true"/> <stringAttribute key="org.eclipse.ui.externaltools.ATTR_LOCATION" value="${build_project}/maven-eclipse.xml"/> <stringAttribute key="org.eclipse.ui.externaltools.ATTR_WORKING_DIRECTORY" value="${build_project}"/> <stringAttribute key="org.eclipse.debug.core.ATTR_REFRESH_SCOPE" value="${project}"/> <booleanAttribute key="org.eclipse.debug.core.capture_output" value="false"/> <stringAttribute 
key="org.eclipse.ui.externaltools.ATTR_BUILD_SCOPE" value="${working_set:<?xml version='1.0'?><launchConfigurationWorkingSet editPageId='org.eclipse.ui.resourceWorkingSetPage' factoryID='org.eclipse.ui.internal.WorkingSetFactory' label='workingSet' name='workingSet'><item factoryID='org.eclipse.ui.internal.model.ResourceFactory' path='tika' type='4'/></launchConfigurationWorkingSet>}"/> <stringAttribute key="process_factory_id" value="org.eclipse.ant.ui.remoteAntProcessFactory"/> <booleanAttribute key="org.eclipse.ant.ui.DEFAULT_VM_INSTALL" value="false"/> <booleanAttribute key="org.eclipse.debug.ui.ATTR_CONSOLE_OUTPUT_ON" value="false"/> <booleanAttribute key="org.eclipse.ant.ui.ATTR_TARGETS_UPDATED" value="true"/> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.ant.ui.AntClasspathProvider"/> <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES"> <listEntry value="1"/> </listAttribute> <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS"> <listEntry value="/tika/maven-eclipse.xml"/> </listAttribute> </launchConfiguration> ======================================================================= ==CHANGES.txt ======================================================================= Tika Change Log Unreleased changes (0.2-incubating) 1. TIKA-109 - WordParser fails on some Word files (Dave Meikle) 2. TIKA-105 - Excel parser implementation based on POI's Event API (Niall Pemberton) 3. TIKA-116 - Streaming parser for OpenDocument files (Jukka Zitting) 4. TIKA-117 - Drop JDOM and Jaxen dependencies (Jukka Zitting) 5. TIKA-115 - Tika package with all the dependencies (Jukka Zitting) 6. TIKA-97 - Tika GUI (Jukka Zitting) 7. TIKA-96 - Tika CLI (Jukka Zitting) 8. TIKA-112 - Use Commons IO 1.4 (Jukka Zitting) 9. TIKA-126 - Add Parser.parse(InputStream, Metadata) for metadata extraction (Jukka Zitting) 10. TIKA-127 - Add support for Visio files (Jukka Zitting) 11. 
TIKA-129 - node() support for the streaming XPath utility (Jukka Zitting) 12. TIKA-130 - self-or-descendant axis does not match self in streaming XPath (Jukka Zitting) 13. TIKA-131 - Lazy XHTML prefix generation (Jukka Zitting) 14. TIKA-128 - HTML parser should produce XHTML SAX events (Jukka Zitting) 15. TIKA-133 - TeeContentHandler constructor should use varargs (Jukka Zitting) 16. TIKA-132 - Refactor Excel extractor to parse per sheet and add hyperlink support (Niall Pemberton) 17. TIKA-134 - mvn package does not produce packages for bin/src (Karl Heinz Marbaise) 18. TIKA-138 - Ignore HTML style and script content (Jukka Zitting) 19. TIKA-113 - Metadata (such as title) should not be part of content (Jukka Zitting) 20. TIKA-139 - Add a composite parser (Jukka Zitting) ======================================================================= ==maven-eclipse.xml ======================================================================= <project default="copy-resources"> <target name="init"/> <target name="copy-resources" depends="init"> <copy todir="target/classes/META-INF" filtering="false"> <fileset dir="." 
includes="README.txt|NOTICE.txt|LICENSE.txt"/> </copy> <copy todir="target/classes/org/apache/tika" filtering="false"> <fileset dir="src/main/resources"/> </copy> </target> </project> ======================================================================= ==src/main/java/org/apache/tika/parser/image/ImageParser.java ======================================================================= package org.apache.tika.parser.image; import java.io.IOException; import java.io.InputStream; import java.util.Iterator; import javax.imageio.ImageIO; import javax.imageio.ImageReader; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class ImageParser implements Parser { public void parse(InputStream stream, Metadata metadata) throws IOException, TikaException { String type = metadata.get(Metadata.CONTENT_TYPE); if (type != null) { Iterator<ImageReader> iterator = ImageIO.getImageReadersByMIMEType(type); if (iterator.hasNext()) { ImageReader reader = iterator.next(); reader.setInput(ImageIO.createImageInputStream( new CloseShieldInputStream(stream))); metadata.set("height", Integer.toString(reader.getHeight(0))); metadata.set("width", Integer.toString(reader.getWidth(0))); reader.dispose(); } } } public void parse( InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { parse(stream, metadata); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.endDocument(); } } ======================================================================= ==src/test/resources/test-documents/testHTML.html ======================================================================= <html> <head> <title>Title : Test Indexation Html</title> </head> 
<body> <h1>Test Indexation Html</h1> <p><a href="http://www.apache.org/">Indexation</a> du fichier</p> </body> </html> ======================================================================= ==src/test/resources/test-documents/testHTML_utf8.html ======================================================================= <html> <head> <title>Title : Tilte with UTF-8 chars öäå</title> </head> <body> <h1>Content with UTF-8 chars</h1> <p>åäö</p> </body> </html> ======================================================================= ==src/test/resources/test-documents/testRTF.rtf ======================================================================= {\rtf1\ansi\ansicpg1252\uc1\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1036\deflangfe1036{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f37\froman\fcharset238\fprq2 Times New Roman CE;} {\f38\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f40\froman\fcharset161\fprq2 Times New Roman Greek;}{\f41\froman\fcharset162\fprq2 Times New Roman Tur;}{\f42\froman\fcharset177\fprq2 Times New Roman (Hebrew);} {\f43\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f44\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f45\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255; \red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0; \red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \snext0 Normal;}{\*\cs10 \additive \ssemihidden Default Paragraph 
Font;}{\*\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden Normal Table;}}{\*\latentstyles\lsdstimax156\lsdlockeddef0}{\*\rsidtbl \rsid2954171\rsid10375891} {\*\generator Microsoft Word 11.0.6568;}{\info{\title Test d\'92indexation Word}{\author Bibliotheque}{\operator Bibliotheque}{\creatim\yr2006\mo5\dy18\hr12\min19}{\revtim\yr2006\mo5\dy18\hr12\min19}{\version2}{\edmins0}{\nofpages1}{\nofwords3} {\nofchars21}{\*\company Universite Laval}{\nofcharsws23}{\vern24579}}\paperw11906\paperh16838\margl1417\margr1417\margt1417\margb1417 \deftab708\widowctrl\ftnbj\aenddoc\hyphhotz425\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1417\dgvorigin1417\dghshow1\dgvshow1 \jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct\asianbrkrule\nojkernpunct\rsidroot2954171 \fet0 \sectd \linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3 \pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}} {\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta 
)}}\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid2954171 Test d\rquote indexation Word \par \par }} ======================================================================= ==src/test/resources/test-documents/testTXT.txt ======================================================================= Test d'indexation de Txt http://www.apache.org ======================================================================= ==src/test/resources/test-documents/testXML.xml ======================================================================= <?xml version="1.0" encoding="UTF-8"?> <oaidc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/"> <dc:title>Tika test document</dc:title> <dc:creator>Rida Benjelloun</dc:creator> <dc:subject>Java</dc:subject> <dc:subject>XML</dc:subject> <dc:subject>XSLT</dc:subject> <dc:subject>JDOM</dc:subject> <dc:subject>Indexation</dc:subject> <dc:description>Framework d'indexation des documents XML, HTML, PDF etc.. </dc:description> <dc:identifier>http://www.apache.org</dc:identifier> <dc:date>2000-12</dc:date> <dc:type>test</dc:type> <dc:format>application/msword</dc:format> <dc:language>Fr</dc:language> <dc:rights>Archimède et Lius à Châteauneuf testing chars en été</dc:rights> </oaidc:dc>