1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.mime; |
19 | |
|
20 | |
import org.apache.any23.extractor.csv.CSVReaderBuilder; |
21 | |
import org.apache.any23.mime.purifier.Purifier; |
22 | |
import org.apache.any23.mime.purifier.WhiteSpacesPurifier; |
23 | |
import org.apache.tika.Tika; |
24 | |
import org.apache.tika.config.TikaConfig; |
25 | |
import org.apache.tika.metadata.Metadata; |
26 | |
import org.apache.tika.mime.MimeType; |
27 | |
import org.apache.tika.mime.MimeTypeException; |
28 | |
import org.apache.tika.mime.MimeTypes; |
29 | |
import org.openrdf.rio.RDFParser; |
30 | |
import org.openrdf.rio.turtle.TurtleParser; |
31 | |
|
32 | |
import java.io.BufferedReader; |
33 | |
import java.io.ByteArrayInputStream; |
34 | |
import java.io.IOException; |
35 | |
import java.io.InputStream; |
36 | |
import java.io.InputStreamReader; |
37 | |
import java.util.regex.Pattern; |
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
public class TikaMIMETypeDetector implements MIMETypeDetector { |
47 | |
|
48 | |
private Purifier purifier; |
49 | |
|
50 | |
|
51 | |
|
52 | |
public static final String N3_MIMETYPE = "text/n3"; |
53 | |
|
54 | |
public static final String NQUADS_MIMETYPE = "text/nq"; |
55 | |
|
56 | |
public static final String TURTLE_MIMETYPE = "application/turtle"; |
57 | |
|
58 | |
public static final String CSV_MIMETYPE = "text/csv"; |
59 | |
|
60 | |
public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml"; |
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | 0 | private static final Pattern[] N3_PATTERNS = { |
66 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\." ), |
67 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\." ), |
68 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\." ), |
69 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.") |
70 | |
}; |
71 | |
|
72 | |
|
73 | |
|
74 | |
|
75 | 0 | private static final Pattern[] NQUADS_PATTERNS = { |
76 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\." ), |
77 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\." ), |
78 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\." ), |
79 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.") |
80 | |
}; |
81 | |
|
82 | 0 | private static TikaConfig config = null; |
83 | |
|
84 | |
private static Tika tika; |
85 | |
|
86 | |
private static MimeTypes types; |
87 | |
|
88 | |
|
89 | |
|
90 | |
|
91 | |
|
92 | |
|
93 | |
|
94 | |
|
95 | |
public static boolean checkN3Format(InputStream is) throws IOException { |
96 | 0 | return findPattern(N3_PATTERNS, '.', is); |
97 | |
} |
98 | |
|
99 | |
|
100 | |
|
101 | |
|
102 | |
|
103 | |
|
104 | |
|
105 | |
|
106 | |
public static boolean checkNQuadsFormat(InputStream is) throws IOException { |
107 | 0 | return findPattern(NQUADS_PATTERNS, '.', is); |
108 | |
} |
109 | |
|
110 | |
|
111 | |
|
112 | |
|
113 | |
|
114 | |
|
115 | |
|
116 | |
|
117 | |
public static boolean checkTurtleFormat(InputStream is) throws IOException { |
118 | 0 | String sample = extractDataSample(is, '.'); |
119 | 0 | TurtleParser turtleParser = new TurtleParser(); |
120 | 0 | turtleParser.setDatatypeHandling(RDFParser.DatatypeHandling.VERIFY); |
121 | 0 | turtleParser.setStopAtFirstError(true); |
122 | 0 | turtleParser.setVerifyData(true); |
123 | 0 | ByteArrayInputStream bais = new ByteArrayInputStream( sample.getBytes() ); |
124 | |
try { |
125 | 0 | turtleParser.parse(bais, ""); |
126 | 0 | return true; |
127 | 0 | } catch (Exception e) { |
128 | 0 | return false; |
129 | |
} |
130 | |
} |
131 | |
|
132 | |
|
133 | |
|
134 | |
|
135 | |
|
136 | |
|
137 | |
|
138 | |
|
139 | |
public static boolean checkCSVFormat(InputStream is) throws IOException { |
140 | 0 | return CSVReaderBuilder.isCSV(is); |
141 | |
} |
142 | |
|
143 | |
|
144 | |
|
145 | |
|
146 | |
|
147 | |
|
148 | |
|
149 | |
|
150 | |
|
151 | |
|
152 | |
private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is) |
153 | |
throws IOException { |
154 | 0 | String sample = extractDataSample(is, delimiterChar); |
155 | 0 | for(Pattern pattern : patterns) { |
156 | 0 | if(pattern.matcher(sample).find()) { |
157 | 0 | return true; |
158 | |
} |
159 | |
} |
160 | 0 | return false; |
161 | |
} |
162 | |
|
163 | |
|
164 | |
|
165 | |
|
166 | |
|
167 | |
|
168 | |
|
169 | |
|
170 | |
|
171 | |
|
172 | |
private static String extractDataSample(InputStream is, char breakChar) throws IOException { |
173 | 0 | BufferedReader br = new BufferedReader(new InputStreamReader(is)); |
174 | 0 | StringBuilder sb = new StringBuilder(); |
175 | 0 | final int MAX_SIZE = 1024 * 2; |
176 | |
int c; |
177 | 0 | boolean insideBlock = false; |
178 | 0 | int read = 0; |
179 | 0 | br.mark(MAX_SIZE); |
180 | |
try { |
181 | 0 | while ((c = br.read()) != -1) { |
182 | 0 | read++; |
183 | 0 | if (read > MAX_SIZE) { |
184 | 0 | break; |
185 | |
} |
186 | 0 | if ('<' == c) { |
187 | 0 | insideBlock = true; |
188 | 0 | } else if ('>' == c) { |
189 | 0 | insideBlock = false; |
190 | 0 | } else if ('"' == c) { |
191 | 0 | insideBlock = !insideBlock; |
192 | |
} |
193 | 0 | sb.append((char) c); |
194 | 0 | if (!insideBlock && breakChar == c) { |
195 | 0 | break; |
196 | |
} |
197 | |
} |
198 | |
} finally { |
199 | 0 | is.reset(); |
200 | 0 | br.reset(); |
201 | 0 | } |
202 | 0 | return sb.toString(); |
203 | |
} |
204 | |
|
205 | 0 | public TikaMIMETypeDetector(Purifier purifier) { |
206 | 0 | this.purifier = purifier; |
207 | 0 | InputStream is = getResourceAsStream(); |
208 | 0 | if (config == null) { |
209 | |
try { |
210 | 0 | config = new TikaConfig(is); |
211 | 0 | } catch (Exception e) { |
212 | 0 | throw new RuntimeException("Error while loading Tika configuration.", e); |
213 | 0 | } |
214 | |
} |
215 | |
|
216 | 0 | if (types == null) { |
217 | 0 | types = config.getMimeRepository(); |
218 | |
} |
219 | |
|
220 | 0 | if(tika == null) { |
221 | 0 | tika = new Tika(config); |
222 | |
} |
223 | 0 | } |
224 | |
|
225 | |
public TikaMIMETypeDetector() { |
226 | 0 | this( new WhiteSpacesPurifier() ); |
227 | 0 | } |
228 | |
|
229 | |
|
230 | |
|
231 | |
|
232 | |
|
233 | |
|
234 | |
|
235 | |
|
236 | |
|
237 | |
|
238 | |
|
239 | |
public MIMEType guessMIMEType( |
240 | |
String fileName, |
241 | |
InputStream input, |
242 | |
MIMEType mimeTypeFromMetadata |
243 | |
) { |
244 | 0 | if(input != null) { |
245 | |
try { |
246 | 0 | this.purifier.purify(input); |
247 | 0 | } catch (IOException e) { |
248 | 0 | throw new RuntimeException("Error while purifying the provided input", e); |
249 | 0 | } |
250 | |
} |
251 | |
|
252 | 0 | final Metadata meta = new Metadata(); |
253 | 0 | if (mimeTypeFromMetadata != null) |
254 | 0 | meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType()); |
255 | 0 | if (fileName != null) |
256 | 0 | meta.set(Metadata.RESOURCE_NAME_KEY, fileName); |
257 | |
|
258 | |
String type; |
259 | |
try { |
260 | 0 | final String mt = guessMimeTypeByInputAndMeta(input, meta); |
261 | 0 | if( ! MimeTypes.OCTET_STREAM.equals(mt) ) { |
262 | 0 | type = mt; |
263 | |
} else { |
264 | 0 | if( checkN3Format(input) ) { |
265 | 0 | type = N3_MIMETYPE; |
266 | 0 | } else if( checkNQuadsFormat(input) ) { |
267 | 0 | type = NQUADS_MIMETYPE; |
268 | 0 | } else if( checkTurtleFormat(input) ) { |
269 | 0 | type = TURTLE_MIMETYPE; |
270 | 0 | } else if( checkCSVFormat(input) ) { |
271 | 0 | type = CSV_MIMETYPE; |
272 | |
} |
273 | |
else { |
274 | 0 | type = MimeTypes.OCTET_STREAM; |
275 | |
} |
276 | |
} |
277 | 0 | } catch (IOException ioe) { |
278 | 0 | throw new RuntimeException("Error while retrieving mime type.", ioe); |
279 | 0 | } |
280 | 0 | return MIMEType.parse(type); |
281 | |
} |
282 | |
|
283 | |
|
284 | |
|
285 | |
|
286 | |
|
287 | |
|
288 | |
private InputStream getResourceAsStream() { |
289 | |
InputStream result; |
290 | 0 | result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME); |
291 | 0 | if (result == null) { |
292 | 0 | result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME); |
293 | 0 | if (result == null) { |
294 | 0 | result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME); |
295 | |
} |
296 | |
} |
297 | 0 | return result; |
298 | |
} |
299 | |
|
300 | |
|
301 | |
|
302 | |
|
303 | |
|
304 | |
|
305 | |
|
306 | |
|
307 | |
|
308 | |
|
309 | |
|
310 | |
|
311 | |
|
312 | |
|
313 | |
private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata) |
314 | |
throws IOException { |
315 | 0 | if (stream != null) { |
316 | 0 | final String type = tika.detect(stream); |
317 | 0 | if ( type != null && ! isGenericMIMEType(type) ) { |
318 | 0 | return type; |
319 | |
} |
320 | |
} |
321 | |
|
322 | |
|
323 | 0 | final String contentType = metadata.get(Metadata.CONTENT_TYPE); |
324 | 0 | String candidateMIMEType = null; |
325 | 0 | if (contentType != null) { |
326 | |
try { |
327 | 0 | MimeType type = types.forName(contentType); |
328 | 0 | if (type != null) { |
329 | 0 | if( ! isPlainMIMEType(type.getName()) ) { |
330 | 0 | return type.getName(); |
331 | |
} else { |
332 | 0 | candidateMIMEType = type.getName(); |
333 | |
} |
334 | |
} |
335 | |
} |
336 | 0 | catch (MimeTypeException mte) { |
337 | |
|
338 | 0 | } |
339 | |
} |
340 | |
|
341 | |
|
342 | 0 | final String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY); |
343 | 0 | if (resourceName != null) { |
344 | 0 | MimeType type = types.getMimeType(resourceName); |
345 | 0 | if (type != null) { |
346 | 0 | return type.getName(); |
347 | |
} |
348 | |
} |
349 | |
|
350 | |
|
351 | 0 | if(candidateMIMEType != null) { |
352 | 0 | return candidateMIMEType; |
353 | |
} else { |
354 | 0 | return MimeTypes.OCTET_STREAM; |
355 | |
} |
356 | |
} |
357 | |
|
358 | |
private boolean isPlainMIMEType(String type) { |
359 | 0 | return |
360 | |
type.equals(MimeTypes.OCTET_STREAM) |
361 | |
|| |
362 | |
type.equals(MimeTypes.PLAIN_TEXT); |
363 | |
} |
364 | |
|
365 | |
private boolean isGenericMIMEType(String type) { |
366 | 0 | return |
367 | |
isPlainMIMEType(type) |
368 | |
|| |
369 | |
type.equals(MimeTypes.XML); |
370 | |
} |
371 | |
|
372 | |
} |
373 | |
|