1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.cli; |
19 | |
|
20 | |
import org.apache.any23.Any23; |
21 | |
import org.apache.any23.configuration.Configuration; |
22 | |
import org.apache.any23.configuration.DefaultConfiguration; |
23 | |
import org.apache.any23.extractor.ExtractionException; |
24 | |
import org.apache.any23.extractor.ExtractionParameters; |
25 | |
import org.apache.any23.extractor.SingleDocumentExtraction; |
26 | |
import org.apache.any23.filter.IgnoreAccidentalRDFa; |
27 | |
import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments; |
28 | |
import org.apache.any23.source.DocumentSource; |
29 | |
import org.apache.any23.util.LogUtils; |
30 | |
import org.apache.any23.writer.BenchmarkTripleHandler; |
31 | |
import org.apache.any23.writer.LoggingTripleHandler; |
32 | |
import org.apache.any23.writer.ReportingTripleHandler; |
33 | |
import org.apache.any23.writer.TripleHandler; |
34 | |
import org.apache.any23.writer.TripleHandlerException; |
35 | |
import org.apache.any23.writer.WriterRegistry; |
36 | |
import org.apache.commons.cli.CommandLine; |
37 | |
import org.apache.commons.cli.CommandLineParser; |
38 | |
import org.apache.commons.cli.HelpFormatter; |
39 | |
import org.apache.commons.cli.Option; |
40 | |
import org.apache.commons.cli.Options; |
41 | |
import org.apache.commons.cli.PosixParser; |
42 | |
import org.slf4j.Logger; |
43 | |
import org.slf4j.LoggerFactory; |
44 | |
|
45 | |
import java.io.File; |
46 | |
import java.io.FileNotFoundException; |
47 | |
import java.io.IOException; |
48 | |
import java.io.OutputStream; |
49 | |
import java.io.PrintStream; |
50 | |
import java.io.PrintWriter; |
51 | |
import java.net.MalformedURLException; |
52 | |
import java.net.URISyntaxException; |
53 | |
import java.net.URL; |
54 | |
|
55 | |
import static org.apache.any23.extractor.ExtractionParameters.ValidationMode; |
56 | |
|
57 | |
|
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
@ToolRunner.Description("Any23 Command Line Tool.") |
66 | 0 | public class Rover implements Tool { |
67 | |
|
68 | 0 | private static final String[] FORMATS = WriterRegistry.getInstance().getIdentifiers(); |
69 | |
private static final int DEFAULT_FORMAT_INDEX = 0; |
70 | |
|
71 | 0 | private static final Logger logger = LoggerFactory.getLogger(Rover.class); |
72 | |
|
73 | |
private Options options; |
74 | |
|
75 | |
private CommandLine commandLine; |
76 | |
|
77 | 0 | private boolean verbose = false; |
78 | |
|
79 | |
private PrintStream outputStream; |
80 | |
private TripleHandler tripleHandler; |
81 | |
private ReportingTripleHandler reportingTripleHandler; |
82 | |
private BenchmarkTripleHandler benchmarkTripleHandler; |
83 | |
|
84 | |
private ExtractionParameters eps; |
85 | |
private Any23 any23; |
86 | |
|
87 | |
protected boolean isVerbose() { |
88 | 0 | return verbose; |
89 | |
} |
90 | |
|
91 | |
public static void main(String[] args) { |
92 | 0 | System.exit( new Rover().run(args) ); |
93 | 0 | } |
94 | |
|
95 | |
public int run(String[] args) { |
96 | |
try { |
97 | 0 | final String[] uris = configure(args); |
98 | 0 | performExtraction(uris); |
99 | 0 | return 0; |
100 | 0 | } catch (Exception e) { |
101 | 0 | System.err.println( e.getMessage() ); |
102 | 0 | final int exitCode = e instanceof ExitCodeException ? ((ExitCodeException) e).exitCode : 1; |
103 | 0 | if(verbose) e.printStackTrace(System.err); |
104 | 0 | return exitCode; |
105 | |
} |
106 | |
} |
107 | |
|
108 | |
protected CommandLine getCommandLine() { |
109 | 0 | if(commandLine == null) throw new IllegalStateException("Rover must be configured first."); |
110 | 0 | return commandLine; |
111 | |
} |
112 | |
|
113 | |
protected String[] configure(String[] args) throws Exception { |
114 | 0 | final CommandLineParser parser = new PosixParser(); |
115 | 0 | options = createOptions(); |
116 | 0 | commandLine = parser.parse(options, args); |
117 | |
|
118 | 0 | if (commandLine.hasOption("h")) { |
119 | 0 | printHelp(); |
120 | 0 | throw new ExitCodeException(0); |
121 | |
} |
122 | |
|
123 | 0 | if (commandLine.hasOption('v')) { |
124 | 0 | verbose = true; |
125 | 0 | LogUtils.setVerboseLogging(); |
126 | |
} else { |
127 | 0 | LogUtils.setDefaultLogging(); |
128 | |
} |
129 | |
|
130 | 0 | if (commandLine.getArgs().length < 1) { |
131 | 0 | printHelp(); |
132 | 0 | throw new IllegalArgumentException("Expected at least 1 argument."); |
133 | |
} |
134 | |
|
135 | 0 | final String[] inputURIs = argumentsToURIs(commandLine.getArgs()); |
136 | 0 | final String[] extractorNames = getExtractors(commandLine); |
137 | |
|
138 | |
try { |
139 | 0 | outputStream = getOutputStream(commandLine); |
140 | 0 | tripleHandler = getTripleHandler(commandLine, outputStream); |
141 | 0 | tripleHandler = decorateWithLogHandler(commandLine, tripleHandler); |
142 | 0 | tripleHandler = decorateWithStatisticsHandler(commandLine, tripleHandler); |
143 | |
|
144 | 0 | benchmarkTripleHandler = |
145 | |
tripleHandler instanceof BenchmarkTripleHandler ? (BenchmarkTripleHandler) tripleHandler : null; |
146 | |
|
147 | 0 | tripleHandler = decorateWithAccidentalTriplesFilter(commandLine, tripleHandler); |
148 | |
|
149 | 0 | reportingTripleHandler = new ReportingTripleHandler(tripleHandler); |
150 | 0 | eps = getExtractionParameters(commandLine); |
151 | 0 | any23 = createAny23(extractorNames); |
152 | |
|
153 | 0 | return inputURIs; |
154 | 0 | } catch (Exception e) { |
155 | 0 | closeStreams(); |
156 | 0 | throw e; |
157 | |
} |
158 | |
} |
159 | |
|
160 | |
protected Options createOptions() { |
161 | 0 | final Options options = new Options(); |
162 | 0 | options.addOption( |
163 | |
new Option("v", "verbose", false, "Show debug and progress information.") |
164 | |
); |
165 | 0 | options.addOption( |
166 | |
new Option("h", "help", false, "Print this help.") |
167 | |
); |
168 | 0 | options.addOption( |
169 | |
new Option("e", true, "Specify a comma-separated list of extractors, e.g. rdf-xml,rdf-turtle.") |
170 | |
); |
171 | 0 | options.addOption( |
172 | |
new Option("o", "output", true, "Specify Output file (defaults to standard output).") |
173 | |
); |
174 | 0 | options.addOption( |
175 | |
new Option( |
176 | |
"f", |
177 | |
"Output format", |
178 | |
true, |
179 | |
"[" + printFormats(FORMATS, DEFAULT_FORMAT_INDEX) + "]" |
180 | |
) |
181 | |
); |
182 | 0 | options.addOption( |
183 | |
new Option("t", "notrivial", false, "Filter trivial statements (e.g. CSS related ones).") |
184 | |
); |
185 | 0 | options.addOption( |
186 | |
new Option("s", "stats", false, "Print out extraction statistics.") |
187 | |
); |
188 | 0 | options.addOption( |
189 | |
new Option("l", "log", true, "Produce log within a file.") |
190 | |
); |
191 | 0 | options.addOption( |
192 | |
new Option("p", "pedantic", false, "Validate and fixes HTML content detecting commons issues.") |
193 | |
); |
194 | 0 | options.addOption( |
195 | |
new Option("n", "nesting", false, "Disable production of nesting triples.") |
196 | |
); |
197 | 0 | options.addOption( |
198 | |
new Option("d", "defaultns", true, "Override the default namespace used to produce statements.") |
199 | |
); |
200 | 0 | return options; |
201 | |
} |
202 | |
|
203 | |
protected void performExtraction(DocumentSource documentSource) { |
204 | 0 | performExtraction(any23, eps, documentSource, reportingTripleHandler); |
205 | 0 | } |
206 | |
|
207 | |
protected void performExtraction(String[] inputURIs) throws URISyntaxException, IOException { |
208 | |
try { |
209 | 0 | final long start = System.currentTimeMillis(); |
210 | 0 | for (String inputURI : inputURIs) { |
211 | 0 | performExtraction( any23.createDocumentSource(inputURI) ); |
212 | |
} |
213 | 0 | final long elapsed = System.currentTimeMillis() - start; |
214 | |
|
215 | 0 | if (benchmarkTripleHandler != null) { |
216 | 0 | System.err.println(benchmarkTripleHandler.report()); |
217 | |
} |
218 | |
|
219 | 0 | logger.info("Extractors used: " + reportingTripleHandler.getExtractorNames()); |
220 | 0 | logger.info(reportingTripleHandler.getTotalTriples() + " triples, " + elapsed + "ms"); |
221 | |
} finally { |
222 | 0 | closeStreams(); |
223 | 0 | } |
224 | 0 | } |
225 | |
|
226 | |
protected String printReports() { |
227 | 0 | final StringBuilder sb = new StringBuilder(); |
228 | 0 | if(benchmarkTripleHandler != null) sb.append( benchmarkTripleHandler.report() ).append('\n'); |
229 | 0 | if(reportingTripleHandler != null) sb.append( reportingTripleHandler.printReport() ).append('\n'); |
230 | 0 | return sb.toString(); |
231 | |
} |
232 | |
|
233 | |
private void printHelp() { |
234 | 0 | HelpFormatter formatter = new HelpFormatter(); |
235 | 0 | formatter.printHelp("[{<url>|<file>}]+", options, true); |
236 | 0 | } |
237 | |
|
238 | |
private String printFormats(String[] formats, int defaultIndex) { |
239 | 0 | final StringBuilder sb = new StringBuilder(); |
240 | 0 | for (int i = 0; i < formats.length; i++) { |
241 | 0 | sb.append(formats[i]); |
242 | 0 | if(i == defaultIndex) sb.append(" (default)"); |
243 | 0 | if(i < formats.length - 1) sb.append(", "); |
244 | |
} |
245 | 0 | return sb.toString(); |
246 | |
} |
247 | |
|
248 | |
private String argumentToURI(String uri) { |
249 | 0 | uri = uri.trim(); |
250 | 0 | if (uri.toLowerCase().startsWith("http:") || uri.toLowerCase().startsWith("https:")) { |
251 | |
try { |
252 | 0 | return new URL(uri).toString(); |
253 | 0 | } catch (MalformedURLException murle) { |
254 | 0 | throw new IllegalArgumentException(String.format("Invalid URI: '%s'", uri), murle); |
255 | |
} |
256 | |
} |
257 | |
|
258 | 0 | final File f = new File(uri); |
259 | 0 | if (!f.exists()) { |
260 | 0 | throw new IllegalArgumentException(String.format("No such file: [%s]", f.getAbsolutePath())); |
261 | |
} |
262 | 0 | if (f.isDirectory()) { |
263 | 0 | throw new IllegalArgumentException(String.format("Found a directory: [%s]", f.getAbsolutePath())); |
264 | |
} |
265 | 0 | return f.toURI().toString(); |
266 | |
} |
267 | |
|
268 | |
protected String[] argumentsToURIs(String[] args) { |
269 | 0 | final String[] uris = new String[args.length]; |
270 | 0 | for(int i = 0; i < args.length; i++) { |
271 | 0 | uris[i] = argumentToURI(args[i]); |
272 | |
} |
273 | 0 | return uris; |
274 | |
} |
275 | |
|
276 | |
private String[] getExtractors(CommandLine cl) { |
277 | 0 | if (cl.hasOption('e')) { |
278 | 0 | return cl.getOptionValue('e').split(","); |
279 | |
} |
280 | 0 | return null; |
281 | |
} |
282 | |
|
283 | |
private PrintStream openPrintStream(String fileName) { |
284 | 0 | final File file = new File(fileName); |
285 | |
try { |
286 | 0 | return new PrintStream(file); |
287 | 0 | } catch (FileNotFoundException fnfe) { |
288 | 0 | throw new IllegalArgumentException("Cannot open file '" + file.getAbsolutePath() + "'", fnfe); |
289 | |
} |
290 | |
} |
291 | |
|
292 | |
private PrintStream getOutputStream(CommandLine cl) { |
293 | 0 | if (cl.hasOption("o")) { |
294 | 0 | final String fileName = cl.getOptionValue("o"); |
295 | 0 | return openPrintStream(fileName); |
296 | |
} else { |
297 | 0 | return System.out; |
298 | |
} |
299 | |
} |
300 | |
|
301 | |
private TripleHandler getTripleHandler(CommandLine cl, OutputStream os) { |
302 | 0 | final String FORMAT_OPTION = "f"; |
303 | 0 | String format = FORMATS[DEFAULT_FORMAT_INDEX]; |
304 | 0 | if (cl.hasOption(FORMAT_OPTION)) { |
305 | 0 | format = cl.getOptionValue(FORMAT_OPTION).toLowerCase(); |
306 | |
} |
307 | |
try { |
308 | 0 | return WriterRegistry.getInstance().getWriterInstanceByIdentifier(format, os); |
309 | 0 | } catch (Exception e) { |
310 | 0 | throw new IllegalArgumentException( |
311 | |
String.format("Invalid option value '%s' for option %s", format, FORMAT_OPTION) |
312 | |
); |
313 | |
} |
314 | |
} |
315 | |
|
316 | |
private TripleHandler decorateWithAccidentalTriplesFilter(CommandLine cl, TripleHandler in) { |
317 | 0 | if (cl.hasOption('t')) { |
318 | 0 | return new IgnoreAccidentalRDFa( |
319 | |
new IgnoreTitlesOfEmptyDocuments(in), |
320 | |
true |
321 | |
); |
322 | |
} |
323 | 0 | return in; |
324 | |
} |
325 | |
|
326 | |
private TripleHandler decorateWithStatisticsHandler(CommandLine cl, TripleHandler in) { |
327 | 0 | if (cl.hasOption('s')) { |
328 | 0 | return new BenchmarkTripleHandler(in); |
329 | |
} |
330 | 0 | return in; |
331 | |
} |
332 | |
|
333 | |
private TripleHandler decorateWithLogHandler(CommandLine cl, TripleHandler in) { |
334 | 0 | if (cl.hasOption('l')) { |
335 | 0 | File logFile = new File(cl.getOptionValue('l')); |
336 | |
try { |
337 | 0 | return new LoggingTripleHandler(in, new PrintWriter(logFile)); |
338 | 0 | } catch (FileNotFoundException fnfe) { |
339 | 0 | throw new IllegalArgumentException( String.format("Could not write to log file [%s]", logFile), fnfe ); |
340 | |
} |
341 | |
} |
342 | 0 | return in; |
343 | |
} |
344 | |
|
345 | |
private ExtractionParameters getExtractionParameters(CommandLine cl) { |
346 | 0 | final boolean nestingDisabled = ! cl.hasOption('n'); |
347 | 0 | final Configuration configuration = DefaultConfiguration.singleton(); |
348 | 0 | final ExtractionParameters extractionParameters = |
349 | |
cl.hasOption('p') |
350 | |
? |
351 | |
new ExtractionParameters(configuration, ValidationMode.ValidateAndFix, nestingDisabled) |
352 | |
: |
353 | |
new ExtractionParameters(configuration, ValidationMode.None , nestingDisabled); |
354 | 0 | if( cl.hasOption('d') ) { |
355 | 0 | extractionParameters.setProperty( |
356 | |
SingleDocumentExtraction.EXTRACTION_CONTEXT_URI_PROPERTY, |
357 | |
cl.getOptionValue('d') |
358 | |
); |
359 | |
} |
360 | 0 | return extractionParameters; |
361 | |
} |
362 | |
|
363 | |
private Any23 createAny23(String[] extractorNames) { |
364 | 0 | Any23 any23 = (extractorNames == null || extractorNames.length == 0) |
365 | |
? new Any23() |
366 | |
: new Any23(extractorNames); |
367 | 0 | any23.setHTTPUserAgent(Any23.DEFAULT_HTTP_CLIENT_USER_AGENT + "/" + Any23.VERSION); |
368 | 0 | return any23; |
369 | |
} |
370 | |
|
371 | |
private void performExtraction( |
372 | |
Any23 any23, ExtractionParameters eps, DocumentSource documentSource, TripleHandler th |
373 | |
) { |
374 | |
try { |
375 | 0 | if (! any23.extract(eps, documentSource, th).hasMatchingExtractors()) { |
376 | 0 | throw new ExitCodeException("No suitable extractors found.", 2); |
377 | |
} |
378 | 0 | } catch (ExtractionException ex) { |
379 | 0 | throw new ExitCodeException("Exception while extracting metadata.", ex, 3); |
380 | 0 | } catch (IOException ex) { |
381 | 0 | throw new ExitCodeException("Exception while producing output.", ex, 4); |
382 | 0 | } |
383 | 0 | } |
384 | |
|
385 | |
private void closeHandler() { |
386 | 0 | if(tripleHandler == null) return; |
387 | |
try { |
388 | 0 | tripleHandler.close(); |
389 | 0 | } catch (TripleHandlerException the) { |
390 | 0 | throw new ExitCodeException("Error while closing TripleHandler", the, 5); |
391 | 0 | } |
392 | 0 | } |
393 | |
|
394 | |
private void closeStreams() { |
395 | 0 | closeHandler(); |
396 | 0 | if(outputStream != null) outputStream.close(); |
397 | 0 | } |
398 | |
|
399 | 0 | protected class ExitCodeException extends RuntimeException { |
400 | |
|
401 | |
private final int exitCode; |
402 | |
|
403 | 0 | public ExitCodeException(String message, Throwable cause, int exitCode) { |
404 | 0 | super(message, cause); |
405 | 0 | this.exitCode = exitCode; |
406 | 0 | } |
407 | 0 | public ExitCodeException(String message, int exitCode) { |
408 | 0 | super(message); |
409 | 0 | this.exitCode = exitCode; |
410 | 0 | } |
411 | 0 | public ExitCodeException(int exitCode) { |
412 | 0 | super(); |
413 | 0 | this.exitCode = exitCode; |
414 | 0 | } |
415 | |
|
416 | |
protected int getExitCode() { |
417 | 0 | return exitCode; |
418 | |
} |
419 | |
} |
420 | |
|
421 | |
} |