Coverage Report - org.apache.any23.cli.Rover
 
Classes in this File Line Coverage Branch Coverage Complexity
Rover
0%
0/156
0%
0/66
3.276
Rover$ExitCodeException
0%
0/14
N/A
3.276
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.cli;
 19  
 
 20  
 import org.apache.any23.Any23;
 21  
 import org.apache.any23.configuration.Configuration;
 22  
 import org.apache.any23.configuration.DefaultConfiguration;
 23  
 import org.apache.any23.extractor.ExtractionException;
 24  
 import org.apache.any23.extractor.ExtractionParameters;
 25  
 import org.apache.any23.extractor.SingleDocumentExtraction;
 26  
 import org.apache.any23.filter.IgnoreAccidentalRDFa;
 27  
 import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
 28  
 import org.apache.any23.source.DocumentSource;
 29  
 import org.apache.any23.util.LogUtils;
 30  
 import org.apache.any23.writer.BenchmarkTripleHandler;
 31  
 import org.apache.any23.writer.LoggingTripleHandler;
 32  
 import org.apache.any23.writer.ReportingTripleHandler;
 33  
 import org.apache.any23.writer.TripleHandler;
 34  
 import org.apache.any23.writer.TripleHandlerException;
 35  
 import org.apache.any23.writer.WriterRegistry;
 36  
 import org.apache.commons.cli.CommandLine;
 37  
 import org.apache.commons.cli.CommandLineParser;
 38  
 import org.apache.commons.cli.HelpFormatter;
 39  
 import org.apache.commons.cli.Option;
 40  
 import org.apache.commons.cli.Options;
 41  
 import org.apache.commons.cli.PosixParser;
 42  
 import org.slf4j.Logger;
 43  
 import org.slf4j.LoggerFactory;
 44  
 
 45  
 import java.io.File;
 46  
 import java.io.FileNotFoundException;
 47  
 import java.io.IOException;
 48  
 import java.io.OutputStream;
 49  
 import java.io.PrintStream;
 50  
 import java.io.PrintWriter;
 51  
 import java.net.MalformedURLException;
 52  
 import java.net.URISyntaxException;
 53  
 import java.net.URL;
 54  
 
 55  
 import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
 56  
 
 57  
 /**
 58  
  * A default rover implementation. Fetches a URL using a hint
 59  
  * as to which format to expect, then tries to convert it to RDF.
 60  
  *
 61  
  * @author Michele Mostarda (mostarda@fbk.eu)
 62  
  * @author Richard Cyganiak (richard@cyganiak.de)
 63  
  * @author Gabriele Renzi
 64  
  */
 65  
 @ToolRunner.Description("Any23 Command Line Tool.")
 66  0
 public class Rover implements Tool {
 67  
 
 68  0
     // Output format identifiers obtained from the writer registry; listed in the help of option -f.
     private static final String[] FORMATS = WriterRegistry.getInstance().getIdentifiers();
 69  
     // Index within FORMATS of the format used when option -f is not given.
     private static final int DEFAULT_FORMAT_INDEX = 0;
 70  
 
 71  0
     private static final Logger logger = LoggerFactory.getLogger(Rover.class);
 72  
 
 73  
     // Command line options, created by createOptions() during configure().
     private Options options;
 74  
 
 75  
     // Parsed command line; stays null until configure() has been invoked.
     private CommandLine commandLine;
 76  
 
 77  0
     // Turned on by option -v; makes run() print stack traces on failure.
     private boolean verbose = false;
 78  
 
 79  
     // Destination of the produced output: the file given with -o or System.out.
     private PrintStream outputStream;
 80  
     // Handler chain built in configure(); closed by closeHandler().
     private TripleHandler tripleHandler;
 81  
     // Wraps tripleHandler; this is the handler handed to the extraction.
     private ReportingTripleHandler reportingTripleHandler;
 82  
     // Non-null only when the handler chain was decorated for statistics (option -s).
     private BenchmarkTripleHandler benchmarkTripleHandler;
 83  
 
 84  
     // Extraction parameters derived from the command line in configure().
     private ExtractionParameters eps;
 85  
     // Extraction engine created in configure().
     private Any23 any23;
 86  
 
 87  
     protected boolean isVerbose() {
 88  0
         return verbose;
 89  
     }
 90  
 
 91  
     public static void main(String[] args) {
 92  0
         System.exit( new Rover().run(args) );
 93  0
     }
 94  
 
 95  
     public int run(String[] args) {
 96  
         try {
 97  0
             final String[] uris = configure(args);
 98  0
             performExtraction(uris);
 99  0
             return 0;
 100  0
         } catch (Exception e) {
 101  0
             System.err.println( e.getMessage() );
 102  0
             final int exitCode = e instanceof ExitCodeException ? ((ExitCodeException) e).exitCode : 1;
 103  0
             if(verbose) e.printStackTrace(System.err);
 104  0
             return exitCode;
 105  
         }
 106  
     }
 107  
 
 108  
     protected CommandLine getCommandLine() {
 109  0
         if(commandLine == null) throw new IllegalStateException("Rover must be configured first.");
 110  0
         return commandLine;
 111  
     }
 112  
 
 113  
     protected String[] configure(String[] args) throws Exception {
 114  0
         final CommandLineParser parser = new PosixParser();
 115  0
         options = createOptions();
 116  0
         commandLine = parser.parse(options, args);
 117  
 
 118  0
         if (commandLine.hasOption("h")) {
 119  0
             printHelp();
 120  0
             throw new ExitCodeException(0);
 121  
         }
 122  
 
 123  0
         if (commandLine.hasOption('v')) {
 124  0
             verbose = true;
 125  0
             LogUtils.setVerboseLogging();
 126  
         } else {
 127  0
             LogUtils.setDefaultLogging();
 128  
         }
 129  
 
 130  0
         if (commandLine.getArgs().length < 1) {
 131  0
             printHelp();
 132  0
             throw new IllegalArgumentException("Expected at least 1 argument.");
 133  
         }
 134  
 
 135  0
         final String[] inputURIs = argumentsToURIs(commandLine.getArgs());
 136  0
         final String[] extractorNames = getExtractors(commandLine);
 137  
 
 138  
         try {
 139  0
             outputStream  = getOutputStream(commandLine);
 140  0
             tripleHandler = getTripleHandler(commandLine, outputStream);
 141  0
             tripleHandler = decorateWithLogHandler(commandLine, tripleHandler);
 142  0
             tripleHandler = decorateWithStatisticsHandler(commandLine, tripleHandler);
 143  
 
 144  0
             benchmarkTripleHandler =
 145  
                     tripleHandler instanceof BenchmarkTripleHandler ? (BenchmarkTripleHandler) tripleHandler : null;
 146  
 
 147  0
             tripleHandler = decorateWithAccidentalTriplesFilter(commandLine, tripleHandler);
 148  
 
 149  0
             reportingTripleHandler = new ReportingTripleHandler(tripleHandler);
 150  0
             eps = getExtractionParameters(commandLine);
 151  0
             any23 = createAny23(extractorNames);
 152  
 
 153  0
             return inputURIs;
 154  0
         } catch (Exception e) {
 155  0
             closeStreams();
 156  0
             throw e;
 157  
         }
 158  
     }
 159  
 
 160  
     protected Options createOptions() {
 161  0
         final Options options = new Options();
 162  0
         options.addOption(
 163  
                 new Option("v", "verbose", false, "Show debug and progress information.")
 164  
         );
 165  0
         options.addOption(
 166  
                 new Option("h", "help", false, "Print this help.")
 167  
         );
 168  0
         options.addOption(
 169  
                 new Option("e", true, "Specify a comma-separated list of extractors, e.g. rdf-xml,rdf-turtle.")
 170  
         );
 171  0
         options.addOption(
 172  
                 new Option("o", "output", true, "Specify Output file (defaults to standard output).")
 173  
         );
 174  0
         options.addOption(
 175  
                 new Option(
 176  
                         "f",
 177  
                         "Output format",
 178  
                         true,
 179  
                         "[" +  printFormats(FORMATS, DEFAULT_FORMAT_INDEX) + "]"
 180  
                 )
 181  
         );
 182  0
         options.addOption(
 183  
                 new Option("t", "notrivial", false, "Filter trivial statements (e.g. CSS related ones).")
 184  
         );
 185  0
         options.addOption(
 186  
                 new Option("s", "stats", false, "Print out extraction statistics.")
 187  
         );
 188  0
         options.addOption(
 189  
                 new Option("l", "log", true, "Produce log within a file.")
 190  
         );
 191  0
         options.addOption(
 192  
                 new Option("p", "pedantic", false, "Validate and fixes HTML content detecting commons issues.")
 193  
         );
 194  0
         options.addOption(
 195  
                 new Option("n", "nesting", false, "Disable production of nesting triples.")
 196  
         );
 197  0
         options.addOption(
 198  
                 new Option("d", "defaultns", true, "Override the default namespace used to produce statements.")
 199  
         );
 200  0
         return options;
 201  
     }
 202  
 
 203  
     protected void performExtraction(DocumentSource documentSource) {
 204  0
         performExtraction(any23, eps, documentSource, reportingTripleHandler);
 205  0
     }
 206  
 
 207  
     protected void performExtraction(String[] inputURIs) throws URISyntaxException, IOException {
 208  
         try {
 209  0
             final long start = System.currentTimeMillis();
 210  0
             for (String inputURI : inputURIs) {
 211  0
                 performExtraction( any23.createDocumentSource(inputURI) );
 212  
             }
 213  0
             final long elapsed = System.currentTimeMillis() - start;
 214  
 
 215  0
             if (benchmarkTripleHandler != null) {
 216  0
                 System.err.println(benchmarkTripleHandler.report());
 217  
             }
 218  
 
 219  0
             logger.info("Extractors used: " + reportingTripleHandler.getExtractorNames());
 220  0
             logger.info(reportingTripleHandler.getTotalTriples() + " triples, " + elapsed + "ms");
 221  
         } finally {
 222  0
             closeStreams();
 223  0
         }
 224  0
     }
 225  
 
 226  
     protected String printReports() {
 227  0
         final StringBuilder sb = new StringBuilder();
 228  0
         if(benchmarkTripleHandler != null) sb.append( benchmarkTripleHandler.report() ).append('\n');
 229  0
         if(reportingTripleHandler != null) sb.append( reportingTripleHandler.printReport() ).append('\n');
 230  0
         return sb.toString();
 231  
     }
 232  
 
 233  
     private void printHelp() {
 234  0
         HelpFormatter formatter = new HelpFormatter();
 235  0
         formatter.printHelp("[{<url>|<file>}]+", options, true);
 236  0
     }
 237  
 
 238  
     private String printFormats(String[] formats, int defaultIndex) {
 239  0
         final StringBuilder sb = new StringBuilder();
 240  0
         for (int i = 0; i < formats.length; i++) {
 241  0
             sb.append(formats[i]);
 242  0
             if(i == defaultIndex) sb.append(" (default)");
 243  0
             if(i < formats.length - 1) sb.append(", ");
 244  
         }
 245  0
         return sb.toString();
 246  
     }
 247  
 
 248  
     private String argumentToURI(String uri) {
 249  0
         uri = uri.trim();
 250  0
         if (uri.toLowerCase().startsWith("http:") || uri.toLowerCase().startsWith("https:")) {
 251  
             try {
 252  0
                 return new URL(uri).toString();
 253  0
             } catch (MalformedURLException murle) {
 254  0
                 throw new IllegalArgumentException(String.format("Invalid URI: '%s'", uri), murle);
 255  
             }
 256  
         }
 257  
 
 258  0
         final File f = new File(uri);
 259  0
         if (!f.exists()) {
 260  0
             throw new IllegalArgumentException(String.format("No such file: [%s]", f.getAbsolutePath()));
 261  
         }
 262  0
         if (f.isDirectory()) {
 263  0
             throw new IllegalArgumentException(String.format("Found a directory: [%s]", f.getAbsolutePath()));
 264  
         }
 265  0
         return f.toURI().toString();
 266  
     }
 267  
 
 268  
     protected String[] argumentsToURIs(String[] args) {
 269  0
         final String[] uris = new String[args.length];
 270  0
         for(int i = 0; i < args.length; i++) {
 271  0
             uris[i] = argumentToURI(args[i]);
 272  
         }
 273  0
         return uris;
 274  
     }
 275  
 
 276  
     private String[] getExtractors(CommandLine cl) {
 277  0
          if (cl.hasOption('e')) {
 278  0
              return cl.getOptionValue('e').split(",");
 279  
          }
 280  0
          return null;
 281  
      }
 282  
 
 283  
     private PrintStream openPrintStream(String fileName) {
 284  0
         final File file = new File(fileName);
 285  
         try {
 286  0
             return new PrintStream(file);
 287  0
         } catch (FileNotFoundException fnfe) {
 288  0
             throw new IllegalArgumentException("Cannot open file '" + file.getAbsolutePath() + "'", fnfe);
 289  
         }
 290  
     }
 291  
 
 292  
     private PrintStream getOutputStream(CommandLine cl) {
 293  0
         if (cl.hasOption("o")) {
 294  0
             final String fileName = cl.getOptionValue("o");
 295  0
             return openPrintStream(fileName);
 296  
         } else {
 297  0
             return System.out;
 298  
         }
 299  
     }
 300  
 
 301  
     private TripleHandler getTripleHandler(CommandLine cl, OutputStream os) {
 302  0
         final String FORMAT_OPTION = "f";
 303  0
         String format = FORMATS[DEFAULT_FORMAT_INDEX];
 304  0
         if (cl.hasOption(FORMAT_OPTION)) {
 305  0
             format = cl.getOptionValue(FORMAT_OPTION).toLowerCase();
 306  
         }
 307  
         try {
 308  0
             return WriterRegistry.getInstance().getWriterInstanceByIdentifier(format, os);
 309  0
         } catch (Exception e) {
 310  0
             throw new IllegalArgumentException(
 311  
                     String.format("Invalid option value '%s' for option %s", format, FORMAT_OPTION)
 312  
             );
 313  
         }
 314  
     }
 315  
 
 316  
     private TripleHandler decorateWithAccidentalTriplesFilter(CommandLine cl, TripleHandler in) {
 317  0
         if (cl.hasOption('t')) {
 318  0
             return new IgnoreAccidentalRDFa(
 319  
                     new IgnoreTitlesOfEmptyDocuments(in),
 320  
                     true    // suppress stylesheet triples.
 321  
             );
 322  
         }
 323  0
         return in;
 324  
     }
 325  
 
 326  
     private TripleHandler decorateWithStatisticsHandler(CommandLine cl, TripleHandler in) {
 327  0
         if (cl.hasOption('s')) {
 328  0
             return new BenchmarkTripleHandler(in);
 329  
         }
 330  0
         return in;
 331  
     }
 332  
 
 333  
     private TripleHandler decorateWithLogHandler(CommandLine cl, TripleHandler in) {
 334  0
         if (cl.hasOption('l')) {
 335  0
             File logFile = new File(cl.getOptionValue('l'));
 336  
             try {
 337  0
                 return new LoggingTripleHandler(in, new PrintWriter(logFile));
 338  0
             } catch (FileNotFoundException fnfe) {
 339  0
                 throw new IllegalArgumentException( String.format("Could not write to log file [%s]", logFile), fnfe );
 340  
             }
 341  
         }
 342  0
         return in;
 343  
     }
 344  
 
 345  
     private ExtractionParameters getExtractionParameters(CommandLine cl) {
 346  0
         final boolean nestingDisabled = ! cl.hasOption('n');
 347  0
         final Configuration configuration = DefaultConfiguration.singleton();
 348  0
         final ExtractionParameters extractionParameters =
 349  
                 cl.hasOption('p')
 350  
                         ?
 351  
                 new ExtractionParameters(configuration, ValidationMode.ValidateAndFix, nestingDisabled)
 352  
                         :
 353  
                 new ExtractionParameters(configuration, ValidationMode.None          , nestingDisabled);
 354  0
         if( cl.hasOption('d') ) {
 355  0
             extractionParameters.setProperty(
 356  
                     SingleDocumentExtraction.EXTRACTION_CONTEXT_URI_PROPERTY,
 357  
                     cl.getOptionValue('d')
 358  
             );
 359  
         }
 360  0
         return extractionParameters;
 361  
     }
 362  
 
 363  
     private Any23 createAny23(String[] extractorNames) {
 364  0
         Any23 any23 = (extractorNames == null || extractorNames.length == 0)
 365  
                 ? new Any23()
 366  
                 : new Any23(extractorNames);
 367  0
         any23.setHTTPUserAgent(Any23.DEFAULT_HTTP_CLIENT_USER_AGENT + "/" + Any23.VERSION);
 368  0
         return any23;
 369  
     }
 370  
 
 371  
     private void performExtraction(
 372  
             Any23 any23, ExtractionParameters eps, DocumentSource documentSource, TripleHandler th
 373  
     ) {
 374  
         try {
 375  0
             if (! any23.extract(eps, documentSource, th).hasMatchingExtractors()) {
 376  0
                 throw new ExitCodeException("No suitable extractors found.", 2);
 377  
             }
 378  0
         } catch (ExtractionException ex) {
 379  0
             throw new ExitCodeException("Exception while extracting metadata.", ex, 3);
 380  0
         } catch (IOException ex) {
 381  0
             throw new ExitCodeException("Exception while producing output.", ex, 4);
 382  0
         }
 383  0
     }
 384  
 
 385  
     private void closeHandler() {
 386  0
         if(tripleHandler == null) return;
 387  
         try {
 388  0
             tripleHandler.close();
 389  0
         } catch (TripleHandlerException the) {
 390  0
             throw new ExitCodeException("Error while closing TripleHandler", the, 5);
 391  0
         }
 392  0
     }
 393  
 
 394  
     private void closeStreams() {
 395  0
              closeHandler();
 396  0
             if(outputStream != null) outputStream.close();
 397  0
     }
 398  
 
 399  0
     protected class ExitCodeException extends RuntimeException {
 400  
 
 401  
         private final int exitCode;
 402  
 
 403  0
         public ExitCodeException(String message, Throwable cause, int exitCode) {
 404  0
             super(message, cause);
 405  0
             this.exitCode = exitCode;
 406  0
         }
 407  0
         public ExitCodeException(String message, int exitCode) {
 408  0
             super(message);
 409  0
             this.exitCode = exitCode;
 410  0
         }
 411  0
         public ExitCodeException(int exitCode) {
 412  0
             super();
 413  0
             this.exitCode = exitCode;
 414  0
         }
 415  
 
 416  
         protected int getExitCode() {
 417  0
             return exitCode;
 418  
         }
 419  
     }
 420  
 
 421  
 }