Coverage Report - org.apache.any23.cli.MicrodataParser
 
Classes in this File Line Coverage Branch Coverage Complexity
MicrodataParser
0%
0/24
0%
0/10
4.333
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.cli;
 19  
 
 20  
 import org.apache.any23.extractor.html.TagSoupParser;
 21  
 import org.apache.any23.http.DefaultHTTPClient;
 22  
 import org.apache.any23.source.DocumentSource;
 23  
 import org.apache.any23.source.FileDocumentSource;
 24  
 import org.apache.any23.source.HTTPDocumentSource;
 25  
 import org.apache.any23.util.StreamUtils;
 26  
 
 27  
 import java.io.File;
 28  
 import java.io.IOException;
 29  
 import java.io.InputStream;
 30  
 import java.net.URISyntaxException;
 31  
 import java.util.regex.Matcher;
 32  
 import java.util.regex.Pattern;
 33  
 
 34  
 /**
 35  
  * Command line <b>Microdata</b> parser, accepting both files and URLs and
 36  
  * returning a <i>JSON</i> representation of the extracted metadata as described at
 37  
  * <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>.
 38  
  *
 39  
  * @author Michele Mostarda (mostarda@fbk.eu)
 40  
  */
 41  
 @ToolRunner.Description("Commandline Tool for extracting Microdata from file/HTTP source.")
 42  0
 public class MicrodataParser implements Tool {
 43  
 
 44  
     private static final String HTTP_DOCUMENT_SOURCE = "^https?://.*";
 45  
     private static final String FILE_DOCUMENT_SOURCE = "^file:(.*)$";
 46  
 
 47  
     public static void main(String[] args) throws URISyntaxException, IOException {
 48  0
         System.exit( new MicrodataParser().run(args) );
 49  0
     }
 50  
 
 51  
     public int run(String[] args) {
 52  0
         if(args.length != 1) {
 53  0
             System.err.println("USAGE: {http://path/to/resource.html|file:/path/to/local.file}");
 54  0
             return 1;
 55  
         }
 56  0
         InputStream documentInputInputStream = null;
 57  
         try {
 58  0
             final DocumentSource documentSource = getDocumentSource(args[0]);
 59  0
             documentInputInputStream = documentSource.openInputStream();
 60  0
             final TagSoupParser tagSoupParser = new TagSoupParser(
 61  
                     documentInputInputStream,
 62  
                     documentSource.getDocumentURI()
 63  
             );
 64  0
             org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), System.out);
 65  0
         } catch (Exception e) {
 66  0
             System.err.println("***ERROR: " + e.getMessage());
 67  0
             e.printStackTrace();
 68  0
             return 1;
 69  
         } finally {
 70  0
             if(documentInputInputStream != null) StreamUtils.closeGracefully(documentInputInputStream);
 71  
         }
 72  0
         return 0;
 73  
     }
 74  
 
 75  
     private DocumentSource getDocumentSource(String source) throws URISyntaxException {
 76  0
         final Matcher httpMatcher = Pattern.compile(HTTP_DOCUMENT_SOURCE).matcher(source);
 77  0
         if(httpMatcher.find()) {
 78  0
             return new HTTPDocumentSource(new DefaultHTTPClient(), source);
 79  
         }
 80  0
         final Matcher fileMatcher = Pattern.compile(FILE_DOCUMENT_SOURCE).matcher(source);
 81  0
         if(fileMatcher.find()) {
 82  0
             return new FileDocumentSource( new File( fileMatcher.group(1) ) );
 83  
         }
 84  0
         throw new IllegalArgumentException("Invalid source protocol: '" + source + "'");
 85  
     }
 86  
 
 87  
 }