Coverage Report - org.apache.any23.extractor.html.MicroformatExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
MicroformatExtractor
0%
0/53
0%
0/24
1.833
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionContext;
 21  
 import org.apache.any23.extractor.ExtractionException;
 22  
 import org.apache.any23.extractor.ExtractionParameters;
 23  
 import org.apache.any23.extractor.ExtractionResult;
 24  
 import org.apache.any23.extractor.ExtractorDescription;
 25  
 import org.apache.any23.extractor.TagSoupExtractionResult;
 26  
 import org.apache.any23.extractor.html.annotations.Includes;
 27  
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 28  
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 29  
 import org.openrdf.model.BNode;
 30  
 import org.openrdf.model.Literal;
 31  
 import org.openrdf.model.Resource;
 32  
 import org.openrdf.model.URI;
 33  
 import org.openrdf.model.impl.ValueFactoryImpl;
 34  
 import org.w3c.dom.Document;
 35  
 import org.w3c.dom.Node;
 36  
 
 37  
 import java.io.IOException;
 38  
 
 39  
 /**
 40  
  * The abstract base class for any
 41  
  * <a href="http://microformats.org/">Microformat specification</a> extractor.
 42  
  */
 43  0
 public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
 44  
 
 45  
     public static final String BEGIN_SCRIPT = "<script>";
 46  
     public static final String END_SCRIPT   = "</script>";
 47  
 
 48  
     private HTMLDocument htmlDocument;
 49  
 
 50  
     private ExtractionContext context;
 51  
 
 52  
     private URI documentURI;
 53  
 
 54  
     private ExtractionResult out;
 55  
 
 56  0
     protected final Any23ValueFactoryWrapper valueFactory =
 57  
             new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
 58  
 
 59  
     /**
 60  
      * Returns the description of this extractor.
 61  
      *
 62  
      * @return a human readable description.
 63  
      */
 64  
     public abstract ExtractorDescription getDescription();
 65  
 
 66  
     /**
 67  
      * Performs the extraction of the data and writes them to the model.
 68  
      * The nodes generated in the model can have any name or implicit label
 69  
      * but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that
 70  
      * are uniquely derivable from their position in the DOM tree, so that
 71  
      * multiple extractors can merge information.
 72  
      */
 73  
     protected abstract boolean extract() throws ExtractionException;
 74  
 
 75  
     public HTMLDocument getHTMLDocument() {
 76  0
         return htmlDocument;
 77  
     }
 78  
 
 79  
     public ExtractionContext getExtractionContext() {
 80  0
         return context;
 81  
     }
 82  
 
 83  
     public URI getDocumentURI() {
 84  0
         return documentURI;
 85  
     }
 86  
 
 87  
     public final void run(
 88  
             ExtractionParameters extractionParameters,
 89  
             ExtractionContext extractionContext,
 90  
             Document in,
 91  
             ExtractionResult out
 92  
     ) throws IOException, ExtractionException {
 93  0
         this.htmlDocument = new HTMLDocument(in);
 94  0
         this.context      = extractionContext;
 95  0
         this.documentURI  = extractionContext.getDocumentURI();
 96  0
         this.out          = out;
 97  0
         valueFactory.setErrorReporter(out);
 98  
         try {
 99  0
             extract();
 100  
         } finally {
 101  0
             valueFactory.setErrorReporter(null);
 102  0
         }
 103  0
     }
 104  
 
 105  
     /**
 106  
      * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated
 107  
      * with the extraction session.
 108  
      *
 109  
      * @return a valid extraction result.
 110  
      */
 111  
     protected ExtractionResult getCurrentExtractionResult() {
 112  0
         return out;
 113  
     }
 114  
 
 115  
     protected ExtractionResult openSubResult(ExtractionContext context) {
 116  0
         return out.openSubResult(context);
 117  
     }
 118  
 
 119  
     /**
 120  
      * Helper method that adds a literal property to a subject only if the value of the property
 121  
      * is a valid string.
 122  
      *
 123  
      * @param n the <i>HTML</i> node from which the property value has been extracted.
 124  
      * @param subject the property subject.
 125  
      * @param p the property URI.
 126  
      * @param value the property value.
 127  
      * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
 128  
      */
 129  
     protected boolean conditionallyAddStringProperty(
 130  
             Node n,
 131  
             Resource subject, URI p, String value
 132  
     ) {
 133  0
         if (value == null) return false;
 134  0
         value = value.trim();
 135  0
         return
 136  
                 value.length() > 0 
 137  
                         &&
 138  
                 conditionallyAddLiteralProperty(
 139  
                         n,
 140  
                         subject, p, valueFactory.createLiteral(value)
 141  
                 );
 142  
     }
 143  
 
 144  
     /**
 145  
      * Helper method that adds a literal property to a node.
 146  
      *
 147  
      * @param n the <i>HTML</i> node from which the property value has been extracted.
 148  
      * @param subject the property subject.
 149  
      * @param property the property URI.
 150  
      * @param literal the property value.
 151  
      * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
 152  
      */
 153  
     protected boolean conditionallyAddLiteralProperty(
 154  
             Node n,
 155  
             Resource subject,
 156  
             URI property,
 157  
             Literal literal
 158  
     ) {
 159  0
         final String literalStr = literal.stringValue();
 160  0
         if( containsScriptBlock(literalStr) ) {
 161  0
             out.notifyError(
 162  
                     ExtractionResult.ErrorLevel.WARN,
 163  
                     String.format("Detected script in literal: [%s]", literalStr)
 164  
                     ,-1
 165  
                     ,-1
 166  
             );
 167  0
             return false;
 168  
         }
 169  0
         out.writeTriple(subject, property, literal);
 170  0
         TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
 171  0
         tser.addPropertyPath(this.getClass(), subject, property, null, DomUtils.getXPathListForNode(n) );
 172  0
         return true;
 173  
     }
 174  
 
 175  
     /**
 176  
      * Helper method that adds a URI property to a node.
 177  
      * @param subject the property subject.
 178  
      * @param property the property URI.
 179  
      * @param uri the property object.
 180  
      * @return <code>true</code> if the resource has been added, <code>false</code> otherwise.
 181  
      */
 182  
     protected boolean conditionallyAddResourceProperty(Resource subject, URI property, URI uri) {
 183  0
         if (uri == null) return false;
 184  0
         out.writeTriple(subject, property, uri);
 185  0
         return true;
 186  
     }
 187  
 
 188  
     /**
 189  
      * Helper method that adds a BNode property to a node.
 190  
      *
 191  
      * @param n the <i>HTML</i> node used for extracting such property.
 192  
      * @param subject the property subject.
 193  
      * @param property the property URI.
 194  
      * @param bnode the property value.
 195  
      */
 196  
     protected void addBNodeProperty(Node n, Resource subject, URI property, BNode bnode) {
 197  0
         out.writeTriple(subject, property, bnode);
 198  0
         TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
 199  0
         tser.addPropertyPath(this.getClass(), subject, property, bnode, DomUtils.getXPathListForNode(n) );
 200  0
     }
 201  
 
 202  
     /**
 203  
      * Helper method that adds a BNode property to a node.
 204  
      *
 205  
      * @param subject the property subject.
 206  
      * @param property the property URI.
 207  
      * @param bnode the property value.
 208  
      */
 209  
     protected void addBNodeProperty( Resource subject, URI property, BNode bnode) {
 210  0
         out.writeTriple(subject, property, bnode);
 211  0
     }
 212  
 
 213  
     /**
 214  
      * Helper method that adds a URI property to a node.
 215  
      *
 216  
      * @param subject the property subject.
 217  
      * @param property the property URI.
 218  
      * @param object the property object.
 219  
      */
 220  
     protected void addURIProperty(Resource subject, URI property, URI object) {
 221  0
         out.writeTriple(subject, property, object);    
 222  0
     }
 223  
 
 224  
     protected URI fixLink(String link) {
 225  0
         return valueFactory.fixLink(link, null);
 226  
     }
 227  
 
 228  
     protected URI fixLink(String link, String defaultSchema) {
 229  0
         return valueFactory.fixLink(link, defaultSchema);
 230  
     }
 231  
 
 232  
     private boolean containsScriptBlock(String in) {
 233  0
         final String inLowerCase = in.toLowerCase();
 234  0
         final int beginBlock = inLowerCase.indexOf(BEGIN_SCRIPT);
 235  0
         if(beginBlock == -1) {
 236  0
             return false;
 237  
         }
 238  0
         return inLowerCase.indexOf(END_SCRIPT, beginBlock + BEGIN_SCRIPT.length()) != -1;
 239  
     }
 240  
 
 241  
         /**
 242  
      * This method checks if there is a native nesting relationship between two
 243  
      * {@link MicroformatExtractor}.
 244  
      *
 245  
      * @see org.apache.any23.extractor.html.annotations.Includes
 246  
      * @param including the including {@link MicroformatExtractor}
 247  
      * @param included the included {@link MicroformatExtractor}
 248  
      * @return <code>true</code> if there is a declared nesting relationship
 249  
      */
 250  
     public static boolean includes(
 251  
             Class<? extends MicroformatExtractor>including,
 252  
             Class<? extends MicroformatExtractor> included) {
 253  0
         Includes includes = including.getAnnotation(Includes.class);
 254  0
         if (includes != null) {
 255  0
             Class<? extends MicroformatExtractor>[] extractors = includes.extractors();
 256  0
             if (extractors != null && extractors.length > 0) {
 257  0
                 for (Class<? extends MicroformatExtractor> extractor : extractors) {
 258  0
                     if (extractor.equals(included)) {
 259  0
                         return true;
 260  
                     }
 261  
                 }
 262  
             }
 263  
         }
 264  0
         return false;
 265  
     }
 266  
 
 267  
 }