Coverage Report - org.apache.any23.extractor.html.HeadLinkExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
HeadLinkExtractor
0%
0/18
0%
0/10
3.5
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionContext;
 21  
 import org.apache.any23.extractor.ExtractionException;
 22  
 import org.apache.any23.extractor.ExtractionParameters;
 23  
 import org.apache.any23.extractor.ExtractionResult;
 24  
 import org.apache.any23.extractor.ExtractorDescription;
 25  
 import org.apache.any23.extractor.ExtractorFactory;
 26  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 27  
 import org.apache.any23.rdf.PopularPrefixes;
 28  
 import org.apache.any23.vocab.XHTML;
 29  
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 30  
 import org.openrdf.model.URI;
 31  
 import org.openrdf.model.ValueFactory;
 32  
 import org.openrdf.model.impl.ValueFactoryImpl;
 33  
 import org.w3c.dom.Document;
 34  
 import org.w3c.dom.Node;
 35  
 
 36  
 import java.io.IOException;
 37  
 import java.util.Arrays;
 38  
 import java.util.List;
 39  
 
 40  
 /**
 41  
  * This {@link org.apache.any23.extractor.Extractor.TagSoupDOMExtractor} implementation
 42  
  * retrieves the <code>LINK</code>s declared within the <code>HTML/HEAD</code> page header.
 43  
  */
 44  0
 public class HeadLinkExtractor implements TagSoupDOMExtractor {
 45  
 
 46  
     public void run(
 47  
             ExtractionParameters extractionParameters,
 48  
             ExtractionContext extractionContext,
 49  
             Document in,
 50  
             ExtractionResult out
 51  
     ) throws IOException, ExtractionException {
 52  0
         HTMLDocument html = new HTMLDocument(in);
 53  0
         ValueFactory vf = ValueFactoryImpl.getInstance();
 54  
 
 55  0
         final List<Node> headLinkNodes = DomUtils.findAll(
 56  
                 in,
 57  
                 "/HTML/HEAD/LINK[(" +
 58  
                         "@type='application/rdf+xml' or " +
 59  
                         "@type='text/rdf' or " +
 60  
                         "@type='application/x-turtle' or " +
 61  
                         "@type='application/turtle' or " +
 62  
                         "@type='text/turtle' or " +
 63  
                         "@type='text/rdf+n3'" +
 64  
                         ") and @href and @rel]"
 65  
         );
 66  0
         for (Node node : headLinkNodes) {
 67  0
             final URI href = html.resolveURI(DomUtils.find(node, "@href"));
 68  0
             final String rel = DomUtils.find(node, "@rel");
 69  0
             out.writeTriple(
 70  
                     extractionContext.getDocumentURI(),
 71  
                     vf.createURI(XHTML.NS + rel),
 72  
                     href
 73  
             );
 74  0
             final String title = DomUtils.find(node, "@title");
 75  0
             if (title != null && !"".equals(title)) {
 76  0
                 out.writeTriple(
 77  
                         href,
 78  
                         factory.getPrefixes().expand("dcterms:title"),
 79  
                         vf.createLiteral(title)
 80  
                 );
 81  
             }
 82  0
             final String type = DomUtils.find(node, "@type");
 83  0
             if (type != null && !"".equals(type)) {
 84  0
                 out.writeTriple(
 85  
                         href,
 86  
                         factory.getPrefixes().expand("dcterms:format"),
 87  
                         vf.createLiteral(type)
 88  
                 );
 89  
             }
 90  0
         }
 91  0
     }
 92  
 
 93  
     public ExtractorDescription getDescription() {
 94  0
         return factory;
 95  
     }
 96  
 
 97  0
     public final static ExtractorFactory<HeadLinkExtractor> factory =
 98  
             SimpleExtractorFactory.create(
 99  
                     "html-head-links",
 100  
                     PopularPrefixes.createSubset("xhtml", "dcterms"),
 101  
                     Arrays.asList("text/html;q=0.05", "application/xhtml+xml;q=0.05"),
 102  
                     "example-head-link.html",
 103  
                     HeadLinkExtractor.class);
 104  
 }