Coverage Report - org.apache.any23.extractor.html.HTMLMetaExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
HTMLMetaExtractor
0%
0/60
0%
0/30
2.588
HTMLMetaExtractor$Meta
0%
0/23
0%
0/14
2.588
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionContext;
 21  
 import org.apache.any23.extractor.ExtractionException;
 22  
 import org.apache.any23.extractor.ExtractionParameters;
 23  
 import org.apache.any23.extractor.ExtractionResult;
 24  
 import org.apache.any23.extractor.Extractor;
 25  
 import org.apache.any23.extractor.ExtractorDescription;
 26  
 import org.apache.any23.extractor.ExtractorFactory;
 27  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 28  
 import org.apache.any23.rdf.PopularPrefixes;
 29  
 import org.apache.any23.rdf.RDFUtils;
 30  
 import org.apache.any23.vocab.SINDICE;
 31  
 import org.openrdf.model.URI;
 32  
 import org.openrdf.model.impl.LiteralImpl;
 33  
 import org.openrdf.model.impl.URIImpl;
 34  
 import org.w3c.dom.Document;
 35  
 import org.w3c.dom.NamedNodeMap;
 36  
 import org.w3c.dom.Node;
 37  
 
 38  
 import java.io.IOException;
 39  
 import java.util.Arrays;
 40  
 import java.util.HashMap;
 41  
 import java.util.HashSet;
 42  
 import java.util.List;
 43  
 import java.util.Map;
 44  
 import java.util.Set;
 45  
 
 46  
 /**
 47  
  * This extractor represents the <i>HTML META</i> tag values
 48  
  * according the <a href="http://www.w3.org/TR/html401/struct/global.html#h-7.4.4">HTML4 specification</a>.
 49  
  *
 50  
  * @author Davide Palmisano ( dpalmisano@gmail.com )
 51  
  */
 52  0
 public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
 53  
 
 54  
     public static final String NAME = "html-head-meta";
 55  
 
 56  0
     private static final SINDICE vSINDICE = SINDICE.getInstance();
 57  
 
 58  
     private URI profile;
 59  
 
 60  0
     private Map<String, URI> prefixes = new HashMap<String, URI>();
 61  
 
 62  
     private String documentLang;
 63  
 
 64  
     /**
 65  
      * {@inheritDoc}
 66  
      */
 67  0
     public final static ExtractorFactory<HTMLMetaExtractor> factory =
 68  
             SimpleExtractorFactory.create(
 69  
                     NAME,
 70  
                     PopularPrefixes.createSubset("sindice"),
 71  
                     Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
 72  
                     "example-meta.html",
 73  
                     HTMLMetaExtractor.class
 74  
             );
 75  
 
 76  
     /**
 77  
      * {@inheritDoc}
 78  
      */
 79  
     public void run(
 80  
             ExtractionParameters extractionParameters,
 81  
             ExtractionContext extractionContext,
 82  
             Document in,
 83  
             ExtractionResult out
 84  
     ) throws IOException, ExtractionException {
 85  0
         profile = extractProfile(in);
 86  0
         documentLang = getDocumentLanguage(in);
 87  0
         extractLinkDefinedPrefixes(in);
 88  
 
 89  0
         String baseProfile = vSINDICE.NS;
 90  0
         if(profile != null) {
 91  0
             baseProfile = profile.toString();
 92  
         }
 93  
 
 94  0
         final URI documentURI = extractionContext.getDocumentURI();
 95  0
         Set<Meta> metas = extractMetaElement(in, baseProfile);
 96  0
         for(Meta meta : metas) {
 97  0
             String lang = documentLang;
 98  0
             if(meta.getLang() != null) {
 99  0
                 lang = meta.getLang();
 100  
             }
 101  0
             out.writeTriple(
 102  
                     documentURI,
 103  
                     meta.getName(),
 104  
                     new LiteralImpl(meta.getContent(), lang)
 105  
             );
 106  0
         }
 107  0
     }
 108  
 
 109  
     /**
 110  
      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
 111  
      *
 112  
      * @param in a instance of {@link Document}.
 113  
      * @return the language declared, could be <code>null</code>.
 114  
      */
 115  
     private String getDocumentLanguage(Document in) {
 116  0
         String lang = DomUtils.find(in, "string(/HTML/@lang)");
 117  0
         if (lang.equals("")) {
 118  0
             return null;
 119  
         }
 120  0
         return lang;
 121  
     }
 122  
 
 123  
     private URI extractProfile(Document in) {
 124  0
         String profile = DomUtils.find(in, "string(/HTML/@profile)");
 125  0
         if (profile.equals("")) {
 126  0
             return null;
 127  
         }
 128  0
         return new URIImpl(profile);
 129  
     }
 130  
 
 131  
     /**
 132  
      * It extracts prefixes defined in the <i>LINK</i> meta tags.
 133  
      *
 134  
      * @param in
 135  
      */
 136  
     private void extractLinkDefinedPrefixes(Document in) {
 137  0
         List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
 138  0
         for(Node linkNode : linkNodes) {
 139  0
             NamedNodeMap attributes = linkNode.getAttributes();
 140  0
             String rel = attributes.getNamedItem("rel").getTextContent();
 141  0
             String href = attributes.getNamedItem("href").getTextContent();
 142  0
             if(rel != null && href !=null && RDFUtils.isAbsoluteURI(href)) {
 143  0
                 prefixes.put(rel, new URIImpl(href));
 144  
             }
 145  0
         }
 146  0
     }
 147  
 
 148  
     private Set<Meta> extractMetaElement(Document in, String baseProfile) {
 149  0
         List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
 150  0
         Set<Meta> result = new HashSet<Meta>();
 151  0
         for (Node metaNode : metaNodes) {
 152  0
             NamedNodeMap attributes = metaNode.getAttributes();
 153  0
             Node nameAttribute = attributes.getNamedItem("name");
 154  0
             Node contentAttribute = attributes.getNamedItem("content");
 155  0
             if (nameAttribute == null || contentAttribute == null) {
 156  0
                 continue;
 157  
             }
 158  0
             String name = nameAttribute.getTextContent();
 159  0
             String content = contentAttribute.getTextContent();
 160  0
             String xpath = DomUtils.getXPathForNode(metaNode);
 161  0
             URI nameAsURI = getPrefixIfExists(name);
 162  0
             if (nameAsURI == null) {
 163  0
                 nameAsURI = new URIImpl(baseProfile + name);
 164  
             }
 165  0
             Meta meta = new Meta(xpath, nameAsURI, content);
 166  0
             result.add(meta);
 167  0
         }
 168  0
         return result;
 169  
     }
 170  
 
 171  
     private URI getPrefixIfExists(String name) {
 172  0
         String[] split = name.split("\\.");
 173  0
         if(split.length == 2 && prefixes.containsKey(split[0])) {
 174  0
             return new URIImpl(prefixes.get(split[0]) + split[1]);
 175  
         }
 176  0
         return null;
 177  
     }
 178  
 
 179  
     public ExtractorDescription getDescription() {
 180  0
         return factory;
 181  
     }
 182  
 
 183  0
     private class Meta {
 184  
 
 185  
         private String xpath;
 186  
 
 187  
         private URI name;
 188  
 
 189  
         private String lang;
 190  
 
 191  
         private String content;
 192  
 
 193  0
         public Meta(String xpath, URI name, String content) {
 194  0
             this.xpath = xpath;
 195  0
             this.name = name;
 196  0
             this.content = content;
 197  0
         }
 198  
 
 199  
         public Meta(String xpath, URI name, String content, String lang) {
 200  0
             this(xpath, name, content);
 201  0
             this.lang = lang;
 202  0
         }
 203  
 
 204  
         public URI getName() {
 205  0
             return name;
 206  
         }
 207  
 
 208  
         public void setName(URI name) {
 209  0
             this.name = name;
 210  0
         }
 211  
 
 212  
         public String getLang() {
 213  0
             return lang;
 214  
         }
 215  
 
 216  
         public void setLang(String lang) {
 217  0
             this.lang = lang;
 218  0
         }
 219  
 
 220  
         public String getContent() {
 221  0
             return content;
 222  
         }
 223  
 
 224  
         public void setContent(String content) {
 225  0
             this.content = content;
 226  0
         }
 227  
 
 228  
         @Override
 229  
         public boolean equals(Object o) {
 230  0
             if (this == o) return true;
 231  0
             if (o == null || getClass() != o.getClass()) return false;
 232  
 
 233  0
             Meta meta = (Meta) o;
 234  
 
 235  0
             if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) return false;
 236  
 
 237  0
             return true;
 238  
         }
 239  
 
 240  
         @Override
 241  
         public int hashCode() {
 242  0
             return xpath != null ? xpath.hashCode() : 0;
 243  
         }
 244  
     }
 245  
 
 246  
 }