Coverage Report

Coverage Report - org.apache.any23.extractor.html.HTMLMetaExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

HTMLMetaExtractor

0/60

0/30

2.588

HTMLMetaExtractor$Meta

0/23

0/14

2.588

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.html;
 
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.SimpleExtractorFactory;
 import org.apache.any23.rdf.PopularPrefixes;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
 import org.openrdf.model.URI;
 import org.openrdf.model.impl.LiteralImpl;
 import org.openrdf.model.impl.URIImpl;
 import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
 /**
  * This extractor represents the <i>HTML META</i> tag values
  * according the <a href="http://www.w3.org/TR/html401/struct/global.html#h-7.4.4">HTML4 specification</a>.
  *
  * @author Davide Palmisano ( dpalmisano@gmail.com )
  */
 public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
 
     public static final String NAME = "html-head-meta";
 
     private static final SINDICE vSINDICE = SINDICE.getInstance();
 
     private URI profile;
 
     private Map<String, URI> prefixes = new HashMap<String, URI>();
 
     private String documentLang;
 
     /**
      * {@inheritDoc}
      */
     public final static ExtractorFactory<HTMLMetaExtractor> factory =
             SimpleExtractorFactory.create(
                     NAME,
                     PopularPrefixes.createSubset("sindice"),
                     Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
                     "example-meta.html",
                     HTMLMetaExtractor.class
             );
 
     /**
      * {@inheritDoc}
      */
     public void run(
             ExtractionParameters extractionParameters,
             ExtractionContext extractionContext,
             Document in,
             ExtractionResult out
     ) throws IOException, ExtractionException {
         profile = extractProfile(in);
         documentLang = getDocumentLanguage(in);
         extractLinkDefinedPrefixes(in);
 
         String baseProfile = vSINDICE.NS;
         if(profile != null) {
             baseProfile = profile.toString();
         }
 
         final URI documentURI = extractionContext.getDocumentURI();
         Set<Meta> metas = extractMetaElement(in, baseProfile);
         for(Meta meta : metas) {
             String lang = documentLang;
             if(meta.getLang() != null) {
                 lang = meta.getLang();
             }
             out.writeTriple(
                     documentURI,
                     meta.getName(),
                     new LiteralImpl(meta.getContent(), lang)
             );
         }
     }
 
     /**
      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
      *
      * @param in a instance of {@link Document}.
      * @return the language declared, could be <code>null</code>.
      */
     private String getDocumentLanguage(Document in) {
         String lang = DomUtils.find(in, "string(/HTML/@lang)");
         if (lang.equals("")) {
             return null;
         }
         return lang;
     }
 
     private URI extractProfile(Document in) {
         String profile = DomUtils.find(in, "string(/HTML/@profile)");
         if (profile.equals("")) {
             return null;
         }
         return new URIImpl(profile);
     }
 
     /**
      * It extracts prefixes defined in the <i>LINK</i> meta tags.
      *
      * @param in
      */
     private void extractLinkDefinedPrefixes(Document in) {
         List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
         for(Node linkNode : linkNodes) {
             NamedNodeMap attributes = linkNode.getAttributes();
             String rel = attributes.getNamedItem("rel").getTextContent();
             String href = attributes.getNamedItem("href").getTextContent();
             if(rel != null && href !=null && RDFUtils.isAbsoluteURI(href)) {
                 prefixes.put(rel, new URIImpl(href));
             }
         }
     }
 
     private Set<Meta> extractMetaElement(Document in, String baseProfile) {
         List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
         Set<Meta> result = new HashSet<Meta>();
         for (Node metaNode : metaNodes) {
             NamedNodeMap attributes = metaNode.getAttributes();
             Node nameAttribute = attributes.getNamedItem("name");
             Node contentAttribute = attributes.getNamedItem("content");
             if (nameAttribute == null || contentAttribute == null) {
                 continue;
             }
             String name = nameAttribute.getTextContent();
             String content = contentAttribute.getTextContent();
             String xpath = DomUtils.getXPathForNode(metaNode);
             URI nameAsURI = getPrefixIfExists(name);
             if (nameAsURI == null) {
                 nameAsURI = new URIImpl(baseProfile + name);
             }
             Meta meta = new Meta(xpath, nameAsURI, content);
             result.add(meta);
         }
         return result;
     }
 
     private URI getPrefixIfExists(String name) {
         String[] split = name.split("\\.");
         if(split.length == 2 && prefixes.containsKey(split[0])) {
             return new URIImpl(prefixes.get(split[0]) + split[1]);
         }
         return null;
     }
 
     public ExtractorDescription getDescription() {
         return factory;
     }
 
     private class Meta {
 
         private String xpath;
 
         private URI name;
 
         private String lang;
 
         private String content;
 
         public Meta(String xpath, URI name, String content) {
             this.xpath = xpath;
             this.name = name;
             this.content = content;
         }
 
         public Meta(String xpath, URI name, String content, String lang) {
             this(xpath, name, content);
             this.lang = lang;
         }
 
         public URI getName() {
             return name;
         }
 
         public void setName(URI name) {
             this.name = name;
         }
 
         public String getLang() {
             return lang;
         }
 
         public void setLang(String lang) {
             this.lang = lang;
         }
 
         public String getContent() {
             return content;
         }
 
         public void setContent(String content) {
             this.content = content;
         }
 
         @Override
         public boolean equals(Object o) {
             if (this == o) return true;
             if (o == null || getClass() != o.getClass()) return false;
 
             Meta meta = (Meta) o;
 
             if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) return false;
 
             return true;
         }
 
         @Override
         public int hashCode() {
             return xpath != null ? xpath.hashCode() : 0;
         }
     }
 
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.html;
19
20		import org.apache.any23.extractor.ExtractionContext;
21		import org.apache.any23.extractor.ExtractionException;
22		import org.apache.any23.extractor.ExtractionParameters;
23		import org.apache.any23.extractor.ExtractionResult;
24		import org.apache.any23.extractor.Extractor;
25		import org.apache.any23.extractor.ExtractorDescription;
26		import org.apache.any23.extractor.ExtractorFactory;
27		import org.apache.any23.extractor.SimpleExtractorFactory;
28		import org.apache.any23.rdf.PopularPrefixes;
29		import org.apache.any23.rdf.RDFUtils;
30		import org.apache.any23.vocab.SINDICE;
31		import org.openrdf.model.URI;
32		import org.openrdf.model.impl.LiteralImpl;
33		import org.openrdf.model.impl.URIImpl;
34		import org.w3c.dom.Document;
35		import org.w3c.dom.NamedNodeMap;
36		import org.w3c.dom.Node;
37
38		import java.io.IOException;
39		import java.util.Arrays;
40		import java.util.HashMap;
41		import java.util.HashSet;
42		import java.util.List;
43		import java.util.Map;
44		import java.util.Set;
45
46		/**
47		* This extractor represents the <i>HTML META</i> tag values
48		* according the <a href="http://www.w3.org/TR/html401/struct/global.html#h-7.4.4">HTML4 specification</a>.
49		*
50		* @author Davide Palmisano ( dpalmisano@gmail.com )
51		*/
52	0	public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
53
54		public static final String NAME = "html-head-meta";
55
56	0	private static final SINDICE vSINDICE = SINDICE.getInstance();
57
58		private URI profile;
59
60	0	private Map<String, URI> prefixes = new HashMap<String, URI>();
61
62		private String documentLang;
63
64		/**
65		* {@inheritDoc}
66		*/
67	0	public final static ExtractorFactory<HTMLMetaExtractor> factory =
68		SimpleExtractorFactory.create(
69		NAME,
70		PopularPrefixes.createSubset("sindice"),
71		Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
72		"example-meta.html",
73		HTMLMetaExtractor.class
74		);
75
76		/**
77		* {@inheritDoc}
78		*/
79		public void run(
80		ExtractionParameters extractionParameters,
81		ExtractionContext extractionContext,
82		Document in,
83		ExtractionResult out
84		) throws IOException, ExtractionException {
85	0	profile = extractProfile(in);
86	0	documentLang = getDocumentLanguage(in);
87	0	extractLinkDefinedPrefixes(in);
88
89	0	String baseProfile = vSINDICE.NS;
90	0	if(profile != null) {
91	0	baseProfile = profile.toString();
92		}
93
94	0	final URI documentURI = extractionContext.getDocumentURI();
95	0	Set<Meta> metas = extractMetaElement(in, baseProfile);
96	0	for(Meta meta : metas) {
97	0	String lang = documentLang;
98	0	if(meta.getLang() != null) {
99	0	lang = meta.getLang();
100		}
101	0	out.writeTriple(
102		documentURI,
103		meta.getName(),
104		new LiteralImpl(meta.getContent(), lang)
105		);
106	0	}
107	0	}
108
109		/**
110		* Returns the {@link Document} language if declared, <code>null</code> otherwise.
111		*
112		* @param in a instance of {@link Document}.
113		* @return the language declared, could be <code>null</code>.
114		*/
115		private String getDocumentLanguage(Document in) {
116	0	String lang = DomUtils.find(in, "string(/HTML/@lang)");
117	0	if (lang.equals("")) {
118	0	return null;
119		}
120	0	return lang;
121		}
122
123		private URI extractProfile(Document in) {
124	0	String profile = DomUtils.find(in, "string(/HTML/@profile)");
125	0	if (profile.equals("")) {
126	0	return null;
127		}
128	0	return new URIImpl(profile);
129		}
130
131		/**
132		* It extracts prefixes defined in the <i>LINK</i> meta tags.
133		*
134		* @param in
135		*/
136		private void extractLinkDefinedPrefixes(Document in) {
137	0	List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
138	0	for(Node linkNode : linkNodes) {
139	0	NamedNodeMap attributes = linkNode.getAttributes();
140	0	String rel = attributes.getNamedItem("rel").getTextContent();
141	0	String href = attributes.getNamedItem("href").getTextContent();
142	0	if(rel != null && href !=null && RDFUtils.isAbsoluteURI(href)) {
143	0	prefixes.put(rel, new URIImpl(href));
144		}
145	0	}
146	0	}
147
148		private Set<Meta> extractMetaElement(Document in, String baseProfile) {
149	0	List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
150	0	Set<Meta> result = new HashSet<Meta>();
151	0	for (Node metaNode : metaNodes) {
152	0	NamedNodeMap attributes = metaNode.getAttributes();
153	0	Node nameAttribute = attributes.getNamedItem("name");
154	0	Node contentAttribute = attributes.getNamedItem("content");
155	0	if (nameAttribute == null || contentAttribute == null) {
156	0	continue;
157		}
158	0	String name = nameAttribute.getTextContent();
159	0	String content = contentAttribute.getTextContent();
160	0	String xpath = DomUtils.getXPathForNode(metaNode);
161	0	URI nameAsURI = getPrefixIfExists(name);
162	0	if (nameAsURI == null) {
163	0	nameAsURI = new URIImpl(baseProfile + name);
164		}
165	0	Meta meta = new Meta(xpath, nameAsURI, content);
166	0	result.add(meta);
167	0	}
168	0	return result;
169		}
170
171		private URI getPrefixIfExists(String name) {
172	0	String[] split = name.split("\\.");
173	0	if(split.length == 2 && prefixes.containsKey(split[0])) {
174	0	return new URIImpl(prefixes.get(split[0]) + split[1]);
175		}
176	0	return null;
177		}
178
179		public ExtractorDescription getDescription() {
180	0	return factory;
181		}
182
183	0	private class Meta {
184
185		private String xpath;
186
187		private URI name;
188
189		private String lang;
190
191		private String content;
192
193	0	public Meta(String xpath, URI name, String content) {
194	0	this.xpath = xpath;
195	0	this.name = name;
196	0	this.content = content;
197	0	}
198
199		public Meta(String xpath, URI name, String content, String lang) {
200	0	this(xpath, name, content);
201	0	this.lang = lang;
202	0	}
203
204		public URI getName() {
205	0	return name;
206		}
207
208		public void setName(URI name) {
209	0	this.name = name;
210	0	}
211
212		public String getLang() {
213	0	return lang;
214		}
215
216		public void setLang(String lang) {
217	0	this.lang = lang;
218	0	}
219
220		public String getContent() {
221	0	return content;
222		}
223
224		public void setContent(String content) {
225	0	this.content = content;
226	0	}
227
228		@Override
229		public boolean equals(Object o) {
230	0	if (this == o) return true;
231	0	if (o == null || getClass() != o.getClass()) return false;
232
233	0	Meta meta = (Meta) o;
234
235	0	if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) return false;
236
237	0	return true;
238		}
239
240		@Override
241		public int hashCode() {
242	0	return xpath != null ? xpath.hashCode() : 0;
243		}
244		}
245
246		}