Coverage Report

Coverage Report - org.apache.any23.extractor.microdata.MicrodataExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

MicrodataExtractor

0/171

0/102

5.059

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.microdata;
 
 import org.apache.any23.extractor.ErrorReporter;
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.SimpleExtractorFactory;
 import org.apache.any23.extractor.html.DomUtils;
 import org.apache.any23.rdf.PopularPrefixes;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.DCTERMS;
 import org.apache.any23.vocab.XHTML;
 import org.openrdf.model.Literal;
 import org.openrdf.model.Resource;
 import org.openrdf.model.URI;
 import org.openrdf.model.Value;
 import org.openrdf.model.vocabulary.RDF;
 import org.openrdf.model.vocabulary.XMLSchema;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 
 /**
  * Default implementation of <a href="http://www.w3.org/TR/microdata/">Microdata</a> extractor,
  * based on {@link TagSoupDOMExtractor}.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
  * @author Davide Palmisano ( dpalmisano@gmail.com )
  */
 public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
 
     private static final URI MICRODATA_ITEM
             = RDFUtils.uri("http://www.w3.org/1999/xhtml/microdata#item");
 
     public final static ExtractorFactory<MicrodataExtractor> factory =
             SimpleExtractorFactory.create(
                     "html-microdata",
                     PopularPrefixes.createSubset("rdf", "doac", "foaf"),
                     Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
                     "example-microdata.html",
                     MicrodataExtractor.class
             );
 
     private String documentLanguage;
 
     private boolean isStrict;
 
     private String defaultNamespace;
 
     public ExtractorDescription getDescription() {
         return factory;
     }
 
     /**
      * This extraction performs the
      * <a href="http://www.w3.org/TR/microdata/#rdf">Microdata to RDF conversion algorithm</a>.
      * A slight modification of the specification algorithm has been introduced
      * to avoid performing actions 5.2.1, 5.2.2, 5.2.3, 5.2.4 if step 5.2.6 doesn't detect any
      * Microdata.
      */
     public void run(
             ExtractionParameters extractionParameters,
             ExtractionContext extractionContext,
             Document in,
             ExtractionResult out
     ) throws IOException, ExtractionException {
 
         final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
         if(parserReport.getErrors().length > 0) {
             notifyError(parserReport.getErrors(), out);
         }
         final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
         if (itemScopes.length == 0) {
             return;
         }
 
         isStrict = extractionParameters.getFlag("any23.microdata.strict");
         if (!isStrict) {
             defaultNamespace = extractionParameters.getProperty("any23.microdata.ns.default");
         }
 
         documentLanguage = getDocumentLanguage(in);
 
         /**
          * 5.2.6
          */
         final URI documentURI = extractionContext.getDocumentURI();
         final Map<ItemScope, Resource> mappings = new HashMap<ItemScope, Resource>();
         for (ItemScope itemScope : itemScopes) {
             Resource subject = processType(itemScope, documentURI, out, mappings);
             out.writeTriple(
                     documentURI,
                     MICRODATA_ITEM,
                     subject
             );
         }
 
         /**
          * 5.2.1
          */
         processTitle(in, documentURI, out);
         /**
          * 5.2.2
          */
         processHREFElements(in, documentURI, out);
         /**
          * 5.2.3
          */
         processMetaElements(in, documentURI, out);
 
         /**
          * 5.2.4
          */
         processCiteElements(in, documentURI, out);
     }
 
     /**
      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
      *
      * @param in a instance of {@link Document}.
      * @return the language declared, could be <code>null</code>.
      */
     private String getDocumentLanguage(Document in) {
         String lang = DomUtils.find(in, "string(/HTML/@lang)");
         if (lang.equals("")) {
             return null;
         }
         return lang;
     }
 
     /**
      * Returns the {@link Node} language if declared, or the {@link Document} one
      * if not defined.
      *
      * @param node a {@link Node} instance.
      * @return the {@link Node} language or the {@link Document} one. Could be <code>null</code>
      */
     private String getLanguage(Node node) {
         Node nodeLang = node.getAttributes().getNamedItem("lang");
         if (nodeLang == null) {
             // if the element does not specify a lang, use the document one
             return documentLanguage;
         }
         return nodeLang.getTextContent();
     }
 
     /**
      * Implements step 5.2.1 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
      * extraction algorithm.
      *
      * @param in          {@link Document} to be processed.
      * @param documentURI Document current {@link URI}.
      * @param out         a valid not <code>null</code> {@link ExtractionResult}
      */
     private void processTitle(Document in, URI documentURI, ExtractionResult out) {
         NodeList titles = in.getElementsByTagName("title");
         // just one title is allowed.
         if (titles.getLength() == 1) {
             Node title = titles.item(0);
             String titleValue = title.getTextContent();
             Literal object;
             String lang = getLanguage(title);
             if (lang == null) {
                 // unable to decide the language, leave it unknown
                 object = RDFUtils.literal(titleValue);
             } else {
                 object = RDFUtils.literal(titleValue, lang);
             }
             out.writeTriple(
                     documentURI,
                     DCTERMS.getInstance().title,
                     object
             );
         }
     }
 
     /**
      * Implements step 5.2.2 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
      * extraction algorithm.
      *
      * @param in          {@link Document} to be processed.
      * @param documentURI Document current {@link URI}.
      * @param out         a valid not <code>null</code> {@link ExtractionResult}
      */
     private void processHREFElements(Document in, URI documentURI, ExtractionResult out) {
         NodeList anchors = in.getElementsByTagName("a");
         for (int i = 0; i < anchors.getLength(); i++) {
             processHREFElement(anchors.item(i), documentURI, out);
         }
         NodeList areas = in.getElementsByTagName("area");
         for (int i = 0; i < areas.getLength(); i++) {
             processHREFElement(areas.item(i), documentURI, out);
         }
         NodeList links = in.getElementsByTagName("link");
         for (int i = 0; i < links.getLength(); i++) {
             processHREFElement(links.item(i), documentURI, out);
         }
     }
 
     /**
      * Implements sub-step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
      * extraction algorithm.
      *
      * @param item        {@link Node} to be processed.
      * @param documentURI Document current {@link URI}.
      * @param out         a valid not <code>null</code> {@link ExtractionResult}
      */
     private void processHREFElement(Node item, URI documentURI, ExtractionResult out) {
         Node rel = item.getAttributes().getNamedItem("rel");
         if (rel == null) {
             return;
         }
         Node href = item.getAttributes().getNamedItem("href");
         if (href == null) {
             return;
         }
         URL absoluteURL;
         if (!isAbsoluteURL(href.getTextContent())) {
             try {
                 absoluteURL = toAbsoluteURL(
                         documentURI.toString(),
                         href.getTextContent(),
                         '/'
                 );
             } catch (MalformedURLException e) {
                 // okay, it's not an absolute URL, return
                 return;
             }
         } else {
             try {
                 absoluteURL = new URL(href.getTextContent());
             } catch (MalformedURLException e) {
                 // cannot happen
                 return;
             }
         }
         String[] relTokens = rel.getTextContent().split(" ");
         Set<String> tokensWithNoDuplicates = new HashSet<String>();
         for (String relToken : relTokens) {
             if (relToken.contains(":")) {
                 // if contain semi-colon, skip
                 continue;
             }
             if (relToken.equals("alternate") || relToken.equals("stylesheet")) {
                 tokensWithNoDuplicates.add("ALTERNATE-STYLESHEET");
                 continue;
             }
             tokensWithNoDuplicates.add(relToken.toLowerCase());
         }
         for (String token : tokensWithNoDuplicates) {
             URI predicate;
             if (isAbsoluteURL(token)) {
                 predicate = RDFUtils.uri(token);
             } else {
                 predicate = RDFUtils.uri(XHTML.NS + token);
             }
             out.writeTriple(
                     documentURI,
                     predicate,
                     RDFUtils.uri(absoluteURL.toString())
             );
         }
     }
 
     /**
      * Implements step 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
      * extraction algorithm.
      *
      * @param in          {@link Document} to be processed.
      * @param documentURI Document current {@link URI}.
      * @param out         a valid not <code>null</code> {@link ExtractionResult}
      */
     private void processMetaElements(Document in, URI documentURI, ExtractionResult out) {
         NodeList metas = in.getElementsByTagName("meta");
         for (int i = 0; i < metas.getLength(); i++) {
             Node meta = metas.item(i);
             String name    = DomUtils.readAttribute(meta, "name"   , null);
             String content = DomUtils.readAttribute(meta, "content", null);
             if (name != null && content != null) {
                 if (isAbsoluteURL(name)) {
                     processMetaElement(
                             RDFUtils.uri(name),
                             content,
                             getLanguage(meta),
                             documentURI,
                             out
                     );
                 } else {
                     processMetaElement(
                             name,
                             content,
                             getLanguage(meta),
                             documentURI,
                             out
                     );
                 }
             }
         }
     }
 
     /**
      * Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
      * extraction algorithm.
      *
      * @param uri
      * @param content
      * @param language
      * @param documentURI
      * @param out
      */
     private void processMetaElement(
             URI uri,
             String content,
             String language,
             URI documentURI,
             ExtractionResult out
     ) {
         if (content.contains(":")) {
             // if it contains U+003A COLON, exit
             return;
         }
         Literal subject;
         if (language == null) {
             // ok, we don't know the language
             subject = RDFUtils.literal(content);
         } else {
             subject = RDFUtils.literal(content, language);
         }
         out.writeTriple(
                 documentURI,
                 uri,
                 subject
         );
     }
 
     /**
      * Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
      * extraction algorithm.
      *
      * @param name
      * @param content
      * @param language
      * @param documentURI
      * @param out
      */
     private void processMetaElement(
             String name,
             String content,
             String language,
             URI documentURI,
             ExtractionResult out) {
         Literal subject;
         if (language == null) {
             // ok, we don't know the language
             subject = RDFUtils.literal(content);
         } else {
             subject = RDFUtils.literal(content, language);
         }
         out.writeTriple(
                 documentURI,
                 RDFUtils.uri(XHTML.NS + name.toLowerCase()),
                 subject
         );
     }
 
     /**
      * Implements sub step for 5.2.4 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
      * extraction algorithm.
      *
      * @param in
      * @param documentURI
      * @param out
      */
     private void processCiteElements(Document in, URI documentURI, ExtractionResult out) {
         NodeList blockQuotes = in.getElementsByTagName("blockquote");
         for (int i = 0; i < blockQuotes.getLength(); i++) {
             processCiteElement(blockQuotes.item(i), documentURI, out);
         }
         NodeList quotes = in.getElementsByTagName("q");
         for (int i = 0; i < quotes.getLength(); i++) {
             processCiteElement(quotes.item(i), documentURI, out);
         }
     }
 
     private void processCiteElement(Node item, URI documentURI, ExtractionResult out) {
         if (item.getAttributes().getNamedItem("cite") != null) {
             out.writeTriple(
                     documentURI,
                     DCTERMS.getInstance().source,
                     RDFUtils.uri(item.getAttributes().getNamedItem("cite").getTextContent())
             );
         }
     }
 
     /**
      * Recursive method implementing 5.2.6.1 "generate the triple for the item" of
      * <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
      * extraction algorithm.
      *
      * @param itemScope
      * @param documentURI
      * @param out
      * @param mappings
      * @return
      * @throws ExtractionException
      */
     private Resource processType(
             ItemScope itemScope,
             URI documentURI, ExtractionResult out,
             Map<ItemScope, Resource> mappings
     ) throws ExtractionException {
         Resource subject;
         if (mappings.containsKey(itemScope)) {
             subject = mappings.get(itemScope);
         } else if (isAbsoluteURL(itemScope.getItemId())) {
             subject = RDFUtils.uri(itemScope.getItemId());
         } else {
             subject = RDFUtils.getBNode(Integer.toString(itemScope.hashCode()));
         }
         mappings.put(itemScope, subject);
 
         // ItemScope.type could be null, but surely it's a valid URL
         String itemScopeType = "";
         if (itemScope.getType() != null) {
             String itemType;
             itemType = itemScope.getType().toString();
             out.writeTriple(subject, RDF.TYPE, RDFUtils.uri(itemType));
             itemScopeType = itemScope.getType().toString();
         }
         for (String propName : itemScope.getProperties().keySet()) {
             List<ItemProp> itemProps = itemScope.getProperties().get(propName);
             for (ItemProp itemProp : itemProps) {
                 try {
                     processProperty(
                             subject,
                             propName,
                             itemProp,
                             itemScopeType,
                             documentURI,
                             mappings,
                             out
                     );
                 } catch (MalformedURLException e) {
                     throw new ExtractionException(
                             "Error while processing on subject '" + subject +
                                     "' the itemProp: '" + itemProp + "' "
                     );
                 }
             }
         }
         return subject;
     }
 
     private void processProperty(
             Resource subject,
             String propName,
             ItemProp itemProp,
             String itemScopeType,
             URI documentURI,
             Map<ItemScope, Resource> mappings,
             ExtractionResult out
     ) throws MalformedURLException, ExtractionException {
         URI predicate;
         if (!isAbsoluteURL(propName) && itemScopeType.equals("") && isStrict) {
             return;
         } else if (!isAbsoluteURL(propName) && itemScopeType.equals("") && !isStrict) {
             predicate = RDFUtils.uri(
                     toAbsoluteURL(
                             defaultNamespace,
                             propName,
                             '/'
                     ).toString()
             );
         } else {
             predicate = RDFUtils.uri(
                     toAbsoluteURL(
                             itemScopeType,
                             propName,
                             '/'
                     ).toString());
         }
         Value value;
         Object propValue = itemProp.getValue().getContent();
         ItemPropValue.Type propType = itemProp.getValue().getType();
         if (propType.equals(ItemPropValue.Type.Nested)) {
             value = processType((ItemScope) propValue, documentURI, out, mappings);
         } else if (propType.equals(ItemPropValue.Type.Plain)) {
             value = RDFUtils.literal((String) propValue, documentLanguage);
         } else if (propType.equals(ItemPropValue.Type.Link)) {
             value = RDFUtils.uri(
                     toAbsoluteURL(
                             documentURI.toString(),
                             (String) propValue,
                             '/'
                     ).toString()
             );
         } else if (propType.equals(ItemPropValue.Type.Date)) {
             value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
         } else {
             throw new RuntimeException("Invalid Type '" +
                     propType + "' for ItemPropValue with name: '" + propName + "'");
         }
         out.writeTriple(subject, predicate, value);
     }
 
     private boolean isAbsoluteURL(String urlString) {
         boolean result = false;
         try {
             URL url = new URL(urlString);
             String protocol = url.getProtocol();
             if (protocol != null && protocol.trim().length() > 0)
                 result = true;
         } catch (MalformedURLException e) {
             return false;
         }
         return result;
     }
 
     private URL toAbsoluteURL(String ns, String part, char trailing)
             throws MalformedURLException {
         if (isAbsoluteURL(part)) {
             return new URL(part);
         }
         char lastChar = ns.charAt(ns.length() - 1);
         if (lastChar == '#' || lastChar == '/')
             return new URL(ns + part);
         return new URL(ns + trailing + part);
     }
 
     private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
         for(MicrodataParserException mpe : errors) {
             out.notifyError(
                     ErrorReporter.ErrorLevel.ERROR,
                     mpe.toJSON(),
                     mpe.getErrorLocationBeginRow() ,
                     mpe.getErrorLocationBeginCol()
             );
         }
     }
 
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.microdata;
19
20		import org.apache.any23.extractor.ErrorReporter;
21		import org.apache.any23.extractor.ExtractionContext;
22		import org.apache.any23.extractor.ExtractionException;
23		import org.apache.any23.extractor.ExtractionParameters;
24		import org.apache.any23.extractor.ExtractionResult;
25		import org.apache.any23.extractor.Extractor;
26		import org.apache.any23.extractor.ExtractorDescription;
27		import org.apache.any23.extractor.ExtractorFactory;
28		import org.apache.any23.extractor.SimpleExtractorFactory;
29		import org.apache.any23.extractor.html.DomUtils;
30		import org.apache.any23.rdf.PopularPrefixes;
31		import org.apache.any23.rdf.RDFUtils;
32		import org.apache.any23.vocab.DCTERMS;
33		import org.apache.any23.vocab.XHTML;
34		import org.openrdf.model.Literal;
35		import org.openrdf.model.Resource;
36		import org.openrdf.model.URI;
37		import org.openrdf.model.Value;
38		import org.openrdf.model.vocabulary.RDF;
39		import org.openrdf.model.vocabulary.XMLSchema;
40		import org.w3c.dom.Document;
41		import org.w3c.dom.Node;
42		import org.w3c.dom.NodeList;
43
44		import java.io.IOException;
45		import java.net.MalformedURLException;
46		import java.net.URL;
47		import java.util.Arrays;
48		import java.util.Date;
49		import java.util.HashMap;
50		import java.util.HashSet;
51		import java.util.List;
52		import java.util.Map;
53		import java.util.Set;
54
55		/**
56		* Default implementation of <a href="http://www.w3.org/TR/microdata/">Microdata</a> extractor,
57		* based on {@link TagSoupDOMExtractor}.
58		*
59		* @author Michele Mostarda (mostarda@fbk.eu)
60		* @author Davide Palmisano ( dpalmisano@gmail.com )
61		*/
62	0	public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
63
64	0	private static final URI MICRODATA_ITEM
65		= RDFUtils.uri("http://www.w3.org/1999/xhtml/microdata#item");
66
67	0	public final static ExtractorFactory<MicrodataExtractor> factory =
68		SimpleExtractorFactory.create(
69		"html-microdata",
70		PopularPrefixes.createSubset("rdf", "doac", "foaf"),
71		Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
72		"example-microdata.html",
73		MicrodataExtractor.class
74		);
75
76		private String documentLanguage;
77
78		private boolean isStrict;
79
80		private String defaultNamespace;
81
82		public ExtractorDescription getDescription() {
83	0	return factory;
84		}
85
86		/**
87		* This extraction performs the
88		* <a href="http://www.w3.org/TR/microdata/#rdf">Microdata to RDF conversion algorithm</a>.
89		* A slight modification of the specification algorithm has been introduced
90		* to avoid performing actions 5.2.1, 5.2.2, 5.2.3, 5.2.4 if step 5.2.6 doesn't detect any
91		* Microdata.
92		*/
93		public void run(
94		ExtractionParameters extractionParameters,
95		ExtractionContext extractionContext,
96		Document in,
97		ExtractionResult out
98		) throws IOException, ExtractionException {
99
100	0	final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
101	0	if(parserReport.getErrors().length > 0) {
102	0	notifyError(parserReport.getErrors(), out);
103		}
104	0	final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
105	0	if (itemScopes.length == 0) {
106	0	return;
107		}
108
109	0	isStrict = extractionParameters.getFlag("any23.microdata.strict");
110	0	if (!isStrict) {
111	0	defaultNamespace = extractionParameters.getProperty("any23.microdata.ns.default");
112		}
113
114	0	documentLanguage = getDocumentLanguage(in);
115
116		/**
117		* 5.2.6
118		*/
119	0	final URI documentURI = extractionContext.getDocumentURI();
120	0	final Map<ItemScope, Resource> mappings = new HashMap<ItemScope, Resource>();
121	0	for (ItemScope itemScope : itemScopes) {
122	0	Resource subject = processType(itemScope, documentURI, out, mappings);
123	0	out.writeTriple(
124		documentURI,
125		MICRODATA_ITEM,
126		subject
127		);
128		}
129
130		/**
131		* 5.2.1
132		*/
133	0	processTitle(in, documentURI, out);
134		/**
135		* 5.2.2
136		*/
137	0	processHREFElements(in, documentURI, out);
138		/**
139		* 5.2.3
140		*/
141	0	processMetaElements(in, documentURI, out);
142
143		/**
144		* 5.2.4
145		*/
146	0	processCiteElements(in, documentURI, out);
147	0	}
148
149		/**
150		* Returns the {@link Document} language if declared, <code>null</code> otherwise.
151		*
152		* @param in a instance of {@link Document}.
153		* @return the language declared, could be <code>null</code>.
154		*/
155		private String getDocumentLanguage(Document in) {
156	0	String lang = DomUtils.find(in, "string(/HTML/@lang)");
157	0	if (lang.equals("")) {
158	0	return null;
159		}
160	0	return lang;
161		}
162
163		/**
164		* Returns the {@link Node} language if declared, or the {@link Document} one
165		* if not defined.
166		*
167		* @param node a {@link Node} instance.
168		* @return the {@link Node} language or the {@link Document} one. Could be <code>null</code>
169		*/
170		private String getLanguage(Node node) {
171	0	Node nodeLang = node.getAttributes().getNamedItem("lang");
172	0	if (nodeLang == null) {
173		// if the element does not specify a lang, use the document one
174	0	return documentLanguage;
175		}
176	0	return nodeLang.getTextContent();
177		}
178
179		/**
180		* Implements step 5.2.1 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
181		* extraction algorithm.
182		*
183		* @param in {@link Document} to be processed.
184		* @param documentURI Document current {@link URI}.
185		* @param out a valid not <code>null</code> {@link ExtractionResult}
186		*/
187		private void processTitle(Document in, URI documentURI, ExtractionResult out) {
188	0	NodeList titles = in.getElementsByTagName("title");
189		// just one title is allowed.
190	0	if (titles.getLength() == 1) {
191	0	Node title = titles.item(0);
192	0	String titleValue = title.getTextContent();
193		Literal object;
194	0	String lang = getLanguage(title);
195	0	if (lang == null) {
196		// unable to decide the language, leave it unknown
197	0	object = RDFUtils.literal(titleValue);
198		} else {
199	0	object = RDFUtils.literal(titleValue, lang);
200		}
201	0	out.writeTriple(
202		documentURI,
203		DCTERMS.getInstance().title,
204		object
205		);
206		}
207	0	}
208
209		/**
210		* Implements step 5.2.2 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
211		* extraction algorithm.
212		*
213		* @param in {@link Document} to be processed.
214		* @param documentURI Document current {@link URI}.
215		* @param out a valid not <code>null</code> {@link ExtractionResult}
216		*/
217		private void processHREFElements(Document in, URI documentURI, ExtractionResult out) {
218	0	NodeList anchors = in.getElementsByTagName("a");
219	0	for (int i = 0; i < anchors.getLength(); i++) {
220	0	processHREFElement(anchors.item(i), documentURI, out);
221		}
222	0	NodeList areas = in.getElementsByTagName("area");
223	0	for (int i = 0; i < areas.getLength(); i++) {
224	0	processHREFElement(areas.item(i), documentURI, out);
225		}
226	0	NodeList links = in.getElementsByTagName("link");
227	0	for (int i = 0; i < links.getLength(); i++) {
228	0	processHREFElement(links.item(i), documentURI, out);
229		}
230	0	}
231
232		/**
233		* Implements sub-step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
234		* extraction algorithm.
235		*
236		* @param item {@link Node} to be processed.
237		* @param documentURI Document current {@link URI}.
238		* @param out a valid not <code>null</code> {@link ExtractionResult}
239		*/
240		private void processHREFElement(Node item, URI documentURI, ExtractionResult out) {
241	0	Node rel = item.getAttributes().getNamedItem("rel");
242	0	if (rel == null) {
243	0	return;
244		}
245	0	Node href = item.getAttributes().getNamedItem("href");
246	0	if (href == null) {
247	0	return;
248		}
249		URL absoluteURL;
250	0	if (!isAbsoluteURL(href.getTextContent())) {
251		try {
252	0	absoluteURL = toAbsoluteURL(
253		documentURI.toString(),
254		href.getTextContent(),
255		'/'
256		);
257	0	} catch (MalformedURLException e) {
258		// okay, it's not an absolute URL, return
259	0	return;
260	0	}
261		} else {
262		try {
263	0	absoluteURL = new URL(href.getTextContent());
264	0	} catch (MalformedURLException e) {
265		// cannot happen
266	0	return;
267	0	}
268		}
269	0	String[] relTokens = rel.getTextContent().split(" ");
270	0	Set<String> tokensWithNoDuplicates = new HashSet<String>();
271	0	for (String relToken : relTokens) {
272	0	if (relToken.contains(":")) {
273		// if contain semi-colon, skip
274	0	continue;
275		}
276	0	if (relToken.equals("alternate") || relToken.equals("stylesheet")) {
277	0	tokensWithNoDuplicates.add("ALTERNATE-STYLESHEET");
278	0	continue;
279		}
280	0	tokensWithNoDuplicates.add(relToken.toLowerCase());
281		}
282	0	for (String token : tokensWithNoDuplicates) {
283		URI predicate;
284	0	if (isAbsoluteURL(token)) {
285	0	predicate = RDFUtils.uri(token);
286		} else {
287	0	predicate = RDFUtils.uri(XHTML.NS + token);
288		}
289	0	out.writeTriple(
290		documentURI,
291		predicate,
292		RDFUtils.uri(absoluteURL.toString())
293		);
294	0	}
295	0	}
296
297		/**
298		* Implements step 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
299		* extraction algorithm.
300		*
301		* @param in {@link Document} to be processed.
302		* @param documentURI Document current {@link URI}.
303		* @param out a valid not <code>null</code> {@link ExtractionResult}
304		*/
305		private void processMetaElements(Document in, URI documentURI, ExtractionResult out) {
306	0	NodeList metas = in.getElementsByTagName("meta");
307	0	for (int i = 0; i < metas.getLength(); i++) {
308	0	Node meta = metas.item(i);
309	0	String name = DomUtils.readAttribute(meta, "name" , null);
310	0	String content = DomUtils.readAttribute(meta, "content", null);
311	0	if (name != null && content != null) {
312	0	if (isAbsoluteURL(name)) {
313	0	processMetaElement(
314		RDFUtils.uri(name),
315		content,
316		getLanguage(meta),
317		documentURI,
318		out
319		);
320		} else {
321	0	processMetaElement(
322		name,
323		content,
324		getLanguage(meta),
325		documentURI,
326		out
327		);
328		}
329		}
330		}
331	0	}
332
333		/**
334		* Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
335		* extraction algorithm.
336		*
337		* @param uri
338		* @param content
339		* @param language
340		* @param documentURI
341		* @param out
342		*/
343		private void processMetaElement(
344		URI uri,
345		String content,
346		String language,
347		URI documentURI,
348		ExtractionResult out
349		) {
350	0	if (content.contains(":")) {
351		// if it contains U+003A COLON, exit
352	0	return;
353		}
354		Literal subject;
355	0	if (language == null) {
356		// ok, we don't know the language
357	0	subject = RDFUtils.literal(content);
358		} else {
359	0	subject = RDFUtils.literal(content, language);
360		}
361	0	out.writeTriple(
362		documentURI,
363		uri,
364		subject
365		);
366	0	}
367
368		/**
369		* Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
370		* extraction algorithm.
371		*
372		* @param name
373		* @param content
374		* @param language
375		* @param documentURI
376		* @param out
377		*/
378		private void processMetaElement(
379		String name,
380		String content,
381		String language,
382		URI documentURI,
383		ExtractionResult out) {
384		Literal subject;
385	0	if (language == null) {
386		// ok, we don't know the language
387	0	subject = RDFUtils.literal(content);
388		} else {
389	0	subject = RDFUtils.literal(content, language);
390		}
391	0	out.writeTriple(
392		documentURI,
393		RDFUtils.uri(XHTML.NS + name.toLowerCase()),
394		subject
395		);
396	0	}
397
398		/**
399		* Implements sub step for 5.2.4 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
400		* extraction algorithm.
401		*
402		* @param in
403		* @param documentURI
404		* @param out
405		*/
406		private void processCiteElements(Document in, URI documentURI, ExtractionResult out) {
407	0	NodeList blockQuotes = in.getElementsByTagName("blockquote");
408	0	for (int i = 0; i < blockQuotes.getLength(); i++) {
409	0	processCiteElement(blockQuotes.item(i), documentURI, out);
410		}
411	0	NodeList quotes = in.getElementsByTagName("q");
412	0	for (int i = 0; i < quotes.getLength(); i++) {
413	0	processCiteElement(quotes.item(i), documentURI, out);
414		}
415	0	}
416
417		private void processCiteElement(Node item, URI documentURI, ExtractionResult out) {
418	0	if (item.getAttributes().getNamedItem("cite") != null) {
419	0	out.writeTriple(
420		documentURI,
421		DCTERMS.getInstance().source,
422		RDFUtils.uri(item.getAttributes().getNamedItem("cite").getTextContent())
423		);
424		}
425	0	}
426
427		/**
428		* Recursive method implementing 5.2.6.1 "generate the triple for the item" of
429		* <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
430		* extraction algorithm.
431		*
432		* @param itemScope
433		* @param documentURI
434		* @param out
435		* @param mappings
436		* @return
437		* @throws ExtractionException
438		*/
439		private Resource processType(
440		ItemScope itemScope,
441		URI documentURI, ExtractionResult out,
442		Map<ItemScope, Resource> mappings
443		) throws ExtractionException {
444		Resource subject;
445	0	if (mappings.containsKey(itemScope)) {
446	0	subject = mappings.get(itemScope);
447	0	} else if (isAbsoluteURL(itemScope.getItemId())) {
448	0	subject = RDFUtils.uri(itemScope.getItemId());
449		} else {
450	0	subject = RDFUtils.getBNode(Integer.toString(itemScope.hashCode()));
451		}
452	0	mappings.put(itemScope, subject);
453
454		// ItemScope.type could be null, but surely it's a valid URL
455	0	String itemScopeType = "";
456	0	if (itemScope.getType() != null) {
457		String itemType;
458	0	itemType = itemScope.getType().toString();
459	0	out.writeTriple(subject, RDF.TYPE, RDFUtils.uri(itemType));
460	0	itemScopeType = itemScope.getType().toString();
461		}
462	0	for (String propName : itemScope.getProperties().keySet()) {
463	0	List<ItemProp> itemProps = itemScope.getProperties().get(propName);
464	0	for (ItemProp itemProp : itemProps) {
465		try {
466	0	processProperty(
467		subject,
468		propName,
469		itemProp,
470		itemScopeType,
471		documentURI,
472		mappings,
473		out
474		);
475	0	} catch (MalformedURLException e) {
476	0	throw new ExtractionException(
477		"Error while processing on subject '" + subject +
478		"' the itemProp: '" + itemProp + "' "
479		);
480	0	}
481		}
482	0	}
483	0	return subject;
484		}
485
486		private void processProperty(
487		Resource subject,
488		String propName,
489		ItemProp itemProp,
490		String itemScopeType,
491		URI documentURI,
492		Map<ItemScope, Resource> mappings,
493		ExtractionResult out
494		) throws MalformedURLException, ExtractionException {
495		URI predicate;
496	0	if (!isAbsoluteURL(propName) && itemScopeType.equals("") && isStrict) {
497	0	return;
498	0	} else if (!isAbsoluteURL(propName) && itemScopeType.equals("") && !isStrict) {
499	0	predicate = RDFUtils.uri(
500		toAbsoluteURL(
501		defaultNamespace,
502		propName,
503		'/'
504		).toString()
505		);
506		} else {
507	0	predicate = RDFUtils.uri(
508		toAbsoluteURL(
509		itemScopeType,
510		propName,
511		'/'
512		).toString());
513		}
514		Value value;
515	0	Object propValue = itemProp.getValue().getContent();
516	0	ItemPropValue.Type propType = itemProp.getValue().getType();
517	0	if (propType.equals(ItemPropValue.Type.Nested)) {
518	0	value = processType((ItemScope) propValue, documentURI, out, mappings);
519	0	} else if (propType.equals(ItemPropValue.Type.Plain)) {
520	0	value = RDFUtils.literal((String) propValue, documentLanguage);
521	0	} else if (propType.equals(ItemPropValue.Type.Link)) {
522	0	value = RDFUtils.uri(
523		toAbsoluteURL(
524		documentURI.toString(),
525		(String) propValue,
526		'/'
527		).toString()
528		);
529	0	} else if (propType.equals(ItemPropValue.Type.Date)) {
530	0	value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
531		} else {
532	0	throw new RuntimeException("Invalid Type '" +
533		propType + "' for ItemPropValue with name: '" + propName + "'");
534		}
535	0	out.writeTriple(subject, predicate, value);
536	0	}
537
538		private boolean isAbsoluteURL(String urlString) {
539	0	boolean result = false;
540		try {
541	0	URL url = new URL(urlString);
542	0	String protocol = url.getProtocol();
543	0	if (protocol != null && protocol.trim().length() > 0)
544	0	result = true;
545	0	} catch (MalformedURLException e) {
546	0	return false;
547	0	}
548	0	return result;
549		}
550
551		private URL toAbsoluteURL(String ns, String part, char trailing)
552		throws MalformedURLException {
553	0	if (isAbsoluteURL(part)) {
554	0	return new URL(part);
555		}
556	0	char lastChar = ns.charAt(ns.length() - 1);
557	0	if (lastChar == '#' \|\| lastChar == '/')
558	0	return new URL(ns + part);
559	0	return new URL(ns + trailing + part);
560		}
561
562		private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
563	0	for(MicrodataParserException mpe : errors) {
564	0	out.notifyError(
565		ErrorReporter.ErrorLevel.ERROR,
566		mpe.toJSON(),
567		mpe.getErrorLocationBeginRow() ,
568		mpe.getErrorLocationBeginCol()
569		);
570		}
571	0	}
572
573		}