Coverage Report

Coverage Report - org.apache.any23.extractor.html.HeadLinkExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

HeadLinkExtractor

0/18

0/10

3.5

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.html;
 
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.SimpleExtractorFactory;
 import org.apache.any23.rdf.PopularPrefixes;
 import org.apache.any23.vocab.XHTML;
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 import org.openrdf.model.URI;
 import org.openrdf.model.ValueFactory;
 import org.openrdf.model.impl.ValueFactoryImpl;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
 
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 
 /**
  * This {@link org.apache.any23.extractor.Extractor.TagSoupDOMExtractor} implementation
  * retrieves the <code>LINK</code>s declared within the <code>HTML/HEAD</code> page header.
  */
 public class HeadLinkExtractor implements TagSoupDOMExtractor {
 
     public void run(
             ExtractionParameters extractionParameters,
             ExtractionContext extractionContext,
             Document in,
             ExtractionResult out
     ) throws IOException, ExtractionException {
         HTMLDocument html = new HTMLDocument(in);
         ValueFactory vf = ValueFactoryImpl.getInstance();
 
         final List<Node> headLinkNodes = DomUtils.findAll(
                 in,
                 "/HTML/HEAD/LINK[(" +
                         "@type='application/rdf+xml' or " +
                         "@type='text/rdf' or " +
                         "@type='application/x-turtle' or " +
                         "@type='application/turtle' or " +
                         "@type='text/turtle' or " +
                         "@type='text/rdf+n3'" +
                         ") and @href and @rel]"
         );
         for (Node node : headLinkNodes) {
             final URI href = html.resolveURI(DomUtils.find(node, "@href"));
             final String rel = DomUtils.find(node, "@rel");
             out.writeTriple(
                     extractionContext.getDocumentURI(),
                     vf.createURI(XHTML.NS + rel),
                     href
             );
             final String title = DomUtils.find(node, "@title");
             if (title != null && !"".equals(title)) {
                 out.writeTriple(
                         href,
                         factory.getPrefixes().expand("dcterms:title"),
                         vf.createLiteral(title)
                 );
             }
             final String type = DomUtils.find(node, "@type");
             if (type != null && !"".equals(type)) {
                 out.writeTriple(
                         href,
                         factory.getPrefixes().expand("dcterms:format"),
                         vf.createLiteral(type)
                 );
             }
         }
     }
 
     public ExtractorDescription getDescription() {
         return factory;
     }
 
     public final static ExtractorFactory<HeadLinkExtractor> factory =
             SimpleExtractorFactory.create(
                     "html-head-links",
                     PopularPrefixes.createSubset("xhtml", "dcterms"),
                     Arrays.asList("text/html;q=0.05", "application/xhtml+xml;q=0.05"),
                     "example-head-link.html",
                     HeadLinkExtractor.class);
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.html;
19
20		import org.apache.any23.extractor.ExtractionContext;
21		import org.apache.any23.extractor.ExtractionException;
22		import org.apache.any23.extractor.ExtractionParameters;
23		import org.apache.any23.extractor.ExtractionResult;
24		import org.apache.any23.extractor.ExtractorDescription;
25		import org.apache.any23.extractor.ExtractorFactory;
26		import org.apache.any23.extractor.SimpleExtractorFactory;
27		import org.apache.any23.rdf.PopularPrefixes;
28		import org.apache.any23.vocab.XHTML;
29		import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30		import org.openrdf.model.URI;
31		import org.openrdf.model.ValueFactory;
32		import org.openrdf.model.impl.ValueFactoryImpl;
33		import org.w3c.dom.Document;
34		import org.w3c.dom.Node;
35
36		import java.io.IOException;
37		import java.util.Arrays;
38		import java.util.List;
39
40		/**
41		* This {@link org.apache.any23.extractor.Extractor.TagSoupDOMExtractor} implementation
42		* retrieves the <code>LINK</code>s declared within the <code>HTML/HEAD</code> page header.
43		*/
44	0	public class HeadLinkExtractor implements TagSoupDOMExtractor {
45
46		public void run(
47		ExtractionParameters extractionParameters,
48		ExtractionContext extractionContext,
49		Document in,
50		ExtractionResult out
51		) throws IOException, ExtractionException {
52	0	HTMLDocument html = new HTMLDocument(in);
53	0	ValueFactory vf = ValueFactoryImpl.getInstance();
54
55	0	final List<Node> headLinkNodes = DomUtils.findAll(
56		in,
57		"/HTML/HEAD/LINK[(" +
58		"@type='application/rdf+xml' or " +
59		"@type='text/rdf' or " +
60		"@type='application/x-turtle' or " +
61		"@type='application/turtle' or " +
62		"@type='text/turtle' or " +
63		"@type='text/rdf+n3'" +
64		") and @href and @rel]"
65		);
66	0	for (Node node : headLinkNodes) {
67	0	final URI href = html.resolveURI(DomUtils.find(node, "@href"));
68	0	final String rel = DomUtils.find(node, "@rel");
69	0	out.writeTriple(
70		extractionContext.getDocumentURI(),
71		vf.createURI(XHTML.NS + rel),
72		href
73		);
74	0	final String title = DomUtils.find(node, "@title");
75	0	if (title != null && !"".equals(title)) {
76	0	out.writeTriple(
77		href,
78		factory.getPrefixes().expand("dcterms:title"),
79		vf.createLiteral(title)
80		);
81		}
82	0	final String type = DomUtils.find(node, "@type");
83	0	if (type != null && !"".equals(type)) {
84	0	out.writeTriple(
85		href,
86		factory.getPrefixes().expand("dcterms:format"),
87		vf.createLiteral(type)
88		);
89		}
90	0	}
91	0	}
92
93		public ExtractorDescription getDescription() {
94	0	return factory;
95		}
96
97	0	public final static ExtractorFactory<HeadLinkExtractor> factory =
98		SimpleExtractorFactory.create(
99		"html-head-links",
100		PopularPrefixes.createSubset("xhtml", "dcterms"),
101		Arrays.asList("text/html;q=0.05", "application/xhtml+xml;q=0.05"),
102		"example-head-link.html",
103		HeadLinkExtractor.class);
104		}