Coverage Report

Coverage Report - org.apache.any23.extractor.html.TitleExtractor

Classes in this File

Line coverage: 0/11

Branch coverage: 0/4

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.html;
 
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.SimpleExtractorFactory;
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 import org.apache.any23.rdf.PopularPrefixes;
 import org.apache.any23.vocab.DCTERMS;
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 import org.openrdf.model.impl.ValueFactoryImpl;
 import org.w3c.dom.Document;
 
 import java.io.IOException;
 import java.util.Arrays;
 
 /**
  * Extracts the value of the &lt;title&gt; element of an
  * HTML or XHTML page.
  *
  * @author Richard Cyganiak (richard@cyganiak.de)
  */
 public class TitleExtractor implements TagSoupDOMExtractor {
 
     public static final String NAME = "html-head-title";
 
     private static final DCTERMS vDCTERMS = DCTERMS.getInstance();
 
     public final static ExtractorFactory<TitleExtractor> factory =
             SimpleExtractorFactory.create(
                     NAME,
                     PopularPrefixes.createSubset("dcterms"),
                     Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
                     "example-title.html",
                     TitleExtractor.class
             );
 
     public void run(
             ExtractionParameters extractionParameters,
             ExtractionContext extractionContext,
             Document in,
             ExtractionResult out
     ) throws IOException, ExtractionException {
         final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
             ValueFactoryImpl.getInstance(), out, extractionContext.getDefaultLanguage()
         );
         
         try {
             String title = DomUtils.find(in, "/HTML/HEAD/TITLE/text()").trim();
             if (title != null && (title.length() != 0)) {
                 out.writeTriple(extractionContext.getDocumentURI(), vDCTERMS.title, valueFactory.createLiteral(title));
             }
         } finally {
             valueFactory.setErrorReporter(null);
         }
     }
 
     public ExtractorDescription getDescription() {
         return factory;
     }
     
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.html;
19
20		import org.apache.any23.extractor.ExtractionContext;
21		import org.apache.any23.extractor.ExtractionException;
22		import org.apache.any23.extractor.ExtractionParameters;
23		import org.apache.any23.extractor.ExtractionResult;
24		import org.apache.any23.extractor.ExtractorDescription;
25		import org.apache.any23.extractor.ExtractorFactory;
26		import org.apache.any23.extractor.SimpleExtractorFactory;
27		import org.apache.any23.rdf.Any23ValueFactoryWrapper;
28		import org.apache.any23.rdf.PopularPrefixes;
29		import org.apache.any23.vocab.DCTERMS;
30		import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
31		import org.openrdf.model.impl.ValueFactoryImpl;
32		import org.w3c.dom.Document;
33
34		import java.io.IOException;
35		import java.util.Arrays;
36
37		/**
38		* Extracts the value of the <title> element of an
39		* HTML or XHTML page.
40		*
41		* @author Richard Cyganiak (richard@cyganiak.de)
42		*/
43	0	public class TitleExtractor implements TagSoupDOMExtractor {
44
45		public static final String NAME = "html-head-title";
46
47	0	private static final DCTERMS vDCTERMS = DCTERMS.getInstance();
48
49	0	public final static ExtractorFactory<TitleExtractor> factory =
50		SimpleExtractorFactory.create(
51		NAME,
52		PopularPrefixes.createSubset("dcterms"),
53		Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
54		"example-title.html",
55		TitleExtractor.class
56		);
57
58		public void run(
59		ExtractionParameters extractionParameters,
60		ExtractionContext extractionContext,
61		Document in,
62		ExtractionResult out
63		) throws IOException, ExtractionException {
64	0	final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
65		ValueFactoryImpl.getInstance(), out, extractionContext.getDefaultLanguage()
66		);
67
68		try {
69	0	String title = DomUtils.find(in, "/HTML/HEAD/TITLE/text()").trim();
70	0	if (title != null && (title.length() != 0)) {
71	0	out.writeTriple(extractionContext.getDocumentURI(), vDCTERMS.title, valueFactory.createLiteral(title));
72		}
73		} finally {
74	0	valueFactory.setErrorReporter(null);
75	0	}
76	0	}
77
78		public ExtractorDescription getDescription() {
79	0	return factory;
80		}
81
82		}