Coverage Report - org.apache.any23.extractor.html.HCalendarExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
HCalendarExtractor
0%
0/76
0%
0/24
2.5
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionException;
 21  
 import org.apache.any23.extractor.ExtractorDescription;
 22  
 import org.apache.any23.extractor.ExtractorFactory;
 23  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 24  
 import org.apache.any23.extractor.TagSoupExtractionResult;
 25  
 import org.apache.any23.rdf.PopularPrefixes;
 26  
 import org.apache.any23.rdf.RDFUtils;
 27  
 import org.apache.any23.vocab.ICAL;
 28  
 import org.openrdf.model.BNode;
 29  
 import org.openrdf.model.Resource;
 30  
 import org.openrdf.model.URI;
 31  
 import org.openrdf.model.vocabulary.RDF;
 32  
 import org.w3c.dom.Node;
 33  
 
 34  
 import javax.xml.datatype.DatatypeConfigurationException;
 35  
 import java.text.ParseException;
 36  
 import java.util.Arrays;
 37  
 import java.util.List;
 38  
 
 39  
 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
 40  
 
 41  
 
 42  
 /**
 43  
  * Extractor for the <a href="http://microformats.org/wiki/hcalendar">hCalendar</a>
 44  
  * microformat.
 45  
  *
 46  
  * @author Gabriele Renzi
 47  
  */
 48  0
 public class HCalendarExtractor extends MicroformatExtractor {
 49  
 
 50  0
     private static final ICAL vICAL = ICAL.getInstance();
 51  
 
 52  0
     public final static ExtractorFactory<HCalendarExtractor> factory =
 53  
             SimpleExtractorFactory.create(
 54  
                     "html-mf-hcalendar",
 55  
                     PopularPrefixes.createSubset("rdf", "ical"),
 56  
                     Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
 57  
                     "example-mf-hcalendar.html",
 58  
                     HCalendarExtractor.class);
 59  
 
 60  0
     private static final String[] Components = {"Vevent", "Vtodo", "Vjournal", "Vfreebusy"};
 61  
 
 62  
     private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'";
 63  
 
 64  0
     private String[] textSingularProps = {
 65  
             "summary",
 66  
             "class",
 67  
             "transp",
 68  
             "description",
 69  
             "status",
 70  
             "location"};
 71  
 
 72  0
     private String[] textDateProps = {
 73  
             "dtstart",
 74  
             "dtstamp",
 75  
             "dtend",
 76  
     };
 77  
 
 78  
     public ExtractorDescription getDescription() {
 79  0
         return factory;
 80  
     }
 81  
 
 82  
     @Override
 83  
     protected boolean extract() throws ExtractionException {
 84  0
         final HTMLDocument document = getHTMLDocument();
 85  0
         List<Node> calendars = document.findAllByClassName("vcalendar");
 86  0
         if (calendars.size() == 0)
 87  
             // vcal allows to avoid top name, in which case whole document is
 88  
             // the calendar, let's try
 89  0
             if (document.findAllByClassName("vevent").size() > 0)
 90  0
                 calendars.add(document.getDocument());
 91  
 
 92  0
         boolean foundAny = false;
 93  0
         for (Node node : calendars)
 94  0
             foundAny |= extractCalendar(node);
 95  
 
 96  0
         return foundAny;
 97  
     }
 98  
 
 99  
     private boolean extractCalendar(Node node) throws ExtractionException {
 100  0
         URI cal = getDocumentURI();
 101  0
         addURIProperty(cal, RDF.TYPE, vICAL.Vcalendar);
 102  0
         return addComponents(node, cal);
 103  
     }
 104  
 
 105  
     private boolean addComponents(Node node, Resource cal) throws ExtractionException {
 106  0
         boolean foundAny = false;
 107  0
         for (String component : Components) {
 108  0
             List<Node> events = DomUtils.findAllByClassName(node, component);
 109  0
             if (events.size() == 0)
 110  0
                 continue;
 111  0
             for (Node evtNode : events)
 112  0
                 foundAny |= extractComponent(evtNode, cal, component);
 113  
         }
 114  0
         return foundAny;
 115  
     }
 116  
 
 117  
     private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException {
 118  0
         HTMLDocument compoNode = new HTMLDocument(node);
 119  0
         BNode evt = valueFactory.createBNode();
 120  0
         addURIProperty(evt, RDF.TYPE, vICAL.getClass(component));
 121  0
         addTextProps(compoNode, evt);
 122  0
         addUrl(compoNode, evt);
 123  0
         addRRule(compoNode, evt);
 124  0
         addOrganizer(compoNode, evt);
 125  0
         addUid(compoNode, evt);
 126  0
         addBNodeProperty(cal, vICAL.component, evt);
 127  
 
 128  0
         final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
 129  0
         tser.addResourceRoot( compoNode.getPathToLocalRoot(), evt, this.getClass() );
 130  
 
 131  0
         return true;
 132  
     }
 133  
 
 134  
     private void addUid(HTMLDocument compoNode, Resource evt) {
 135  0
         TextField url = compoNode.getSingularUrlField("uid");
 136  0
         conditionallyAddStringProperty(
 137  
                 compoNode.getDocument(),
 138  
                 evt, vICAL.uid, url.value()
 139  
         );
 140  0
     }
 141  
 
 142  
     private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
 143  0
         TextField url = compoNode.getSingularUrlField("url");
 144  0
         if ("".equals(url.value())) return;
 145  0
         addURIProperty(evt, vICAL.url, getHTMLDocument().resolveURI(url.value()));
 146  0
     }
 147  
 
 148  
     private void addRRule(HTMLDocument compoNode, Resource evt) {
 149  0
         for (Node rule : compoNode.findAllByClassName("rrule")) {
 150  0
             BNode rrule = valueFactory.createBNode();
 151  0
             addURIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
 152  0
             TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
 153  0
             conditionallyAddStringProperty(
 154  
                     freq.source(),
 155  
                     rrule, vICAL.freq, freq.value()
 156  
             );
 157  0
             addBNodeProperty(
 158  
                     rule,
 159  
                     evt, vICAL.rrule, rrule
 160  
             );
 161  0
         }
 162  0
     }
 163  
 
 164  
     private void addOrganizer(HTMLDocument compoNode, Resource evt) {
 165  0
         for (Node organizer : compoNode.findAllByClassName("organizer")) {
 166  
             //untyped
 167  0
             BNode blank = valueFactory.createBNode();
 168  0
             TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
 169  0
             conditionallyAddStringProperty(
 170  
                     compoNode.getDocument(),
 171  
                     blank, vICAL.calAddress, mail.value()
 172  
             );
 173  0
             addBNodeProperty(
 174  
                     organizer,
 175  
                     evt, vICAL.organizer, blank
 176  
             );
 177  0
         }
 178  0
     }
 179  
 
 180  
     private void addTextProps(HTMLDocument node, Resource evt) {
 181  0
         for (String date : textSingularProps) {
 182  0
             HTMLDocument.TextField val = node.getSingularTextField(date);
 183  0
             conditionallyAddStringProperty(
 184  
                     val.source(),
 185  
                     evt, vICAL.getProperty(date), val.value()
 186  
             );
 187  
         }
 188  
 
 189  0
         for (String date : textDateProps) {
 190  0
             HTMLDocument.TextField val = node.getSingularTextField(date);
 191  
             try {
 192  0
                 conditionallyAddStringProperty(
 193  
                         val.source(),
 194  
                         evt,
 195  
                         vICAL.getProperty(date),
 196  
                         RDFUtils.getXSDDate(
 197  
                                 val.value(),
 198  
                                 DATE_FORMAT
 199  
                         )
 200  
                 );
 201  0
             } catch (ParseException e) {
 202  
                 // Unparsable date format just leave it as it is.
 203  0
                 conditionallyAddStringProperty( val.source(), evt, vICAL.getProperty(date), val.value());
 204  0
             } catch (DatatypeConfigurationException e) {
 205  
                 // Unparsable date format just leave it as it is
 206  0
                 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
 207  0
             }
 208  
         }
 209  
 
 210  0
         HTMLDocument.TextField[] values = node.getPluralTextField("category");
 211  0
         for (TextField val : values) {
 212  0
             conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value());
 213  
         }
 214  0
     }
 215  
 
 216  
 }