1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.extractor.html; |
19 | |
|
20 | |
import org.apache.any23.extractor.ExtractionException; |
21 | |
import org.apache.any23.extractor.ExtractorDescription; |
22 | |
import org.apache.any23.extractor.ExtractorFactory; |
23 | |
import org.apache.any23.extractor.SimpleExtractorFactory; |
24 | |
import org.apache.any23.extractor.TagSoupExtractionResult; |
25 | |
import org.apache.any23.rdf.PopularPrefixes; |
26 | |
import org.apache.any23.rdf.RDFUtils; |
27 | |
import org.apache.any23.vocab.ICAL; |
28 | |
import org.openrdf.model.BNode; |
29 | |
import org.openrdf.model.Resource; |
30 | |
import org.openrdf.model.URI; |
31 | |
import org.openrdf.model.vocabulary.RDF; |
32 | |
import org.w3c.dom.Node; |
33 | |
|
34 | |
import javax.xml.datatype.DatatypeConfigurationException; |
35 | |
import java.text.ParseException; |
36 | |
import java.util.Arrays; |
37 | |
import java.util.List; |
38 | |
|
39 | |
import static org.apache.any23.extractor.html.HTMLDocument.TextField; |
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | 0 | public class HCalendarExtractor extends MicroformatExtractor { |
49 | |
|
50 | 0 | private static final ICAL vICAL = ICAL.getInstance(); |
51 | |
|
52 | 0 | public final static ExtractorFactory<HCalendarExtractor> factory = |
53 | |
SimpleExtractorFactory.create( |
54 | |
"html-mf-hcalendar", |
55 | |
PopularPrefixes.createSubset("rdf", "ical"), |
56 | |
Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), |
57 | |
"example-mf-hcalendar.html", |
58 | |
HCalendarExtractor.class); |
59 | |
|
60 | 0 | private static final String[] Components = {"Vevent", "Vtodo", "Vjournal", "Vfreebusy"}; |
61 | |
|
62 | |
private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'"; |
63 | |
|
64 | 0 | private String[] textSingularProps = { |
65 | |
"summary", |
66 | |
"class", |
67 | |
"transp", |
68 | |
"description", |
69 | |
"status", |
70 | |
"location"}; |
71 | |
|
72 | 0 | private String[] textDateProps = { |
73 | |
"dtstart", |
74 | |
"dtstamp", |
75 | |
"dtend", |
76 | |
}; |
77 | |
|
78 | |
public ExtractorDescription getDescription() { |
79 | 0 | return factory; |
80 | |
} |
81 | |
|
82 | |
@Override |
83 | |
protected boolean extract() throws ExtractionException { |
84 | 0 | final HTMLDocument document = getHTMLDocument(); |
85 | 0 | List<Node> calendars = document.findAllByClassName("vcalendar"); |
86 | 0 | if (calendars.size() == 0) |
87 | |
|
88 | |
|
89 | 0 | if (document.findAllByClassName("vevent").size() > 0) |
90 | 0 | calendars.add(document.getDocument()); |
91 | |
|
92 | 0 | boolean foundAny = false; |
93 | 0 | for (Node node : calendars) |
94 | 0 | foundAny |= extractCalendar(node); |
95 | |
|
96 | 0 | return foundAny; |
97 | |
} |
98 | |
|
99 | |
private boolean extractCalendar(Node node) throws ExtractionException { |
100 | 0 | URI cal = getDocumentURI(); |
101 | 0 | addURIProperty(cal, RDF.TYPE, vICAL.Vcalendar); |
102 | 0 | return addComponents(node, cal); |
103 | |
} |
104 | |
|
105 | |
private boolean addComponents(Node node, Resource cal) throws ExtractionException { |
106 | 0 | boolean foundAny = false; |
107 | 0 | for (String component : Components) { |
108 | 0 | List<Node> events = DomUtils.findAllByClassName(node, component); |
109 | 0 | if (events.size() == 0) |
110 | 0 | continue; |
111 | 0 | for (Node evtNode : events) |
112 | 0 | foundAny |= extractComponent(evtNode, cal, component); |
113 | |
} |
114 | 0 | return foundAny; |
115 | |
} |
116 | |
|
117 | |
private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException { |
118 | 0 | HTMLDocument compoNode = new HTMLDocument(node); |
119 | 0 | BNode evt = valueFactory.createBNode(); |
120 | 0 | addURIProperty(evt, RDF.TYPE, vICAL.getClass(component)); |
121 | 0 | addTextProps(compoNode, evt); |
122 | 0 | addUrl(compoNode, evt); |
123 | 0 | addRRule(compoNode, evt); |
124 | 0 | addOrganizer(compoNode, evt); |
125 | 0 | addUid(compoNode, evt); |
126 | 0 | addBNodeProperty(cal, vICAL.component, evt); |
127 | |
|
128 | 0 | final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult(); |
129 | 0 | tser.addResourceRoot( compoNode.getPathToLocalRoot(), evt, this.getClass() ); |
130 | |
|
131 | 0 | return true; |
132 | |
} |
133 | |
|
134 | |
private void addUid(HTMLDocument compoNode, Resource evt) { |
135 | 0 | TextField url = compoNode.getSingularUrlField("uid"); |
136 | 0 | conditionallyAddStringProperty( |
137 | |
compoNode.getDocument(), |
138 | |
evt, vICAL.uid, url.value() |
139 | |
); |
140 | 0 | } |
141 | |
|
142 | |
private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException { |
143 | 0 | TextField url = compoNode.getSingularUrlField("url"); |
144 | 0 | if ("".equals(url.value())) return; |
145 | 0 | addURIProperty(evt, vICAL.url, getHTMLDocument().resolveURI(url.value())); |
146 | 0 | } |
147 | |
|
148 | |
private void addRRule(HTMLDocument compoNode, Resource evt) { |
149 | 0 | for (Node rule : compoNode.findAllByClassName("rrule")) { |
150 | 0 | BNode rrule = valueFactory.createBNode(); |
151 | 0 | addURIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule); |
152 | 0 | TextField freq = new HTMLDocument(rule).getSingularTextField("freq"); |
153 | 0 | conditionallyAddStringProperty( |
154 | |
freq.source(), |
155 | |
rrule, vICAL.freq, freq.value() |
156 | |
); |
157 | 0 | addBNodeProperty( |
158 | |
rule, |
159 | |
evt, vICAL.rrule, rrule |
160 | |
); |
161 | 0 | } |
162 | 0 | } |
163 | |
|
164 | |
private void addOrganizer(HTMLDocument compoNode, Resource evt) { |
165 | 0 | for (Node organizer : compoNode.findAllByClassName("organizer")) { |
166 | |
|
167 | 0 | BNode blank = valueFactory.createBNode(); |
168 | 0 | TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer"); |
169 | 0 | conditionallyAddStringProperty( |
170 | |
compoNode.getDocument(), |
171 | |
blank, vICAL.calAddress, mail.value() |
172 | |
); |
173 | 0 | addBNodeProperty( |
174 | |
organizer, |
175 | |
evt, vICAL.organizer, blank |
176 | |
); |
177 | 0 | } |
178 | 0 | } |
179 | |
|
180 | |
private void addTextProps(HTMLDocument node, Resource evt) { |
181 | 0 | for (String date : textSingularProps) { |
182 | 0 | HTMLDocument.TextField val = node.getSingularTextField(date); |
183 | 0 | conditionallyAddStringProperty( |
184 | |
val.source(), |
185 | |
evt, vICAL.getProperty(date), val.value() |
186 | |
); |
187 | |
} |
188 | |
|
189 | 0 | for (String date : textDateProps) { |
190 | 0 | HTMLDocument.TextField val = node.getSingularTextField(date); |
191 | |
try { |
192 | 0 | conditionallyAddStringProperty( |
193 | |
val.source(), |
194 | |
evt, |
195 | |
vICAL.getProperty(date), |
196 | |
RDFUtils.getXSDDate( |
197 | |
val.value(), |
198 | |
DATE_FORMAT |
199 | |
) |
200 | |
); |
201 | 0 | } catch (ParseException e) { |
202 | |
|
203 | 0 | conditionallyAddStringProperty( val.source(), evt, vICAL.getProperty(date), val.value()); |
204 | 0 | } catch (DatatypeConfigurationException e) { |
205 | |
|
206 | 0 | conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value()); |
207 | 0 | } |
208 | |
} |
209 | |
|
210 | 0 | HTMLDocument.TextField[] values = node.getPluralTextField("category"); |
211 | 0 | for (TextField val : values) { |
212 | 0 | conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value()); |
213 | |
} |
214 | 0 | } |
215 | |
|
216 | |
} |