1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.rdfa;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractorFactory;
22 import org.apache.any23.rdf.RDFUtils;
23 import org.apache.any23.vocab.FOAF;
24 import org.apache.any23.vocab.OGP;
25 import org.apache.any23.vocab.OGPMusic;
26 import org.junit.Assert;
27 import org.junit.Test;
28 import org.eclipse.rdf4j.model.Literal;
29 import org.eclipse.rdf4j.model.Statement;
30 import org.eclipse.rdf4j.model.Value;
31 import org.eclipse.rdf4j.model.vocabulary.RDF;
32 import org.eclipse.rdf4j.repository.RepositoryException;
33 import org.eclipse.rdf4j.repository.RepositoryResult;
34 import org.eclipse.rdf4j.rio.RDFHandlerException;
35 import org.eclipse.rdf4j.rio.RDFParseException;
36
37 import java.io.IOException;
38
39
40
41
42
43
44
45 public class RDFa11ExtractorTest extends AbstractRDFaExtractorTestCase {
46
47
48
49
50
51
52
53 @Test
54 public void testObjectResourceConversion() throws RepositoryException {
55 assertExtract("/html/rdfa/object-resource-test.html");
56 logger.debug(dumpModelToTurtle());
57 assertContains(null, FOAF.getInstance().page, RDFUtils.iri("http://en.wikipedia.org/New_York"));
58 }
59
60 @Test
61 public void testBBCNewsScotland() {
62 assertExtract("/html/BBC_News_Scotland.html");
63 assertModelNotEmpty();
64 assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
65 RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#navigation"), 1);
66 assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
67 RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#search"), 1);
68 assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
69 RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#contentinfo"), 1);
70 assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
71 RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#presentation"), 8);
72 }
73
74 @Test
75 public void testInvalidXMLCharacter() {
76 assertExtract("/html/rdfa/invalid-xml-character.html");
77 assertModelNotEmpty();
78 }
79
80 @Test
81 public void testAttributeAlreadySpecified() {
82 assertExtract("/html/rdfa/attribute-already-specified.html");
83 assertModelNotEmpty();
84 }
85
86 @Test
87 public void test0087() {
88 assertExtract("/html/rdfa/0087.xhtml");
89 assertModelNotEmpty();
90 assertStatementsSize(null, null, null, 24);
91 assertContains(RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#stylesheet"),
92 RDFUtils.iri("http://example.org/stylesheet"));
93 }
94
95 @Test
96 public void testBasicWithSyntaxErrors() {
97
98 assertExtract("/html/rdfa/basic-with-errors.html");
99 assertContains(null, vDCTERMS.creator, RDFUtils.literal("Alice", "en"));
100 assertContains(null, vDCTERMS.title, RDFUtils.literal("The trouble with Bob", "en"));
101 assertContains(null, RDFUtils.iri("http://fake.org/prop"), RDFUtils.literal("Mary", "en"));
102 }
103
104 @Test
105 public void testIssue326() {
106 assertExtract("/html/rdfa/rdfa-issue326-and-267.html");
107 }
108
109 @Test
110 public void testIssue227() {
111 assertExtract("/html/rdfa/rdfa-issue227.html");
112 logger.debug(dumpModelToTurtle());
113 assertContains(baseIRI, RDFUtils.iri("http://ogp.me/ns#title"),
114 "Bread — Free listening, videos, concerts, stats and photos at Last.fm", "en");
115 }
116
117 @Test
118 public void testIssue271AndJavascriptParsing() {
119 assertExtract("/html/rdfa/rdfa-issue271-and-317.html");
120 logger.debug(dumpModelToTurtle());
121 assertModelNotEmpty();
122 }
123
124 @Test
125 public void testIssue273() {
126 assertExtract("/html/rdfa/rdfa-issue273-and-317.html");
127 assertModelNotEmpty();
128 }
129
130 @Test
131 public void testIssue268And317() {
132 assertExtract("/html/rdfa/rdfa-issue268-and-317.html");
133 }
134
135
136
137
138
139
140
141
142
143 @Test
144 public void testExplicitDatatypeDeclaration() throws RepositoryException {
145 assertExtract("/html/rdfa/xmlliteral-datatype-test.html");
146 logger.debug(dumpModelToTurtle());
147
148 RepositoryResult<Statement> stmts = conn
149 .getStatements(RDFUtils.iri("http://dbpedia.org/resource/Albert_Einstein"), vFOAF.name, null, false);
150 Assert.assertTrue(stmts.hasNext());
151 Value obj = stmts.next().getObject();
152 Assert.assertTrue(obj instanceof Literal);
153 Literal lit = (Literal) obj;
154 Assert.assertEquals(lit.getDatatype(), RDF.XMLLITERAL);
155 Assert.assertEquals(lit.getLabel(),
156 "Albert <strong xmlns=\"http://www.w3.org/1999/xhtml\" " + "xmlns:foaf=\"http://xmlns.com/foaf/0.1/\" "
157 + "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\" "
158 + "xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\" "
159 + "xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\">Einstein</strong>");
160 }
161
162
163
164
165
166
167
168 @Test
169 public void testRelWithHref() throws RepositoryException {
170 assertExtract("/html/rdfa/rel-href.html");
171 logger.debug(dumpModelToTurtle());
172
173 assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), FOAF.getInstance().name, "John Doe");
174 assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), FOAF.getInstance().homepage,
175 RDFUtils.iri("http://example.org/blog/"));
176 }
177
178
179
180
181
182
183
184 @Test
185 public void testRelRevSupport() throws RepositoryException {
186 assertExtract("/html/rdfa/rel-rev.html");
187 logger.debug(dumpModelToTurtle());
188
189 assertContains(baseIRI, RDFUtils.iri("http://bob.example.com/cite"),
190 RDFUtils.iri("http://www.example.com/books/the_two_towers"));
191 assertContains(RDFUtils.iri("http://path/to/chapter"), RDFUtils.iri("http://bob.example.com/isChapterOf"),
192 baseIRI);
193 }
194
195
196
197
198
199
200
201 @Test
202 public void testVocabSupport() throws RepositoryException {
203 assertExtract("/html/rdfa/vocab.html");
204 logger.debug(dumpModelToTurtle());
205
206 assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), RDFUtils.iri("http://xmlns.com/foaf/0.1/name"),
207 RDFUtils.literal("John Doe"));
208 assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), RDFUtils.iri("http://xmlns.com/foaf/0.1/homepage"),
209 RDFUtils.iri("http://example.org/blog/"));
210 }
211
212 @Test
213 public void testVocabWithoutTrailingSlash() {
214
215 assertExtract("/html/rdfa/vocab-without-trailing-slash.html");
216
217 assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/BreadcrumbList"));
218 }
219
220
221
222
223 @Test
224 public void testTolerantParsing() {
225 assertExtract("/html/rdfa/oreilly-invalid-datatype.html", false);
226 }
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241 @Test
242 public void testRDFa10Extraction() throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
243 final int EXPECTED_STATEMENTS = 31;
244
245 assertExtract("/html/rdfa/goodrelations-rdfa10.html");
246 logger.debug(dumpModelToNQuads());
247
248 Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size());
249 assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq");
250 }
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265 @Test
266 public void testRDFa11Extraction() throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
267 final int EXPECTED_STATEMENTS = 31;
268
269 assertExtract("/html/rdfa/goodrelations-rdfa11.html");
270 logger.debug(dumpHumanReadableTriples());
271
272 Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size());
273 assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq");
274 }
275
276
277
278
279
280
281
282
283
284
285
286
287
288 @Test
289 public void testOpenGraphStructuredProperties() throws IOException, ExtractionException, RepositoryException {
290 assertExtract("/html/rdfa/opengraph-structured-properties.html");
291 logger.debug(dumpHumanReadableTriples());
292
293 Assert.assertEquals(31, getStatementsSize(null, null, null));
294 final OGP vOGP = OGP.getInstance();
295 assertContains(baseIRI, vOGP.audio, RDFUtils.literal("http://example.com/sound.mp3"));
296 assertContains(baseIRI, vOGP.description, RDFUtils
297 .literal("Sean Connery found fame and fortune as the suave, sophisticated British agent, James Bond."));
298 assertContains(baseIRI, vOGP.determiner, RDFUtils.literal("the"));
299 assertContains(baseIRI, vOGP.locale, RDFUtils.literal("en_GB"));
300 assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("fr_FR"));
301 assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("es_ES"));
302 assertContains(baseIRI, vOGP.siteName, RDFUtils.literal("IMDb"));
303 assertContains(baseIRI, vOGP.video, RDFUtils.literal("http://example.com/bond/trailer.swf"));
304 }
305
306 @Override
307 protected ExtractorFactory<?> getExtractorFactory() {
308 return new RDFa11ExtractorFactory();
309 }
310
311
312
313
314
315
316
317
318
319
320
321 @Test
322 public void testOpenGraphAlternateObjectTypes() throws IOException, ExtractionException, RepositoryException {
323 assertExtract("/html/rdfa/opengraph-music-song-object-type.html");
324 logger.debug(dumpHumanReadableTriples());
325
326 Assert.assertEquals(9, getStatementsSize(null, null, null));
327 final OGPMusic vOGPMusic = OGPMusic.getInstance();
328 assertContains(baseIRI, vOGPMusic.musicDuration, RDFUtils.literal("447"));
329 assertContains(baseIRI, vOGPMusic.musicMusician,
330 RDFUtils.literal("Jono Grant / Tony McGuinness / Ashley Tomberlin"));
331 assertContains(baseIRI, vOGPMusic.musicAlbum, RDFUtils.literal("Tri-State"));
332 }
333
334 }