1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.extractor.csv; |
19 | |
|
20 | |
import org.apache.any23.extractor.ExtractionContext; |
21 | |
import org.apache.any23.extractor.ExtractionException; |
22 | |
import org.apache.any23.extractor.ExtractionParameters; |
23 | |
import org.apache.any23.extractor.ExtractionResult; |
24 | |
import org.apache.any23.extractor.Extractor; |
25 | |
import org.apache.any23.extractor.ExtractorDescription; |
26 | |
import org.apache.any23.extractor.ExtractorFactory; |
27 | |
import org.apache.any23.extractor.SimpleExtractorFactory; |
28 | |
import org.apache.any23.rdf.RDFUtils; |
29 | |
import org.apache.any23.vocab.CSV; |
30 | |
import org.apache.commons.csv.CSVParser; |
31 | |
import org.openrdf.model.URI; |
32 | |
import org.openrdf.model.Value; |
33 | |
import org.openrdf.model.impl.LiteralImpl; |
34 | |
import org.openrdf.model.impl.URIImpl; |
35 | |
import org.openrdf.model.vocabulary.RDF; |
36 | |
import org.openrdf.model.vocabulary.RDFS; |
37 | |
import org.openrdf.model.vocabulary.XMLSchema; |
38 | |
|
39 | |
import java.io.IOException; |
40 | |
import java.io.InputStream; |
41 | |
import java.util.Arrays; |
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | 0 | public class CSVExtractor implements Extractor.ContentExtractor { |
52 | |
|
53 | |
private CSVParser csvParser; |
54 | |
|
55 | |
private URI[] headerURIs; |
56 | |
|
57 | 0 | private CSV csv = CSV.getInstance(); |
58 | |
|
59 | 0 | public final static ExtractorFactory<CSVExtractor> factory = |
60 | |
SimpleExtractorFactory.create( |
61 | |
"csv", |
62 | |
null, |
63 | |
Arrays.asList( |
64 | |
"text/csv;q=0.1" |
65 | |
), |
66 | |
"example-csv.csv", |
67 | |
CSVExtractor.class |
68 | |
); |
69 | |
|
70 | |
|
71 | |
|
72 | |
|
73 | |
public void setStopAtFirstError(boolean f) { |
74 | 0 | } |
75 | |
|
76 | |
|
77 | |
|
78 | |
|
79 | |
public void run( |
80 | |
ExtractionParameters extractionParameters, |
81 | |
ExtractionContext extractionContext, |
82 | |
InputStream in |
83 | |
, ExtractionResult out |
84 | |
) throws IOException, ExtractionException { |
85 | 0 | final URI documentURI = extractionContext.getDocumentURI(); |
86 | |
|
87 | |
|
88 | 0 | csvParser = CSVReaderBuilder.build(in); |
89 | |
|
90 | |
|
91 | 0 | String[] header = csvParser.getLine(); |
92 | 0 | headerURIs = processHeader(header, documentURI); |
93 | |
|
94 | |
|
95 | 0 | writeHeaderPropertiesMetadata(header, out); |
96 | |
|
97 | |
String[] nextLine; |
98 | 0 | int index = 0; |
99 | 0 | while ((nextLine = csvParser.getLine()) != null) { |
100 | 0 | URI rowSubject = RDFUtils.uri( |
101 | |
documentURI.toString(), |
102 | |
"row/" + index |
103 | |
); |
104 | |
|
105 | 0 | out.writeTriple(rowSubject, RDF.TYPE, csv.rowType); |
106 | |
|
107 | 0 | produceRowStatements(rowSubject, nextLine, out); |
108 | |
|
109 | 0 | out.writeTriple(documentURI, csv.row, rowSubject); |
110 | |
|
111 | 0 | out.writeTriple( |
112 | |
rowSubject, |
113 | |
csv.rowPosition, |
114 | |
new LiteralImpl(String.valueOf(index)) |
115 | |
); |
116 | 0 | index++; |
117 | 0 | } |
118 | |
|
119 | 0 | addTableMetadataStatements( |
120 | |
documentURI, |
121 | |
out, |
122 | |
index, |
123 | |
headerURIs.length |
124 | |
); |
125 | 0 | } |
126 | |
|
127 | |
|
128 | |
|
129 | |
|
130 | |
|
131 | |
|
132 | |
|
133 | |
private boolean isInteger(String number) { |
134 | |
try { |
135 | 0 | Integer.valueOf(number); |
136 | 0 | return true; |
137 | 0 | } catch (NumberFormatException e) { |
138 | 0 | return false; |
139 | |
} |
140 | |
} |
141 | |
|
142 | |
|
143 | |
|
144 | |
|
145 | |
|
146 | |
|
147 | |
|
148 | |
private boolean isFloat(String number) { |
149 | |
try { |
150 | 0 | Float.valueOf(number); |
151 | 0 | return true; |
152 | 0 | } catch (NumberFormatException e) { |
153 | 0 | return false; |
154 | |
} |
155 | |
} |
156 | |
|
157 | |
|
158 | |
|
159 | |
|
160 | |
|
161 | |
|
162 | |
|
163 | |
private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) { |
164 | 0 | int index = 0; |
165 | 0 | for (URI singleHeader : headerURIs) { |
166 | 0 | if (index > headerURIs.length) { |
167 | 0 | break; |
168 | |
} |
169 | 0 | if (!RDFUtils.isAbsoluteURI(header[index])) { |
170 | 0 | out.writeTriple( |
171 | |
singleHeader, |
172 | |
RDFS.LABEL, |
173 | |
new LiteralImpl(header[index]) |
174 | |
); |
175 | |
} |
176 | 0 | out.writeTriple( |
177 | |
singleHeader, |
178 | |
csv.columnPosition, |
179 | |
new LiteralImpl(String.valueOf(index), XMLSchema.INTEGER) |
180 | |
); |
181 | 0 | index++; |
182 | |
} |
183 | 0 | } |
184 | |
|
185 | |
|
186 | |
|
187 | |
|
188 | |
|
189 | |
|
190 | |
|
191 | |
|
192 | |
|
193 | |
private URI[] processHeader(String[] header, URI documentURI) { |
194 | 0 | URI[] result = new URI[header.length]; |
195 | 0 | int index = 0; |
196 | 0 | for (String h : header) { |
197 | 0 | String candidate = h.trim(); |
198 | 0 | if (RDFUtils.isAbsoluteURI(candidate)) { |
199 | 0 | result[index] = new URIImpl(candidate); |
200 | |
} else { |
201 | 0 | result[index] = normalize(candidate, documentURI); |
202 | |
} |
203 | 0 | index++; |
204 | |
} |
205 | 0 | return result; |
206 | |
} |
207 | |
|
208 | |
private URI normalize(String toBeNormalized, URI documentURI) { |
209 | 0 | String candidate = toBeNormalized; |
210 | 0 | candidate = candidate.trim().toLowerCase().replace("?", "").replace("&", ""); |
211 | 0 | String[] tokens = candidate.split(" "); |
212 | 0 | candidate = tokens[0]; |
213 | 0 | for (int i = 1; i < tokens.length; i++) { |
214 | 0 | String firstChar = ("" + tokens[i].charAt(0)).toUpperCase(); |
215 | 0 | candidate += firstChar + tokens[i].substring(1); |
216 | |
} |
217 | 0 | return new URIImpl(documentURI.toString() + candidate); |
218 | |
} |
219 | |
|
220 | |
|
221 | |
|
222 | |
|
223 | |
|
224 | |
|
225 | |
|
226 | |
|
227 | |
|
228 | |
|
229 | |
private void produceRowStatements( |
230 | |
URI rowSubject, |
231 | |
String[] values, |
232 | |
ExtractionResult out |
233 | |
) { |
234 | 0 | int index = 0; |
235 | 0 | for (String cell : values) { |
236 | 0 | if (index >= headerURIs.length) { |
237 | |
|
238 | 0 | break; |
239 | |
} |
240 | 0 | if (cell.equals("")) { |
241 | 0 | continue; |
242 | |
} |
243 | 0 | URI predicate = headerURIs[index]; |
244 | 0 | Value object = getObjectFromCell(cell); |
245 | 0 | out.writeTriple(rowSubject, predicate, object); |
246 | 0 | index++; |
247 | |
} |
248 | 0 | } |
249 | |
|
250 | |
private Value getObjectFromCell(String cell) { |
251 | |
Value object; |
252 | 0 | cell = cell.trim(); |
253 | 0 | if (RDFUtils.isAbsoluteURI(cell)) { |
254 | 0 | object = new URIImpl(cell); |
255 | |
} else { |
256 | 0 | URI datatype = XMLSchema.STRING; |
257 | 0 | if (isInteger(cell)) { |
258 | 0 | datatype = XMLSchema.INTEGER; |
259 | 0 | } else if(isFloat(cell)) { |
260 | 0 | datatype = XMLSchema.FLOAT; |
261 | |
} |
262 | 0 | object = new LiteralImpl(cell, datatype); |
263 | |
} |
264 | 0 | return object; |
265 | |
} |
266 | |
|
267 | |
|
268 | |
|
269 | |
|
270 | |
|
271 | |
|
272 | |
|
273 | |
|
274 | |
|
275 | |
|
276 | |
private void addTableMetadataStatements( |
277 | |
URI documentURI, |
278 | |
ExtractionResult out, |
279 | |
int numberOfRows, |
280 | |
int numberOfColumns) { |
281 | 0 | out.writeTriple( |
282 | |
documentURI, |
283 | |
csv.numberOfRows, |
284 | |
new LiteralImpl(String.valueOf(numberOfRows), XMLSchema.INTEGER) |
285 | |
); |
286 | 0 | out.writeTriple( |
287 | |
documentURI, |
288 | |
csv.numberOfColumns, |
289 | |
new LiteralImpl(String.valueOf(numberOfColumns), XMLSchema.INTEGER) |
290 | |
); |
291 | 0 | } |
292 | |
|
293 | |
|
294 | |
|
295 | |
|
296 | |
public ExtractorDescription getDescription() { |
297 | 0 | return factory; |
298 | |
} |
299 | |
} |