1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.csv;
19
20 import static java.lang.Character.toUpperCase;
21
22 import org.apache.any23.extractor.ExtractionContext;
23 import org.apache.any23.extractor.ExtractionException;
24 import org.apache.any23.extractor.ExtractionParameters;
25 import org.apache.any23.extractor.ExtractionResult;
26 import org.apache.any23.extractor.Extractor;
27 import org.apache.any23.extractor.ExtractorDescription;
28 import org.apache.any23.extractor.ExtractorFactory;
29 import org.apache.any23.extractor.SimpleExtractorFactory;
30 import org.apache.any23.rdf.RDFUtils;
31 import org.apache.any23.vocab.CSV;
32 import org.apache.commons.csv.CSVParser;
33 import org.openrdf.model.URI;
34 import org.openrdf.model.Value;
35 import org.openrdf.model.impl.LiteralImpl;
36 import org.openrdf.model.impl.URIImpl;
37 import org.openrdf.model.vocabulary.RDF;
38 import org.openrdf.model.vocabulary.RDFS;
39 import org.openrdf.model.vocabulary.XMLSchema;
40
41 import java.io.IOException;
42 import java.io.InputStream;
43 import java.util.Arrays;
44 import java.util.StringTokenizer;
45
46
47
48
49
50
51
52
53
54 public class CSVExtractor implements Extractor.ContentExtractor {
55
56 private CSVParser csvParser;
57
58 private URI[] headerURIs;
59
60 private CSV csv = CSV.getInstance();
61
62 public final static ExtractorFactory<CSVExtractor> factory =
63 SimpleExtractorFactory.create(
64 "csv",
65 null,
66 Arrays.asList(
67 "text/csv;q=0.1"
68 ),
69 "example-csv.csv",
70 CSVExtractor.class
71 );
72
73
74
75
76 public void setStopAtFirstError(boolean f) {
77 }
78
79
80
81
82 public void run(
83 ExtractionParameters extractionParameters,
84 ExtractionContext extractionContext,
85 InputStream in
86 , ExtractionResult out
87 ) throws IOException, ExtractionException {
88 final URI documentURI = extractionContext.getDocumentURI();
89
90
91 csvParser = CSVReaderBuilder.build(in);
92
93
94 String[] header = csvParser.getLine();
95 headerURIs = processHeader(header, documentURI);
96
97
98 writeHeaderPropertiesMetadata(header, out);
99
100 String[] nextLine;
101 int index = 0;
102 while ((nextLine = csvParser.getLine()) != null) {
103 URI rowSubject = RDFUtils.uri(
104 documentURI.toString(),
105 "row/" + index
106 );
107
108 out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
109
110 produceRowStatements(rowSubject, nextLine, out);
111
112 out.writeTriple(documentURI, csv.row, rowSubject);
113
114 out.writeTriple(
115 rowSubject,
116 csv.rowPosition,
117 new LiteralImpl(String.valueOf(index))
118 );
119 index++;
120 }
121
122 addTableMetadataStatements(
123 documentURI,
124 out,
125 index,
126 headerURIs.length
127 );
128 }
129
130
131
132
133
134
135
136 private boolean isInteger(String number) {
137 try {
138 Integer.valueOf(number);
139 return true;
140 } catch (NumberFormatException e) {
141 return false;
142 }
143 }
144
145
146
147
148
149
150
151 private boolean isFloat(String number) {
152 try {
153 Float.valueOf(number);
154 return true;
155 } catch (NumberFormatException e) {
156 return false;
157 }
158 }
159
160
161
162
163
164
165
166 private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) {
167 int index = 0;
168 for (URI singleHeader : headerURIs) {
169 if (index > headerURIs.length) {
170 break;
171 }
172 if (!RDFUtils.isAbsoluteURI(header[index])) {
173 out.writeTriple(
174 singleHeader,
175 RDFS.LABEL,
176 new LiteralImpl(header[index])
177 );
178 }
179 out.writeTriple(
180 singleHeader,
181 csv.columnPosition,
182 new LiteralImpl(String.valueOf(index), XMLSchema.INTEGER)
183 );
184 index++;
185 }
186 }
187
188
189
190
191
192
193
194
195
196 private URI[] processHeader(String[] header, URI documentURI) {
197 URI[] result = new URI[header.length];
198 int index = 0;
199 for (String h : header) {
200 String candidate = h.trim();
201 if (RDFUtils.isAbsoluteURI(candidate)) {
202 result[index] = new URIImpl(candidate);
203 } else {
204 result[index] = normalize(candidate, documentURI);
205 }
206 index++;
207 }
208 return result;
209 }
210
211 private URI normalize(String toBeNormalized, URI documentURI) {
212 toBeNormalized = toBeNormalized.trim().toLowerCase().replace("?", "").replace("&", "");
213
214 StringBuilder result = new StringBuilder(documentURI.toString());
215
216 StringTokenizer tokenizer = new StringTokenizer(toBeNormalized, " ");
217 while (tokenizer.hasMoreTokens()) {
218 String current = tokenizer.nextToken();
219
220 result.append(toUpperCase(current.charAt(0))).append(current.substring(1));
221 }
222
223 return new URIImpl(result.toString());
224 }
225
226
227
228
229
230
231
232
233
234
235 private void produceRowStatements(
236 URI rowSubject,
237 String[] values,
238 ExtractionResult out
239 ) {
240 int index = 0;
241 for (String cell : values) {
242 if (index >= headerURIs.length) {
243
244 break;
245 }
246 if (cell.equals("")) {
247 continue;
248 }
249 URI predicate = headerURIs[index];
250 Value object = getObjectFromCell(cell);
251 out.writeTriple(rowSubject, predicate, object);
252 index++;
253 }
254 }
255
256 private Value getObjectFromCell(String cell) {
257 Value object;
258 cell = cell.trim();
259 if (RDFUtils.isAbsoluteURI(cell)) {
260 object = new URIImpl(cell);
261 } else {
262 URI datatype = XMLSchema.STRING;
263 if (isInteger(cell)) {
264 datatype = XMLSchema.INTEGER;
265 } else if(isFloat(cell)) {
266 datatype = XMLSchema.FLOAT;
267 }
268 object = new LiteralImpl(cell, datatype);
269 }
270 return object;
271 }
272
273
274
275
276
277
278
279
280
281
282 private void addTableMetadataStatements(
283 URI documentURI,
284 ExtractionResult out,
285 int numberOfRows,
286 int numberOfColumns) {
287 out.writeTriple(
288 documentURI,
289 csv.numberOfRows,
290 new LiteralImpl(String.valueOf(numberOfRows), XMLSchema.INTEGER)
291 );
292 out.writeTriple(
293 documentURI,
294 csv.numberOfColumns,
295 new LiteralImpl(String.valueOf(numberOfColumns), XMLSchema.INTEGER)
296 );
297 }
298
299
300
301
302 public ExtractorDescription getDescription() {
303 return factory;
304 }
305 }