1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.servlet; |
19 | |
|
20 | |
import org.apache.any23.Any23; |
21 | |
import org.apache.any23.ExtractionReport; |
22 | |
import org.apache.any23.extractor.ExtractionException; |
23 | |
import org.apache.any23.extractor.ExtractionParameters; |
24 | |
import org.apache.any23.filter.IgnoreAccidentalRDFa; |
25 | |
import org.apache.any23.source.DocumentSource; |
26 | |
import org.apache.any23.validator.SerializationException; |
27 | |
import org.apache.any23.validator.ValidationReport; |
28 | |
import org.apache.any23.validator.XMLValidationReportSerializer; |
29 | |
import org.apache.any23.writer.CompositeTripleHandler; |
30 | |
import org.apache.any23.writer.CountingTripleHandler; |
31 | |
import org.apache.any23.writer.FormatWriter; |
32 | |
import org.apache.any23.writer.ReportingTripleHandler; |
33 | |
import org.apache.any23.writer.TripleHandler; |
34 | |
import org.apache.any23.writer.WriterRegistry; |
35 | |
import sun.security.validator.ValidatorException; |
36 | |
|
37 | |
import javax.servlet.ServletOutputStream; |
38 | |
import javax.servlet.http.HttpServletResponse; |
39 | |
import java.io.ByteArrayOutputStream; |
40 | |
import java.io.IOException; |
41 | |
import java.io.PrintStream; |
42 | |
import java.nio.charset.Charset; |
43 | |
import java.util.ArrayList; |
44 | |
import java.util.List; |
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
class WebResponder { |
51 | |
|
52 | 0 | private static final WriterRegistry writerRegistry = WriterRegistry.getInstance(); |
53 | |
|
54 | |
|
55 | |
|
56 | |
|
57 | |
private final Any23 runner; |
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
private Servlet any23servlet; |
63 | |
|
64 | |
|
65 | |
|
66 | |
|
67 | |
private HttpServletResponse response; |
68 | |
|
69 | |
|
70 | |
|
71 | |
|
72 | 0 | private TripleHandler rdfWriter = null; |
73 | |
|
74 | |
|
75 | |
|
76 | |
|
77 | 0 | private ReportingTripleHandler reporter = null; |
78 | |
|
79 | |
|
80 | |
|
81 | |
|
82 | 0 | private String outputMediaType = null; |
83 | |
|
84 | |
|
85 | |
|
86 | |
|
87 | 0 | private ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream(); |
88 | |
|
89 | 0 | public WebResponder(Servlet any23servlet, HttpServletResponse response) { |
90 | 0 | this.any23servlet = any23servlet; |
91 | 0 | this.response = response; |
92 | 0 | this.runner = new Any23(); |
93 | 0 | runner.setHTTPUserAgent("Any23-Servlet"); |
94 | 0 | } |
95 | |
|
96 | |
protected Any23 getRunner() { |
97 | 0 | return runner; |
98 | |
} |
99 | |
|
100 | |
public void runExtraction( |
101 | |
DocumentSource in, |
102 | |
ExtractionParameters eps, |
103 | |
String format, |
104 | |
boolean report, boolean annotate |
105 | |
) throws IOException { |
106 | 0 | if (in == null) return; |
107 | 0 | if (!initRdfWriter(format, report, annotate)) return; |
108 | |
final ExtractionReport er; |
109 | |
try { |
110 | 0 | er = runner.extract(eps, in, rdfWriter); |
111 | 0 | rdfWriter.close(); |
112 | 0 | if (! er.hasMatchingExtractors() ) { |
113 | 0 | sendError( |
114 | |
415, |
115 | |
"No suitable extractor found for this media type", |
116 | |
null, |
117 | |
er.getValidationReport(), |
118 | |
report |
119 | |
); |
120 | 0 | return; |
121 | |
} |
122 | 0 | } catch (IOException ioe) { |
123 | |
|
124 | 0 | if (ioe.getCause() != null && ValidatorException.class.equals(ioe.getCause().getClass())) { |
125 | 0 | final String errMsg = "Could not fetch input, IO Error."; |
126 | 0 | any23servlet.log(errMsg, ioe.getCause()); |
127 | 0 | sendError(502, errMsg, ioe, null, report); |
128 | 0 | return; |
129 | |
} |
130 | 0 | any23servlet.log("Could not fetch input", ioe); |
131 | 0 | sendError(502, "Could not fetch input.", ioe, null, report); |
132 | 0 | return; |
133 | 0 | } catch (ExtractionException e) { |
134 | |
|
135 | 0 | any23servlet.log("Could not parse input", e); |
136 | 0 | sendError(502, "Could not parse input.", e, null, report); |
137 | 0 | return; |
138 | 0 | } catch (Exception e) { |
139 | 0 | any23servlet.log("Internal error", e); |
140 | 0 | sendError(500, "Internal error.", e, null, report); |
141 | 0 | return; |
142 | 0 | } |
143 | |
|
144 | |
|
145 | 0 | any23servlet.log("Extraction complete, " + reporter.getTotalTriples() + " triples"); |
146 | 0 | if (reporter.getTotalTriples() == 0) { |
147 | 0 | sendError( |
148 | |
501, |
149 | |
"Extraction completed. No triples have been found.", |
150 | |
null, |
151 | |
er.getValidationReport(), report |
152 | |
); |
153 | 0 | return; |
154 | |
} |
155 | |
|
156 | |
|
157 | 0 | response.setContentType(outputMediaType); |
158 | 0 | response.setStatus(200); |
159 | |
|
160 | 0 | final String charsetEncoding = er.getEncoding(); |
161 | 0 | if (Charset.isSupported(charsetEncoding)) { |
162 | 0 | response.setCharacterEncoding(er.getEncoding()); |
163 | |
} else { |
164 | 0 | response.setCharacterEncoding("UTF-8"); |
165 | |
} |
166 | |
|
167 | 0 | final ServletOutputStream sos = response.getOutputStream(); |
168 | 0 | final byte[] data = byteOutStream.toByteArray(); |
169 | 0 | if(report) { |
170 | 0 | final PrintStream ps = new PrintStream(sos); |
171 | |
try { |
172 | 0 | printHeader(ps); |
173 | 0 | printResponse(reporter, er.getValidationReport(), data, ps); |
174 | 0 | } catch (Exception e) { |
175 | 0 | throw new RuntimeException("An error occurred while serializing the output response.", e); |
176 | |
} finally { |
177 | 0 | ps.close(); |
178 | 0 | } |
179 | 0 | } else { |
180 | 0 | sos.write(data); |
181 | |
} |
182 | 0 | } |
183 | |
|
184 | |
public void sendError(int code, String msg, boolean report) throws IOException { |
185 | 0 | sendError(code, msg, null, null, report); |
186 | 0 | } |
187 | |
|
188 | |
private void printHeader(PrintStream ps) { |
189 | 0 | ps.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"); |
190 | 0 | } |
191 | |
|
192 | |
private void printResponse(ReportingTripleHandler rth, ValidationReport vr, byte[] data, PrintStream ps) { |
193 | 0 | ps.println("<response>"); |
194 | 0 | printExtractors(rth, ps); |
195 | 0 | printReport(null, null, vr, ps); |
196 | 0 | printData(data, ps); |
197 | 0 | ps.println("</response>"); |
198 | 0 | } |
199 | |
|
200 | |
private void printExtractors(ReportingTripleHandler rth, PrintStream ps) { |
201 | 0 | ps.println("<extractors>"); |
202 | 0 | for (String extractor : rth.getExtractorNames()) { |
203 | 0 | ps.print("<extractor>"); |
204 | 0 | ps.print(extractor); |
205 | 0 | ps.println("</extractor>"); |
206 | |
} |
207 | 0 | ps.println("</extractors>"); |
208 | 0 | } |
209 | |
|
210 | |
private void printReport(String msg, Throwable e, ValidationReport vr, PrintStream ps) { |
211 | 0 | XMLValidationReportSerializer reportSerializer = new XMLValidationReportSerializer(); |
212 | 0 | ps.println("<report>"); |
213 | 0 | ps.printf("<message>%s</message>\n", msg == null ? "" : msg); |
214 | 0 | ps.println("<error>"); |
215 | 0 | if(e != null) { |
216 | 0 | ps.println("<![CDATA["); |
217 | 0 | e.printStackTrace(ps); |
218 | 0 | ps.println("]]>"); |
219 | |
} |
220 | 0 | ps.println("</error>"); |
221 | |
|
222 | |
try { |
223 | 0 | reportSerializer.serialize(vr, ps); |
224 | 0 | } catch (SerializationException se) { |
225 | 0 | ps.println("An error occurred while serializing error."); |
226 | 0 | se.printStackTrace(ps); |
227 | 0 | } |
228 | |
|
229 | 0 | ps.println("</report>"); |
230 | 0 | } |
231 | |
|
232 | |
private void printData(byte[] data, PrintStream ps) { |
233 | 0 | ps.println("<data>"); |
234 | 0 | ps.println("<![CDATA["); |
235 | |
try { |
236 | 0 | ps.write(data); |
237 | 0 | } catch (IOException ioe) { |
238 | 0 | ps.println("An error occurred while serializing data."); |
239 | 0 | ioe.printStackTrace(ps); |
240 | 0 | } |
241 | 0 | ps.println("]]>"); |
242 | 0 | ps.println("</data>"); |
243 | 0 | } |
244 | |
|
245 | |
private void sendError(int code, String msg, Exception e, ValidationReport vr, boolean report) |
246 | |
throws IOException { |
247 | 0 | response.setStatus(code); |
248 | 0 | response.setContentType("text/plain"); |
249 | 0 | final PrintStream ps = new PrintStream(response.getOutputStream()); |
250 | 0 | if (report) { |
251 | |
try { |
252 | 0 | printHeader(ps); |
253 | 0 | printReport(msg, e, vr, ps); |
254 | |
} finally { |
255 | 0 | ps.close(); |
256 | 0 | } |
257 | |
} else { |
258 | 0 | ps.println(msg); |
259 | 0 | if (e != null) { |
260 | 0 | ps.println("================================================================"); |
261 | 0 | e.printStackTrace(ps); |
262 | 0 | ps.println("================================================================"); |
263 | |
} |
264 | |
} |
265 | 0 | } |
266 | |
|
267 | |
private boolean initRdfWriter(String format, boolean report, boolean annotate) throws IOException { |
268 | 0 | final FormatWriter fw = getFormatWriter(format, annotate); |
269 | 0 | if (fw == null) { |
270 | 0 | sendError( |
271 | |
400, |
272 | |
"Invalid format '" + format + "', try one of: [rdfxml, turtle, ntriples, nquads, trix, json]", |
273 | |
null, |
274 | |
null, |
275 | |
report |
276 | |
); |
277 | 0 | return false; |
278 | |
} |
279 | 0 | outputMediaType = WriterRegistry.getMimeType( fw.getClass() ); |
280 | 0 | List<TripleHandler> tripleHandlers = new ArrayList<TripleHandler>(); |
281 | 0 | tripleHandlers.add(new IgnoreAccidentalRDFa(fw)); |
282 | 0 | tripleHandlers.add(new CountingTripleHandler()); |
283 | 0 | rdfWriter = new CompositeTripleHandler(tripleHandlers); |
284 | 0 | reporter = new ReportingTripleHandler(rdfWriter); |
285 | 0 | rdfWriter = reporter; |
286 | 0 | return true; |
287 | |
} |
288 | |
|
289 | |
private FormatWriter getFormatWriter(String format, boolean annotate) throws IOException { |
290 | |
final String finalFormat; |
291 | 0 | if ("rdf".equals(format) || "xml".equals(format) || "rdfxml".equals(format)) { |
292 | 0 | finalFormat = "rdfxml"; |
293 | 0 | } else if ("turtle".equals(format) || "ttl".equals(format)) { |
294 | 0 | finalFormat = "turtle"; |
295 | 0 | } else if ("n3".equals(format)) { |
296 | 0 | finalFormat = "turtle"; |
297 | 0 | } else if ("n-triples".equals(format) || "ntriples".equals(format) || "nt".equals(format)) { |
298 | 0 | finalFormat = "ntriples"; |
299 | 0 | } else if("nquads".equals(format) || "n-quads".equals(format) || "nq".equals(format)) { |
300 | 0 | finalFormat = "nquads"; |
301 | 0 | } else if("trix".equals(format)) { |
302 | 0 | finalFormat = "trix"; |
303 | 0 | } else if("json".equals(format)) { |
304 | 0 | finalFormat = "json"; |
305 | |
} else { |
306 | 0 | return null; |
307 | |
} |
308 | 0 | final FormatWriter writer = writerRegistry.getWriterInstanceByIdentifier(finalFormat, byteOutStream); |
309 | 0 | writer.setAnnotated(annotate); |
310 | 0 | return writer; |
311 | |
} |
312 | |
|
313 | |
} |