Coverage Report

Coverage Report - org.apache.any23.plugin.officescraper.ExcelExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

ExcelExtractor

0/68

0/20

2.143

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.plugin.officescraper;
 
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.SimpleExtractorFactory;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.Excel;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.ss.usermodel.Cell;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.ss.usermodel.Sheet;
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.openrdf.model.URI;
 import org.openrdf.model.vocabulary.RDF;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 
 /**
  * Implementation of {@link ContentExtractor} able to process
  * a <i>MS Excel 97-2007+</i> file format <i>.xls/.xlsx</i> and
  * convert the detected content to triples.
  * This extractor is based on
  * <a href="http://poi.apache.org/spreadsheet/index.html">Apache POI-HSSF and POI-XSSF Java API</a>.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
  */
 public class ExcelExtractor implements Extractor.ContentExtractor {
 
     private static final Excel excel = Excel.getInstance();
 
     private boolean stopAtFirstError = false;
 
     public final static ExtractorFactory<ExcelExtractor> factory =
             SimpleExtractorFactory.create(
                     "excel",
                     null,
                     Arrays.asList(
                             "application/vnd.ms-excel;q=0.1",
                             "application/msexcel;q=0.1",
                             "application/x-msexcel;q=0.1",
                             "application/x-ms-excel;q=0.1"
                     ),
                     null,
                     ExcelExtractor.class
             );
 
     public ExcelExtractor() {}
 
     public boolean isStopAtFirstError() {
         return stopAtFirstError;
     }
 
     @Override
     public void setStopAtFirstError(boolean f) {
         stopAtFirstError = f;
     }
 
     @Override
     public ExtractorDescription getDescription() {
         return factory;
     }
 
     @Override
     public void run(
             ExtractionParameters extractionParameters,
             ExtractionContext context,
             InputStream in,
             ExtractionResult er
     ) throws IOException, ExtractionException {
         try {
             final URI documentURI = context.getDocumentURI();
             final Workbook workbook = createWorkbook(documentURI, in);
             processWorkbook(documentURI, workbook, er);
         } catch (Exception e) {
             throw new ExtractionException("An error occurred while extracting MS Excel content.", e);
         }
     }
 
     // TODO: this should be done by Tika, the extractors should be split.
     private Workbook createWorkbook(URI document, InputStream is) throws IOException {
         final String documentURI = document.toString();
         if(documentURI.endsWith(".xlsx")) {
             return new XSSFWorkbook(is);
         } else if(documentURI.endsWith("xls")) {
             return new HSSFWorkbook(is);
         } else {
             throw new IllegalArgumentException("Unsupported extension for resource [" + documentURI + "]");
         }
     }
 
     private void processWorkbook(URI documentURI, Workbook wb, ExtractionResult er) {
         for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
             final Sheet sheet = wb.getSheetAt(sheetIndex);
             final URI sheetURI = getSheetURI(documentURI, sheet);
             er.writeTriple(documentURI, excel.containsSheet, sheetURI);
             er.writeTriple(sheetURI, RDF.TYPE, excel.sheet);
             writeSheetMetadata(sheetURI, sheet, er);
             for (Row row : sheet) {
                 final URI rowURI = getRowURI(sheetURI, row);
                 er.writeTriple(sheetURI, excel.containsRow, rowURI);
                 er.writeTriple(rowURI, RDF.TYPE, excel.row);
                 writeRowMetadata(rowURI, row, er);
                 for (Cell cell : row) {
                     writeCell(rowURI, cell, er);
                 }
             }
         }
     }
 
     private void writeSheetMetadata(URI sheetURI, Sheet sheet, ExtractionResult er) {
         final String sheetName   = sheet.getSheetName();
         final int    firstRowNum = sheet.getFirstRowNum();
         final int    lastRowNum  = sheet.getLastRowNum();
         er.writeTriple(sheetURI, excel.sheetName, RDFUtils.literal(sheetName));
         er.writeTriple(sheetURI, excel.firstRow, RDFUtils.literal(firstRowNum));
         er.writeTriple(sheetURI, excel.lastRow  , RDFUtils.literal(lastRowNum ));
     }
 
     private void writeRowMetadata(URI rowURI, Row row, ExtractionResult er) {
         final int    firstCellNum = row.getFirstCellNum();
         final int    lastCellNum  = row.getLastCellNum();
         er.writeTriple(rowURI, excel.firstCell , RDFUtils.literal(firstCellNum));
         er.writeTriple(rowURI, excel.lastCell  , RDFUtils.literal(lastCellNum ));
     }
 
     private void writeCell(URI rowURI, Cell cell, ExtractionResult er) {
         final URI cellType = cellTypeToType(cell.getCellType());
         if(cellType == null) return; // Skip unsupported cells.
         final URI cellURI = getCellURI(rowURI, cell);
         er.writeTriple(rowURI, excel.containsCell, cellURI);
         er.writeTriple(cellURI, RDF.TYPE, excel.cell);
         er.writeTriple(
                 cellURI,
                 excel.cellValue,
                 RDFUtils.literal(cell.getStringCellValue(), cellType)
         );
     }
 
     private URI getSheetURI(URI documentURI, Sheet sheet) {
         return RDFUtils.uri( documentURI.toString() + "/sheet/" + sheet.getSheetName() );
     }
 
     private URI getRowURI(URI sheetURI, Row row) {
         return  RDFUtils.uri( sheetURI.toString() + "/" + row.getRowNum() );
     }
 
     private URI getCellURI(URI rowURI, Cell cell) {
         return RDFUtils.uri(
             rowURI +
             String.format("/%d/", cell.getColumnIndex())
         );
     }
 
     private URI cellTypeToType(int cellType) {
         final String postfix;
         switch (cellType) {
             case Cell.CELL_TYPE_STRING:
                 postfix = "string";
                 break;
             case Cell.CELL_TYPE_BOOLEAN:
                 postfix = "boolean";
                 break;
             case Cell.CELL_TYPE_NUMERIC:
                 postfix = "numeric";
                 break;
             default:
                 postfix = null;
         }
         return postfix == null ? null : RDFUtils.uri(excel.getNamespace().toString() + postfix);
     }
 
 
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.plugin.officescraper;
19
20		import org.apache.any23.extractor.ExtractionContext;
21		import org.apache.any23.extractor.ExtractionException;
22		import org.apache.any23.extractor.ExtractionParameters;
23		import org.apache.any23.extractor.ExtractionResult;
24		import org.apache.any23.extractor.Extractor;
25		import org.apache.any23.extractor.ExtractorDescription;
26		import org.apache.any23.extractor.ExtractorFactory;
27		import org.apache.any23.extractor.SimpleExtractorFactory;
28		import org.apache.any23.rdf.RDFUtils;
29		import org.apache.any23.vocab.Excel;
30		import org.apache.poi.hssf.usermodel.HSSFWorkbook;
31		import org.apache.poi.ss.usermodel.Cell;
32		import org.apache.poi.ss.usermodel.Row;
33		import org.apache.poi.ss.usermodel.Sheet;
34		import org.apache.poi.ss.usermodel.Workbook;
35		import org.apache.poi.xssf.usermodel.XSSFWorkbook;
36		import org.openrdf.model.URI;
37		import org.openrdf.model.vocabulary.RDF;
38
39		import java.io.IOException;
40		import java.io.InputStream;
41		import java.util.Arrays;
42
43		/**
44		* Implementation of {@link ContentExtractor} able to process
45		* a <i>MS Excel 97-2007+</i> file format <i>.xls/.xlsx</i> and
46		* convert the detected content to triples.
47		* This extractor is based on
48		* <a href="http://poi.apache.org/spreadsheet/index.html">Apache POI-HSSF and POI-XSSF Java API</a>.
49		*
50		* @author Michele Mostarda (mostarda@fbk.eu)
51		*/
52	0	public class ExcelExtractor implements Extractor.ContentExtractor {
53
54	0	private static final Excel excel = Excel.getInstance();
55
56	0	private boolean stopAtFirstError = false;
57
58	0	public final static ExtractorFactory<ExcelExtractor> factory =
59		SimpleExtractorFactory.create(
60		"excel",
61		null,
62		Arrays.asList(
63		"application/vnd.ms-excel;q=0.1",
64		"application/msexcel;q=0.1",
65		"application/x-msexcel;q=0.1",
66		"application/x-ms-excel;q=0.1"
67		),
68		null,
69		ExcelExtractor.class
70		);
71
72	0	public ExcelExtractor() {}
73
74		public boolean isStopAtFirstError() {
75	0	return stopAtFirstError;
76		}
77
78		@Override
79		public void setStopAtFirstError(boolean f) {
80	0	stopAtFirstError = f;
81	0	}
82
83		@Override
84		public ExtractorDescription getDescription() {
85	0	return factory;
86		}
87
88		@Override
89		public void run(
90		ExtractionParameters extractionParameters,
91		ExtractionContext context,
92		InputStream in,
93		ExtractionResult er
94		) throws IOException, ExtractionException {
95		try {
96	0	final URI documentURI = context.getDocumentURI();
97	0	final Workbook workbook = createWorkbook(documentURI, in);
98	0	processWorkbook(documentURI, workbook, er);
99	0	} catch (Exception e) {
100	0	throw new ExtractionException("An error occurred while extracting MS Excel content.", e);
101	0	}
102	0	}
103
104		// TODO: this should be done by Tika, the extractors should be split.
105		private Workbook createWorkbook(URI document, InputStream is) throws IOException {
106	0	final String documentURI = document.toString();
107	0	if(documentURI.endsWith(".xlsx")) {
108	0	return new XSSFWorkbook(is);
109	0	} else if(documentURI.endsWith("xls")) {
110	0	return new HSSFWorkbook(is);
111		} else {
112	0	throw new IllegalArgumentException("Unsupported extension for resource [" + documentURI + "]");
113		}
114		}
115
116		private void processWorkbook(URI documentURI, Workbook wb, ExtractionResult er) {
117	0	for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
118	0	final Sheet sheet = wb.getSheetAt(sheetIndex);
119	0	final URI sheetURI = getSheetURI(documentURI, sheet);
120	0	er.writeTriple(documentURI, excel.containsSheet, sheetURI);
121	0	er.writeTriple(sheetURI, RDF.TYPE, excel.sheet);
122	0	writeSheetMetadata(sheetURI, sheet, er);
123	0	for (Row row : sheet) {
124	0	final URI rowURI = getRowURI(sheetURI, row);
125	0	er.writeTriple(sheetURI, excel.containsRow, rowURI);
126	0	er.writeTriple(rowURI, RDF.TYPE, excel.row);
127	0	writeRowMetadata(rowURI, row, er);
128	0	for (Cell cell : row) {
129	0	writeCell(rowURI, cell, er);
130		}
131	0	}
132		}
133	0	}
134
135		private void writeSheetMetadata(URI sheetURI, Sheet sheet, ExtractionResult er) {
136	0	final String sheetName = sheet.getSheetName();
137	0	final int firstRowNum = sheet.getFirstRowNum();
138	0	final int lastRowNum = sheet.getLastRowNum();
139	0	er.writeTriple(sheetURI, excel.sheetName, RDFUtils.literal(sheetName));
140	0	er.writeTriple(sheetURI, excel.firstRow, RDFUtils.literal(firstRowNum));
141	0	er.writeTriple(sheetURI, excel.lastRow , RDFUtils.literal(lastRowNum ));
142	0	}
143
144		private void writeRowMetadata(URI rowURI, Row row, ExtractionResult er) {
145	0	final int firstCellNum = row.getFirstCellNum();
146	0	final int lastCellNum = row.getLastCellNum();
147	0	er.writeTriple(rowURI, excel.firstCell , RDFUtils.literal(firstCellNum));
148	0	er.writeTriple(rowURI, excel.lastCell , RDFUtils.literal(lastCellNum ));
149	0	}
150
151		private void writeCell(URI rowURI, Cell cell, ExtractionResult er) {
152	0	final URI cellType = cellTypeToType(cell.getCellType());
153	0	if(cellType == null) return; // Skip unsupported cells.
154	0	final URI cellURI = getCellURI(rowURI, cell);
155	0	er.writeTriple(rowURI, excel.containsCell, cellURI);
156	0	er.writeTriple(cellURI, RDF.TYPE, excel.cell);
157	0	er.writeTriple(
158		cellURI,
159		excel.cellValue,
160		RDFUtils.literal(cell.getStringCellValue(), cellType)
161		);
162	0	}
163
164		private URI getSheetURI(URI documentURI, Sheet sheet) {
165	0	return RDFUtils.uri( documentURI.toString() + "/sheet/" + sheet.getSheetName() );
166		}
167
168		private URI getRowURI(URI sheetURI, Row row) {
169	0	return RDFUtils.uri( sheetURI.toString() + "/" + row.getRowNum() );
170		}
171
172		private URI getCellURI(URI rowURI, Cell cell) {
173	0	return RDFUtils.uri(
174		rowURI +
175		String.format("/%d/", cell.getColumnIndex())
176		);
177		}
178
179		private URI cellTypeToType(int cellType) {
180		final String postfix;
181	0	switch (cellType) {
182		case Cell.CELL_TYPE_STRING:
183	0	postfix = "string";
184	0	break;
185		case Cell.CELL_TYPE_BOOLEAN:
186	0	postfix = "boolean";
187	0	break;
188		case Cell.CELL_TYPE_NUMERIC:
189	0	postfix = "numeric";
190	0	break;
191		default:
192	0	postfix = null;
193		}
194	0	return postfix == null ? null : RDFUtils.uri(excel.getNamespace().toString() + postfix);
195		}
196
197
198		}