Coverage Report - org.apache.any23.plugin.officescraper.ExcelExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
ExcelExtractor
0%
0/68
0%
0/20
2.143
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.plugin.officescraper;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionContext;
 21  
 import org.apache.any23.extractor.ExtractionException;
 22  
 import org.apache.any23.extractor.ExtractionParameters;
 23  
 import org.apache.any23.extractor.ExtractionResult;
 24  
 import org.apache.any23.extractor.Extractor;
 25  
 import org.apache.any23.extractor.ExtractorDescription;
 26  
 import org.apache.any23.extractor.ExtractorFactory;
 27  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 28  
 import org.apache.any23.rdf.RDFUtils;
 29  
 import org.apache.any23.vocab.Excel;
 30  
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 31  
 import org.apache.poi.ss.usermodel.Cell;
 32  
 import org.apache.poi.ss.usermodel.Row;
 33  
 import org.apache.poi.ss.usermodel.Sheet;
 34  
 import org.apache.poi.ss.usermodel.Workbook;
 35  
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 36  
 import org.openrdf.model.URI;
 37  
 import org.openrdf.model.vocabulary.RDF;
 38  
 
 39  
 import java.io.IOException;
 40  
 import java.io.InputStream;
 41  
 import java.util.Arrays;
 42  
 
 43  
 /**
 44  
  * Implementation of {@link ContentExtractor} able to process
 45  
  * a <i>MS Excel 97-2007+</i> file format <i>.xls/.xlsx</i> and
 46  
  * convert the detected content to triples.
 47  
  * This extractor is based on
 48  
  * <a href="http://poi.apache.org/spreadsheet/index.html">Apache POI-HSSF and POI-XSSF Java API</a>.
 49  
  *
 50  
  * @author Michele Mostarda (mostarda@fbk.eu)
 51  
  */
 52  0
 public class ExcelExtractor implements Extractor.ContentExtractor {
 53  
 
 54  0
     private static final Excel excel = Excel.getInstance();
 55  
 
 56  0
     private boolean stopAtFirstError = false;
 57  
 
 58  0
     public final static ExtractorFactory<ExcelExtractor> factory =
 59  
             SimpleExtractorFactory.create(
 60  
                     "excel",
 61  
                     null,
 62  
                     Arrays.asList(
 63  
                             "application/vnd.ms-excel;q=0.1",
 64  
                             "application/msexcel;q=0.1",
 65  
                             "application/x-msexcel;q=0.1",
 66  
                             "application/x-ms-excel;q=0.1"
 67  
                     ),
 68  
                     null,
 69  
                     ExcelExtractor.class
 70  
             );
 71  
 
 72  0
     public ExcelExtractor() {}
 73  
 
 74  
     public boolean isStopAtFirstError() {
 75  0
         return stopAtFirstError;
 76  
     }
 77  
 
 78  
     @Override
 79  
     public void setStopAtFirstError(boolean f) {
 80  0
         stopAtFirstError = f;
 81  0
     }
 82  
 
 83  
     @Override
 84  
     public ExtractorDescription getDescription() {
 85  0
         return factory;
 86  
     }
 87  
 
 88  
     @Override
 89  
     public void run(
 90  
             ExtractionParameters extractionParameters,
 91  
             ExtractionContext context,
 92  
             InputStream in,
 93  
             ExtractionResult er
 94  
     ) throws IOException, ExtractionException {
 95  
         try {
 96  0
             final URI documentURI = context.getDocumentURI();
 97  0
             final Workbook workbook = createWorkbook(documentURI, in);
 98  0
             processWorkbook(documentURI, workbook, er);
 99  0
         } catch (Exception e) {
 100  0
             throw new ExtractionException("An error occurred while extracting MS Excel content.", e);
 101  0
         }
 102  0
     }
 103  
 
 104  
     // TODO: this should be done by Tika, the extractors should be split.
 105  
     private Workbook createWorkbook(URI document, InputStream is) throws IOException {
 106  0
         final String documentURI = document.toString();
 107  0
         if(documentURI.endsWith(".xlsx")) {
 108  0
             return new XSSFWorkbook(is);
 109  0
         } else if(documentURI.endsWith("xls")) {
 110  0
             return new HSSFWorkbook(is);
 111  
         } else {
 112  0
             throw new IllegalArgumentException("Unsupported extension for resource [" + documentURI + "]");
 113  
         }
 114  
     }
 115  
 
 116  
     private void processWorkbook(URI documentURI, Workbook wb, ExtractionResult er) {
 117  0
         for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
 118  0
             final Sheet sheet = wb.getSheetAt(sheetIndex);
 119  0
             final URI sheetURI = getSheetURI(documentURI, sheet);
 120  0
             er.writeTriple(documentURI, excel.containsSheet, sheetURI);
 121  0
             er.writeTriple(sheetURI, RDF.TYPE, excel.sheet);
 122  0
             writeSheetMetadata(sheetURI, sheet, er);
 123  0
             for (Row row : sheet) {
 124  0
                 final URI rowURI = getRowURI(sheetURI, row);
 125  0
                 er.writeTriple(sheetURI, excel.containsRow, rowURI);
 126  0
                 er.writeTriple(rowURI, RDF.TYPE, excel.row);
 127  0
                 writeRowMetadata(rowURI, row, er);
 128  0
                 for (Cell cell : row) {
 129  0
                     writeCell(rowURI, cell, er);
 130  
                 }
 131  0
             }
 132  
         }
 133  0
     }
 134  
 
 135  
     private void writeSheetMetadata(URI sheetURI, Sheet sheet, ExtractionResult er) {
 136  0
         final String sheetName   = sheet.getSheetName();
 137  0
         final int    firstRowNum = sheet.getFirstRowNum();
 138  0
         final int    lastRowNum  = sheet.getLastRowNum();
 139  0
         er.writeTriple(sheetURI, excel.sheetName, RDFUtils.literal(sheetName));
 140  0
         er.writeTriple(sheetURI, excel.firstRow, RDFUtils.literal(firstRowNum));
 141  0
         er.writeTriple(sheetURI, excel.lastRow  , RDFUtils.literal(lastRowNum ));
 142  0
     }
 143  
 
 144  
     private void writeRowMetadata(URI rowURI, Row row, ExtractionResult er) {
 145  0
         final int    firstCellNum = row.getFirstCellNum();
 146  0
         final int    lastCellNum  = row.getLastCellNum();
 147  0
         er.writeTriple(rowURI, excel.firstCell , RDFUtils.literal(firstCellNum));
 148  0
         er.writeTriple(rowURI, excel.lastCell  , RDFUtils.literal(lastCellNum ));
 149  0
     }
 150  
 
 151  
     private void writeCell(URI rowURI, Cell cell, ExtractionResult er) {
 152  0
         final URI cellType = cellTypeToType(cell.getCellType());
 153  0
         if(cellType == null) return; // Skip unsupported cells.
 154  0
         final URI cellURI = getCellURI(rowURI, cell);
 155  0
         er.writeTriple(rowURI, excel.containsCell, cellURI);
 156  0
         er.writeTriple(cellURI, RDF.TYPE, excel.cell);
 157  0
         er.writeTriple(
 158  
                 cellURI,
 159  
                 excel.cellValue,
 160  
                 RDFUtils.literal(cell.getStringCellValue(), cellType)
 161  
         );
 162  0
     }
 163  
 
 164  
     private URI getSheetURI(URI documentURI, Sheet sheet) {
 165  0
         return RDFUtils.uri( documentURI.toString() + "/sheet/" + sheet.getSheetName() );
 166  
     }
 167  
 
 168  
     private URI getRowURI(URI sheetURI, Row row) {
 169  0
         return  RDFUtils.uri( sheetURI.toString() + "/" + row.getRowNum() );
 170  
     }
 171  
 
 172  
     private URI getCellURI(URI rowURI, Cell cell) {
 173  0
         return RDFUtils.uri(
 174  
             rowURI +
 175  
             String.format("/%d/", cell.getColumnIndex())
 176  
         );
 177  
     }
 178  
 
 179  
     private URI cellTypeToType(int cellType) {
 180  
         final String postfix;
 181  0
         switch (cellType) {
 182  
             case Cell.CELL_TYPE_STRING:
 183  0
                 postfix = "string";
 184  0
                 break;
 185  
             case Cell.CELL_TYPE_BOOLEAN:
 186  0
                 postfix = "boolean";
 187  0
                 break;
 188  
             case Cell.CELL_TYPE_NUMERIC:
 189  0
                 postfix = "numeric";
 190  0
                 break;
 191  
             default:
 192  0
                 postfix = null;
 193  
         }
 194  0
         return postfix == null ? null : RDFUtils.uri(excel.getNamespace().toString() + postfix);
 195  
     }
 196  
 
 197  
 
 198  
 }