| 1 |
/**
|
| 2 |
* Licensed to the Apache Software Foundation (ASF) under one or more
|
| 3 |
* contributor license agreements. See the NOTICE file distributed with
|
| 4 |
* this work for additional information regarding copyright ownership.
|
| 5 |
* The ASF licenses this file to You under the Apache License, Version 2.0
|
| 6 |
* (the "License"); you may not use this file except in compliance with
|
| 7 |
* the License. You may obtain a copy of the License at
|
| 8 |
*
|
| 9 |
* http://www.apache.org/licenses/LICENSE-2.0
|
| 10 |
*
|
| 11 |
* Unless required by applicable law or agreed to in writing, software
|
| 12 |
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 13 |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 14 |
* See the License for the specific language governing permissions and
|
| 15 |
* limitations under the License.
|
| 16 |
*/
|
| 17 |
package org.apache.nutch.parse.msexcel;
|
| 18 |
|
| 19 |
// JDK imports
|
| 20 |
import java.io.InputStream;
|
| 21 |
|
| 22 |
// Jakarta POI imports
|
| 23 |
import org.apache.poi.hssf.usermodel.HSSFCell;
|
| 24 |
import org.apache.poi.hssf.usermodel.HSSFRow;
|
| 25 |
import org.apache.poi.hssf.usermodel.HSSFSheet;
|
| 26 |
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
| 27 |
|
| 28 |
// Nutch imports
|
| 29 |
import org.apache.nutch.parse.ms.MSExtractor;
|
| 30 |
|
| 31 |
|
| 32 |
/**
|
| 33 |
* Excel Text and Properties extractor.
|
| 34 |
*
|
| 35 |
* @author Rohit Kulkarni & Ashish Vaidya
|
| 36 |
* @author Jérôme Charron
|
| 37 |
*/
|
| 38 |
class ExcelExtractor extends MSExtractor {
|
| 39 |
|
| 40 |
|
| 41 |
protected String extractText(InputStream input) throws Exception {
|
| 42 |
|
| 43 |
StringBuilder resultText = new StringBuilder();
|
| 44 |
HSSFWorkbook wb = new HSSFWorkbook(input);
|
| 45 |
if (wb == null) {
|
| 46 |
return resultText.toString();
|
| 47 |
}
|
| 48 |
|
| 49 |
HSSFSheet sheet;
|
| 50 |
HSSFRow row;
|
| 51 |
HSSFCell cell;
|
| 52 |
int sNum = 0;
|
| 53 |
int rNum = 0;
|
| 54 |
int cNum = 0;
|
| 55 |
|
| 56 |
sNum = wb.getNumberOfSheets();
|
| 57 |
|
| 58 |
for (int i=0; i<sNum; i++) {
|
| 59 |
if ((sheet = wb.getSheetAt(i)) == null) {
|
| 60 |
continue;
|
| 61 |
}
|
| 62 |
rNum = sheet.getLastRowNum();
|
| 63 |
for (int j=0; j<=rNum; j++) {
|
| 64 |
if ((row = sheet.getRow(j)) == null){
|
| 65 |
continue;
|
| 66 |
}
|
| 67 |
cNum = row.getLastCellNum();
|
| 68 |
|
| 69 |
for (int k=0; k<cNum; k++) {
|
| 70 |
if ((cell = row.getCell((short) k)) != null) {
|
| 71 |
/*if(HSSFDateUtil.isCellDateFormatted(cell) == true) {
|
| 72 |
resultText.append(cell.getDateCellValue().toString())
|
| 73 |
} else
|
| 74 |
*/
|
| 75 |
if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
|
| 76 |
resultText.append(cell.getStringCellValue()).append(" ");
|
| 77 |
} else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
|
| 78 |
double d = cell.getNumericCellValue();
|
| 79 |
resultText.append(d).append(" ");
|
| 80 |
}
|
| 81 |
/* else if(cell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){
|
| 82 |
resultText.append(cell.getCellFormula());
|
| 83 |
}
|
| 84 |
*/
|
| 85 |
}
|
| 86 |
}
|
| 87 |
}
|
| 88 |
}
|
| 89 |
return resultText.toString();
|
| 90 |
}
|
| 91 |
|
| 92 |
}
|