You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/26 19:03:11 UTC
svn commit: r641446 -
/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
Author: jukka
Date: Wed Mar 26 11:03:01 2008
New Revision: 641446
URL: http://svn.apache.org/viewvc?rev=641446&view=rev
Log:
TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
- Use a TreeMap instead of custom linked lists for the sparse matrix
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=641446&r1=641445&r2=641446&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Mar 26 11:03:01 2008
@@ -16,10 +16,15 @@
*/
package org.apache.tika.parser.microsoft;
+import java.awt.Point;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
+import java.util.Comparator;
import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -160,7 +165,16 @@
private boolean insideWorksheet = false;
- private TikaExcelSheet currentSheet;
+ private SortedMap<Point, TikaExcelCell> currentSheet =
+ new TreeMap<Point, TikaExcelCell>(new Comparator<Point> () {
+ public int compare(Point a, Point b) {
+ int diff = a.y - b.y;
+ if (diff == 0) {
+ diff = a.x - b.x;
+ }
+ return diff;
+ }
+ });
/**
* Construct a new listener instance outputting parsed data to
@@ -208,11 +222,7 @@
break;
case BOFRecord.TYPE_WORKSHEET:
currentSheetIndex++;
- String currentSheetName = "";
- if (currentSheetIndex < sheetNames.size()) {
- currentSheetName = sheetNames.get(currentSheetIndex);
- }
- currentSheet = new TikaExcelSheet(currentSheetName);
+ currentSheet.clear();
insideWorksheet = true;
break;
}
@@ -220,10 +230,11 @@
/* EOFRecord: indicates end of workbook, worksheet etc. records */
case EOFRecord.sid:
- if (insideWorksheet) {
+ // ignore empty sheets
+ if (insideWorksheet && !currentSheet.isEmpty()) {
processSheet();
- insideWorksheet = false;
}
+ insideWorksheet = false;
break;
/* SSTRecord: holds all the strings for LabelSSTRecords */
@@ -310,7 +321,9 @@
text = text.trim();
}
if (text != null && text.length() > 0) {
- currentSheet.addCell(record.getRow(), record.getColumn(), text);
+ currentSheet.put(
+ new Point(record.getColumn(), record.getRow()),
+ new TikaExcelCell(text));
}
}
@@ -320,58 +333,49 @@
* @throws SAXException if an error occurs
*/
private void processSheet() throws SAXException {
-
- // ignore empty sheets
- if (currentSheet.getCellCount() == 0) {
- return;
- }
-
// Sheet Start
handler.startElement("div", "class", "page");
- handler.element("h1", currentSheet.getName());
+ if (currentSheetIndex < sheetNames.size()) {
+ handler.element("h1", sheetNames.get(currentSheetIndex));
+ }
handler.characters("\n");
handler.startElement("table");
handler.startElement("tbody");
// Process Rows
- int currentRow = 0;
- TikaExcelRow row = currentSheet.getFirstRow();
- while (row != null) {
- while (currentRow < row.getRow()) {
- handler.startElement("tr");
- handler.startElement("td");
+ int currentRow = 1;
+ int currentColumn = 1;
+ handler.startElement("tr");
+ handler.startElement("td");
+ for (Map.Entry<Point, TikaExcelCell> entry : currentSheet.entrySet()) {
+ while (currentRow < entry.getKey().y) {
handler.endElement("td");
handler.endElement("tr");
handler.characters("\n");
+ handler.startElement("tr");
+ handler.startElement("td");
currentRow++;
+ currentColumn = 1;
}
- handler.startElement("tr");
- // Process Cells
- short currentColumn = 0;
- TikaExcelCell cell = row.getFirstCell();
- while (cell != null) {
- while (currentColumn < cell.getColumn()) {
- handler.startElement("td");
- handler.endElement("td");
- handler.characters("\t");
- currentColumn++;
- }
- handler.startElement("td");
- if (cell.getHyperlink() != null) {
- handler.startElement("a", "href", cell.getHyperlink());
- handler.characters(cell.getText());
- handler.endElement("a");
- } else {
- handler.characters(cell.getText());
- }
+ while (currentColumn < entry.getKey().x) {
handler.endElement("td");
- cell = cell.getNextCell();
+ handler.characters("\t");
+ handler.startElement("td");
+ currentColumn++;
}
- handler.endElement("tr");
- row = row.getNextRow();
+ TikaExcelCell cell = entry.getValue();
+ if (cell.getHyperlink() != null) {
+ handler.startElement("a", "href", cell.getHyperlink());
+ handler.characters(cell.getText());
+ handler.endElement("a");
+ } else {
+ handler.characters(cell.getText());
+ }
}
+ handler.endElement("td");
+ handler.endElement("tr");
// Sheet End
handler.endElement("tbody");
@@ -384,197 +388,11 @@
// ======================================================================
/**
- * Tika's excel sheet representation.
- */
- private static class TikaExcelSheet {
- private String name;
- private TikaExcelRow firstRow;
- private TikaExcelRow lastRow;
- private int rowCount;
- private int cellCount;
-
- /**
- * Construct a new sheet instance.
- *
- * @param name The name of the sheet
- */
- TikaExcelSheet(String name) {
- this.name = name;
- }
-
- /**
- * Add a cell to the sheet.
- *
- * @param row The cell's row number
- * @param column The cell's column number
- * @param text The cell's text
- */
- void addCell(int row, short column, String text) {
-
- // Create row if required
- if (lastRow == null || lastRow.row != row) {
- TikaExcelRow newRow = new TikaExcelRow(row);
- rowCount++;
- if (lastRow == null) {
- firstRow = newRow;
- } else {
- lastRow.setNextRow(newRow);
- }
- lastRow = newRow;
- }
-
- cellCount++;
-
- // Add a cell
- lastRow.addCell(new TikaExcelCell(column, text));
- }
-
- /**
- * Find a cell in a sheet.
- *
- * @param row The cell's row number
- * @param column The cell's column number
- * @return The cell or null if not found
- */
- TikaExcelCell findCell(int row, short column) {
-
- TikaExcelRow currentRow = firstRow;
- while (currentRow != null && currentRow.getRow() < row) {
- currentRow = currentRow.getNextRow();
- }
- if (currentRow != null && currentRow.getRow() == row) {
- TikaExcelCell currentCell = currentRow.getFirstCell();
- while (currentCell != null && currentCell.getColumn() < column) {
- currentCell = currentCell.getNextCell();
- }
- if (currentCell != null && currentCell.getColumn() == column) {
- return currentCell;
- }
- }
- return null;
- }
-
- /**
- * Return the number of cells in the sheet.
- *
- * @return the number of cells in the sheet
- */
- int getCellCount() {
- return cellCount;
- }
-
- /**
- * Return the number of rows in the sheet.
- *
- * @return the number of cells in the sheet
- */
- int getRowCount() {
- return rowCount;
- }
-
- /**
- * Return the first row.
- *
- * @return the first row
- */
- TikaExcelRow getFirstRow() {
- return firstRow;
- }
-
- /**
- * Return the name of the sheet.
- *
- * @return the name of the sheet
- */
- String getName() {
- return name;
- }
-
- }
-
- // ======================================================================
-
- /**
- * Tika's excel row representation.
- */
- private static class TikaExcelRow {
- private final int row;
- private TikaExcelRow nextRow;
- private TikaExcelCell firstCell;
- private TikaExcelCell lastCell;
-
- /**
- * Construct a new Row instance.
- *
- * @param row The row number
- */
- TikaExcelRow(int row) {
- this.row = row;
- }
-
- /**
- * Add a cell to the row.
- *
- * @param newCell the new cell to add
- */
- void addCell(TikaExcelCell newCell) {
- if (lastCell != null) {
- lastCell.setNextCell(newCell);
- }
- this.lastCell = newCell;
- if (firstCell == null) {
- firstCell = newCell;
- }
- }
-
- /**
- * Return the first cell in the row.
- *
- * @return the first cell in the row
- */
- TikaExcelCell getFirstCell() {
- return firstCell;
- }
-
- /**
- * Return the row number.
- *
- * @return the row number
- */
- int getRow() {
- return row;
- }
-
- /**
- * Return the next row in the sheet.
- *
- * @return the next row in the sheet
- */
- TikaExcelRow getNextRow() {
- return nextRow;
- }
-
- /**
- * Set the next row in the sheet.
- *
- * @param nextRow the next row in the sheet
- */
- void setNextRow(TikaExcelRow nextRow) {
- this.nextRow = nextRow;
- }
-
- }
-
- // ======================================================================
-
- /**
* Tika's excel cell representation.
*/
private static class TikaExcelCell {
- private final short column;
private String text;
private String hyperlink;
- private TikaExcelCell nextCell;
/**
* Construct a new cell.
@@ -582,30 +400,11 @@
* @param column The cell's column number
* @param text The cell's text
*/
- TikaExcelCell(short column, String text) {
- this.column = column;
+ TikaExcelCell(String text) {
this.text = text;
}
/**
- * Return the cell's column number
- *
- * @return the cell's column number
- */
- short getColumn() {
- return column;
- }
-
- /**
- * Return the next cell in the row.
- *
- * @return the next cell in the row
- */
- TikaExcelCell getNextCell() {
- return nextCell;
- }
-
- /**
* Return the cell's text.
*
* @return the cell's text
@@ -632,14 +431,6 @@
this.hyperlink = hyperlink;
}
- /**
- * Set the next cell in the row.
- *
- * @param nextCell next cell in the row
- */
- void setNextCell(TikaExcelCell nextCell) {
- this.nextCell = nextCell;
- }
-
}
+
}