You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/26 22:04:45 UTC
svn commit: r641575 -
/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
Author: jukka
Date: Wed Mar 26 14:04:41 2008
New Revision: 641575
URL: http://svn.apache.org/viewvc?rev=641575&view=rev
Log:
TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
- Removed the insideWorksheet flag
- Improved javadocs
- Extracted PointComparator to an explicit utility class
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=641575&r1=641574&r2=641575&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Mar 26 14:04:41 2008
@@ -170,21 +170,23 @@
private SAXException exception = null;
private SSTRecord sstRecord;
+
+ /**
+ * List of worksheet names.
+ */
private List<String> sheetNames = new ArrayList<String>();
- private short currentSheetIndex;
- private boolean insideWorksheet = false;
+ /**
+ * Index of the current worksheet within the workbook.
+ * Used to find the worksheet name in the {@link #sheetNames} list.
+ */
+ private short currentSheetIndex;
- private SortedMap<Point, Cell> currentSheet =
- new TreeMap<Point, Cell>(new Comparator<Point> () {
- public int compare(Point a, Point b) {
- int diff = a.y - b.y;
- if (diff == 0) {
- diff = a.x - b.x;
- }
- return diff;
- }
- });
+ /**
+ * Content of the current worksheet, or <code>null</code> if no
+ * worksheet is currently active.
+ */
+ private SortedMap<Point, Cell> currentSheet = null;
/**
* Construct a new listener instance outputting parsed data to
@@ -228,16 +230,16 @@
currentSheetIndex = -1;
} else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
currentSheetIndex++;
- currentSheet.clear();
- insideWorksheet = true;
+ currentSheet =
+ new TreeMap<Point, Cell>(new PointComparator());
}
break;
case EOFRecord.sid: // end of workbook, worksheet etc. records
- if (insideWorksheet && !currentSheet.isEmpty()) {
+ if (currentSheet != null && !currentSheet.isEmpty()) {
processSheet();
}
- insideWorksheet = false;
+ currentSheet = null;
break;
case BoundSheetRecord.sid: // Worksheet index record
@@ -277,12 +279,14 @@
// FIXME - requires POI release
// case HyperlinkRecord.sid: // holds a URL associated with a cell
- // HyperlinkRecord link = (HyperlinkRecord) record;
- // Point point =
- // new Point(link.getFirstColumn(), link.getFirstRow());
- // Cell cell = currentSheet.get(point);
- // if (cell != null) {
- // addCell(record, new LinkedCell(cell, link.getAddress()));
+ // if (currentSheet != null) {
+ // HyperlinkRecord link = (HyperlinkRecord) record;
+ // Point point =
+ // new Point(link.getFirstColumn(), link.getFirstRow());
+ // Cell cell = currentSheet.get(point);
+ // if (cell != null) {
+ // addCell(record, new LinkedCell(cell, link.getAddress()));
+ // }
// }
// break;
}
@@ -296,7 +300,7 @@
* @param cell cell value (or <code>null</code>)
*/
private void addCell(Record record, Cell cell) {
- if (!insideWorksheet) {
+ if (currentSheet == null) {
// Ignore cells outside sheets
} else if (cell == null) {
// Ignore empty cells
@@ -373,6 +377,21 @@
handler.endElement("div");
handler.characters("\n");
}
+ }
+
+ /**
+ * Utility comparator for points.
+ */
+ private static class PointComparator implements Comparator<Point> {
+
+ public int compare(Point a, Point b) {
+ int diff = a.y - b.y;
+ if (diff == 0) {
+ diff = a.x - b.x;
+ }
+ return diff;
+ }
+
}
}