You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/26 19:03:11 UTC

svn commit: r641446 - /incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java

Author: jukka
Date: Wed Mar 26 11:03:01 2008
New Revision: 641446

URL: http://svn.apache.org/viewvc?rev=641446&view=rev
Log:
TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
    - Use a TreeMap instead of custom linked lists for the sparse matrix

Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=641446&r1=641445&r2=641446&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Mar 26 11:03:01 2008
@@ -16,10 +16,15 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.awt.Point;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Comparator;
 import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -160,7 +165,16 @@
 
         private boolean insideWorksheet = false;
 
-        private TikaExcelSheet currentSheet;
+        private SortedMap<Point, TikaExcelCell> currentSheet =
+            new TreeMap<Point, TikaExcelCell>(new Comparator<Point> () {
+                public int compare(Point a, Point b) {
+                    int diff = a.y - b.y;
+                    if (diff == 0) {
+                        diff = a.x - b.x;
+                    }
+                    return diff;
+                }
+            });
 
         /**
          * Contstruct a new listener instance outputting parsed data to
@@ -208,11 +222,7 @@
                             break;
                         case BOFRecord.TYPE_WORKSHEET:
                             currentSheetIndex++;
-                            String currentSheetName = "";
-                            if (currentSheetIndex < sheetNames.size()) {
-                                currentSheetName = sheetNames.get(currentSheetIndex);
-                            }
-                            currentSheet = new TikaExcelSheet(currentSheetName);
+                            currentSheet.clear();
                             insideWorksheet = true;
                             break;
                     }
@@ -220,10 +230,11 @@
 
                 /* EOFRecord: indicates end of workbook, worksheet etc. records */
                 case EOFRecord.sid:
-                    if (insideWorksheet) {
+                    // ignore empty sheets
+                    if (insideWorksheet && !currentSheet.isEmpty()) {
                         processSheet();
-                        insideWorksheet = false;
                     }
+                    insideWorksheet = false;
                     break;
 
                 /* SSTRecord: holds all the strings for LabelSSTRecords */
@@ -310,7 +321,9 @@
                 text = text.trim();
             }
             if (text != null && text.length() > 0) {
-                currentSheet.addCell(record.getRow(), record.getColumn(), text);
+                currentSheet.put(
+                        new Point(record.getColumn(), record.getRow()),
+                        new TikaExcelCell(text));
             }
         }
 
@@ -320,58 +333,49 @@
          * @throws SAXException if an error occurs
          */
         private void processSheet() throws SAXException {
-
-            // ignore empty sheets
-            if (currentSheet.getCellCount() == 0) {
-                return;
-            }
-            
             // Sheet Start
             handler.startElement("div", "class", "page");
-            handler.element("h1", currentSheet.getName());
+            if (currentSheetIndex < sheetNames.size()) {
+                handler.element("h1", sheetNames.get(currentSheetIndex));
+            }
             handler.characters("\n");
             handler.startElement("table");
             handler.startElement("tbody");
 
             // Process Rows
-            int currentRow = 0;
-            TikaExcelRow row = currentSheet.getFirstRow();
-            while (row != null) {
-                while (currentRow < row.getRow()) {
-                    handler.startElement("tr");
-                    handler.startElement("td");
+            int currentRow = 1;
+            int currentColumn = 1;
+            handler.startElement("tr");
+            handler.startElement("td");
+            for (Map.Entry<Point, TikaExcelCell> entry : currentSheet.entrySet()) {
+                while (currentRow < entry.getKey().y) {
                     handler.endElement("td");
                     handler.endElement("tr");
                     handler.characters("\n");
+                    handler.startElement("tr");
+                    handler.startElement("td");
                     currentRow++;
+                    currentColumn = 1;
                 }
-                handler.startElement("tr");
 
-                // Process Cells
-                short currentColumn = 0;
-                TikaExcelCell cell = row.getFirstCell();
-                while (cell != null) {
-                    while (currentColumn < cell.getColumn()) {
-                        handler.startElement("td");
-                        handler.endElement("td");
-                        handler.characters("\t");
-                        currentColumn++;
-                    }
-                    handler.startElement("td");
-                    if (cell.getHyperlink() != null) {
-                        handler.startElement("a", "href", cell.getHyperlink());
-                        handler.characters(cell.getText());
-                        handler.endElement("a");
-                    } else {
-                        handler.characters(cell.getText());
-                    }
+                while (currentColumn < entry.getKey().x) {
                     handler.endElement("td");
-                    cell = cell.getNextCell();
+                    handler.characters("\t");
+                    handler.startElement("td");
+                    currentColumn++;
                 }
 
-                handler.endElement("tr");
-                row = row.getNextRow();
+                TikaExcelCell cell = entry.getValue();
+                if (cell.getHyperlink() != null) {
+                    handler.startElement("a", "href", cell.getHyperlink());
+                    handler.characters(cell.getText());
+                    handler.endElement("a");
+                } else {
+                    handler.characters(cell.getText());
+                }
             }
+            handler.endElement("td");
+            handler.endElement("tr");
             
             // Sheet End
             handler.endElement("tbody");
@@ -384,197 +388,11 @@
     // ======================================================================
 
     /**
-     * Tika's excel sheet representation.
-     */
-    private static class TikaExcelSheet {
-        private String name;
-        private TikaExcelRow firstRow;
-        private TikaExcelRow lastRow;
-        private int rowCount;
-        private int cellCount;
-
-        /**
-         * Construct a new sheet instance.
-         *
-         * @param name The name of the sheet
-         */
-        TikaExcelSheet(String name) {
-            this.name = name;
-        }
-
-        /**
-         * Add a cell to the sheet.
-         *
-         * @param row The cell's row number
-         * @param column The cell's column number
-         * @param text The cell's text
-         */
-        void addCell(int row, short column, String text) {
-
-            // Create row if required
-            if (lastRow == null || lastRow.row != row) {
-                TikaExcelRow newRow = new TikaExcelRow(row);
-                rowCount++;
-                if (lastRow == null) {
-                    firstRow = newRow;
-                } else {
-                    lastRow.setNextRow(newRow);
-                }
-                lastRow = newRow;
-            }
-
-            cellCount++;
-
-            // Add a cell
-            lastRow.addCell(new TikaExcelCell(column, text));
-        }
-
-        /**
-         * Find a cell in a sheet.
-         *
-         * @param row The cell's row number
-         * @param column The cell's column number
-         * @return The cell or null if not found
-         */
-        TikaExcelCell findCell(int row, short column) {
-
-            TikaExcelRow currentRow = firstRow;
-            while (currentRow != null && currentRow.getRow() < row) {
-                currentRow = currentRow.getNextRow();
-            }
-            if (currentRow != null && currentRow.getRow() == row) {
-                TikaExcelCell currentCell = currentRow.getFirstCell();
-                while (currentCell != null && currentCell.getColumn() < column) {
-                    currentCell = currentCell.getNextCell();
-                }
-                if (currentCell != null && currentCell.getColumn() == column) {
-                    return currentCell;
-                }
-            }
-            return null;
-        }
-
-        /**
-         * Return the number of cells in the sheet.
-         *
-         * @return the number of cells in the sheet
-         */
-        int getCellCount() {
-            return cellCount;
-        }
-
-        /**
-         * Return the number of rows in the sheet.
-         *
-         * @return the number of cells in the sheet
-         */
-        int getRowCount() {
-            return rowCount;
-        }
-
-        /**
-         * Return the first row.
-         *
-         * @return the first row
-         */
-        TikaExcelRow getFirstRow() {
-            return firstRow;
-        }
-
-        /**
-         * Return the name of the sheet.
-         *
-         * @return the name of the sheet
-         */
-        String getName() {
-            return name;
-        }
-        
-    }
-
-    // ======================================================================
-
-    /**
-     * Tika's excel row representation. 
-     */
-    private static class TikaExcelRow {
-        private final int row;
-        private TikaExcelRow nextRow;
-        private TikaExcelCell firstCell;
-        private TikaExcelCell lastCell;
-
-        /**
-         * Construct a new Row instance.
-         *
-         * @param row The row number
-         */
-        TikaExcelRow(int row) {
-            this.row = row;
-        }
-
-        /**
-         * Add a cell to the row.
-         *
-         * @param newCell the new cell to add
-         */
-        void addCell(TikaExcelCell newCell) {
-            if (lastCell != null) {
-                lastCell.setNextCell(newCell);
-            }
-            this.lastCell = newCell;
-            if (firstCell == null) {
-                firstCell = newCell;
-            }
-        }
-
-        /**
-         * Return the first cell in the row.
-         *
-         * @return the first cell in the row
-         */
-        TikaExcelCell getFirstCell() {
-            return firstCell;
-        }
-
-        /**
-         * Return the row number.
-         *
-         * @return the row number
-         */
-        int getRow() {
-            return row;
-        }
-        
-        /**
-         * Return the next row in the sheet.
-         *
-         * @return the next row in the sheet
-         */
-        TikaExcelRow getNextRow() {
-            return nextRow;
-        }
-
-        /**
-         * Set the next row in the sheet.
-         *
-         * @param nextRow the next row in the sheet
-         */
-        void setNextRow(TikaExcelRow nextRow) {
-            this.nextRow = nextRow;
-        }
-        
-    }
-
-    // ======================================================================
-
-    /**
      * Tika's excel cell representation. 
      */
     private static class TikaExcelCell {
-        private final short column;
         private String text;
         private String hyperlink;
-        private TikaExcelCell nextCell;
 
         /**
          * Construct a new cell.
@@ -582,30 +400,11 @@
          * @param column The cell's column number
          * @param text The cell's text
          */
-        TikaExcelCell(short column, String text) {
-            this.column = column;
+        TikaExcelCell(String text) {
             this.text = text;
         }
 
         /**
-         * Return the cell's column number
-         *
-         * @return the cell's column number
-         */
-        short getColumn() {
-            return column;
-        }
-
-        /**
-         * Return the next cell in the row.
-         *
-         * @return the next cell in the row
-         */
-        TikaExcelCell getNextCell() {
-            return nextCell;
-        }
-
-        /**
          * Return the cell's text.
          *
          * @return the cell's text
@@ -632,14 +431,6 @@
             this.hyperlink = hyperlink;
         }
 
-        /**
-         * Set the next cell in the row.
-         *
-         * @param nextCell next cell in the row
-         */
-        void setNextCell(TikaExcelCell nextCell) {
-            this.nextCell = nextCell;
-        }
-        
     }
+
 }