You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/26 17:47:39 UTC

svn commit: r641394 - in /incubator/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java

Author: jukka
Date: Wed Mar 26 09:47:25 2008
New Revision: 641394

URL: http://svn.apache.org/viewvc?rev=641394&view=rev
Log:
TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
    - Patch by Niall Pemberton

Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=641394&r1=641393&r2=641394&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Wed Mar 26 09:47:25 2008
@@ -35,6 +35,9 @@
 
 15. TIKA-133 - TeeContentHandler constructor should use varargs (Jukka Zitting)
 
+16. TIKA-132 - Refactor Excel extractor to parse per sheet and add
+               hyperlink support (Niall Pemberton)
+
 Release 0.1-incubating - 12/27/2007
 
 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=641394&r1=641393&r2=641394&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Mar 26 09:47:25 2008
@@ -35,6 +35,7 @@
 import org.apache.poi.hssf.record.ExtendedFormatRecord;
 import org.apache.poi.hssf.record.FormatRecord;
 import org.apache.poi.hssf.record.FormulaRecord;
+//import org.apache.poi.hssf.record.HyperlinkRecord;  // FIXME - requires POI release
 import org.apache.poi.hssf.record.LabelRecord;
 import org.apache.poi.hssf.record.LabelSSTRecord;
 import org.apache.poi.hssf.record.NumberRecord;
@@ -131,6 +132,7 @@
             hssfRequest.addListener(listener, LabelSSTRecord.sid);
             hssfRequest.addListener(listener, NumberRecord.sid);
             hssfRequest.addListener(listener, RKRecord.sid);
+            //hssfRequest.addListener(listener, HyperlinkRecord.sid); // FIXME - requires POI release
         }
 
         // Create event factory and process Workbook (fire events)
@@ -158,9 +160,7 @@
 
         private boolean insideWorksheet = false;
 
-        private int currentRow;
-
-        private short currentColumn;
+        private TikaExcelSheet currentSheet;
 
         /**
          * Contstruct a new listener instance outputting parsed data to
@@ -212,16 +212,8 @@
                             if (currentSheetIndex < sheetNames.size()) {
                                 currentSheetName = sheetNames.get(currentSheetIndex);
                             }
-                            handler.startElement("div", "class", "page");
-                            handler.element("h1", currentSheetName);
-                            handler.characters("\n");
-                            handler.startElement("table");
-                            handler.startElement("tbody");
-                            handler.startElement("tr");
-                            handler.startElement("td");
+                            currentSheet = new TikaExcelSheet(currentSheetName);
                             insideWorksheet = true;
-                            currentRow = 0;
-                            currentColumn = 0;
                             break;
                     }
                     break;
@@ -229,12 +221,7 @@
                 /* EOFRecord: indicates end of workbook, worksheet etc. records */
                 case EOFRecord.sid:
                     if (insideWorksheet) {
-                        handler.endElement("td");
-                        handler.endElement("tr");
-                        handler.endElement("tbody");
-                        handler.endElement("table");
-                        handler.endElement("div");
-                        handler.characters("\n");
+                        processSheet();
                         insideWorksheet = false;
                     }
                     break;
@@ -251,6 +238,20 @@
                     sheetNames.add(sheetName);
                     break;
 
+                // FIXME - requires POI release
+                ///* HyperlinkRecord: holds a URL associated with a cell */
+                //case HyperlinkRecord.sid:
+                //    HyperlinkRecord hyperlinkRecord = (HyperlinkRecord)record;
+                //    if (insideWorksheet) {
+                //        int row = hyperlinkRecord.getFirstRow();
+                //        short column =  hyperlinkRecord.getFirstColumn();
+                //        TikaExcelCell cell = currentSheet.findCell(row, column);
+                //        if (cell != null) {
+                //            cell.setHyperlink(hyperlinkRecord.getAddress());
+                //        }
+                //    }
+                //    break;
+
                 default:
                     if (insideWorksheet
                             && record instanceof CellValueRecordInterface) {
@@ -271,71 +272,374 @@
         private void processCellValue(
                 short sid, CellValueRecordInterface record)
                 throws SAXException {
-            while (currentRow < record.getRow()) {
-                handler.endElement("td");
-                handler.endElement("tr");
-                handler.characters("\n");
-                handler.startElement("tr");
-                handler.startElement("td");
-                currentRow++;
-                currentColumn = 0;
-            }
-            while (currentColumn < record.getColumn()) {
-                handler.endElement("td");
-                handler.characters("\t");
-                handler.startElement("td");
-                currentColumn++;
-            }
 
+            String text = null;
             switch (sid) {
                 /* FormulaRecord: Cell value from a formula */
                 case FormulaRecord.sid:
                     FormulaRecord formulaRecord = (FormulaRecord)record;
                     double fmlValue = formulaRecord.getValue();
-                    addText(Double.toString(fmlValue));
+                    text = Double.toString(fmlValue);
                     break;
 
                 /* LabelRecord: strings stored directly in the cell */
                 case LabelRecord.sid:
-                    addText(((LabelRecord) record).getValue());
+                    text = ((LabelRecord)record).getValue();
                     break;
 
                 /* LabelSSTRecord: Ref. a string in the shared string table */
                 case LabelSSTRecord.sid:
                     LabelSSTRecord labelSSTRecord = (LabelSSTRecord) record;
                     int sstIndex = labelSSTRecord.getSSTIndex();
-                    addText(sstRecord.getString(sstIndex).getString());
+                    text = sstRecord.getString(sstIndex).getString();
                     break;
 
                 /* NumberRecord: Contains a numeric cell value */
                 case NumberRecord.sid:
                     double numValue = ((NumberRecord)record).getValue();
-                    addText(Double.toString(numValue));
+                    text = Double.toString(numValue);
                     break;
 
                 /* RKRecord: Excel internal number record */
                 case RKRecord.sid:
                     double rkValue = ((RKRecord)record).getRKNumber();
-                    addText(Double.toString(rkValue));
+                    text = Double.toString(rkValue);
                     break;
             }
+            if (text != null) {
+                text = text.trim();
+            }
+            if (text != null && text.length() > 0) {
+                currentSheet.addCell(record.getRow(), record.getColumn(), text);
+            }
         }
 
         /**
-         * Add a parsed text value to this listners appendable.
-         * <p>
-         * Null and zero length values are ignored.
+         * Process an excel sheet.
          *
-         * @param text The text value
+         * @throws SAXException if an error occurs
          */
-        private void addText(String text) throws SAXException {
-            if (text != null) {
-                text = text.trim();
-                if (text.length() > 0) {
-                    handler.characters(text);
+        private void processSheet() throws SAXException {
+
+            // ignore empty sheets
+            if (currentSheet.getCellCount() == 0) {
+                return;
+            }
+            
+            // Sheet Start
+            handler.startElement("div", "class", "page");
+            handler.element("h1", currentSheet.getName());
+            handler.characters("\n");
+            handler.startElement("table");
+            handler.startElement("tbody");
+
+            // Process Rows
+            int currentRow = 0;
+            TikaExcelRow row = currentSheet.getFirstRow();
+            while (row != null) {
+                while (currentRow < row.getRow()) {
+                    handler.startElement("tr");
+                    handler.startElement("td");
+                    handler.endElement("td");
+                    handler.endElement("tr");
+                    handler.characters("\n");
+                    currentRow++;
+                }
+                handler.startElement("tr");
+
+                // Process Cells
+                short currentColumn = 0;
+                TikaExcelCell cell = row.getFirstCell();
+                while (cell != null) {
+                    while (currentColumn < cell.getColumn()) {
+                        handler.startElement("td");
+                        handler.endElement("td");
+                        handler.characters("\t");
+                        currentColumn++;
+                    }
+                    handler.startElement("td");
+                    if (cell.getHyperlink() != null) {
+                        handler.startElement("a", "href", cell.getHyperlink());
+                        handler.characters(cell.getText());
+                        handler.endElement("a");
+                    } else {
+                        handler.characters(cell.getText());
+                    }
+                    handler.endElement("td");
+                    cell = cell.getNextCell();
                 }
+
+                handler.endElement("tr");
+                row = row.getNextRow();
+            }
+            
+            // Sheet End
+            handler.endElement("tbody");
+            handler.endElement("table");
+            handler.endElement("div");
+            handler.characters("\n");
+        }
+    }
+
+    // ======================================================================
+
+    /**
+     * Tika's excel sheet representation.
+     */
+    private static class TikaExcelSheet {
+        private String name;
+        private TikaExcelRow firstRow;
+        private TikaExcelRow lastRow;
+        private int rowCount;
+        private int cellCount;
+
+        /**
+         * Construct a new sheet instance.
+         *
+         * @param name The name of the sheet
+         */
+        TikaExcelSheet(String name) {
+            this.name = name;
+        }
+
+        /**
+         * Add a cell to the sheet.
+         *
+         * @param row The cell's row number
+         * @param column The cell's column number
+         * @param text The cell's text
+         */
+        void addCell(int row, short column, String text) {
+
+            // Create row if required
+            if (lastRow == null || lastRow.row != row) {
+                TikaExcelRow newRow = new TikaExcelRow(row);
+                rowCount++;
+                if (lastRow == null) {
+                    firstRow = newRow;
+                } else {
+                    lastRow.setNextRow(newRow);
+                }
+                lastRow = newRow;
+            }
+
+            cellCount++;
+
+            // Add a cell
+            lastRow.addCell(new TikaExcelCell(column, text));
+        }
+
+        /**
+         * Find a cell in a sheet.
+         *
+         * @param row The cell's row number
+         * @param column The cell's column number
+         * @return The cell or null if not found
+         */
+        TikaExcelCell findCell(int row, short column) {
+
+            TikaExcelRow currentRow = firstRow;
+            while (currentRow != null && currentRow.getRow() < row) {
+                currentRow = currentRow.getNextRow();
+            }
+            if (currentRow != null && currentRow.getRow() == row) {
+                TikaExcelCell currentCell = currentRow.getFirstCell();
+                while (currentCell != null && currentCell.getColumn() < column) {
+                    currentCell = currentCell.getNextCell();
+                }
+                if (currentCell != null && currentCell.getColumn() == column) {
+                    return currentCell;
+                }
+            }
+            return null;
+        }
+
+        /**
+         * Return the number of cells in the sheet.
+         *
+         * @return the number of cells in the sheet
+         */
+        int getCellCount() {
+            return cellCount;
+        }
+
+        /**
+         * Return the number of rows in the sheet.
+         *
+         * @return the number of rows in the sheet
+         */
+        int getRowCount() {
+            return rowCount;
+        }
+
+        /**
+         * Return the first row.
+         *
+         * @return the first row
+         */
+        TikaExcelRow getFirstRow() {
+            return firstRow;
+        }
+
+        /**
+         * Return the name of the sheet.
+         *
+         * @return the name of the sheet
+         */
+        String getName() {
+            return name;
+        }
+        
+    }
+
+    // ======================================================================
+
+    /**
+     * Tika's excel row representation. 
+     */
+    private static class TikaExcelRow {
+        private final int row;
+        private TikaExcelRow nextRow;
+        private TikaExcelCell firstCell;
+        private TikaExcelCell lastCell;
+
+        /**
+         * Construct a new Row instance.
+         *
+         * @param row The row number
+         */
+        TikaExcelRow(int row) {
+            this.row = row;
+        }
+
+        /**
+         * Add a cell to the row.
+         *
+         * @param newCell the new cell to add
+         */
+        void addCell(TikaExcelCell newCell) {
+            if (lastCell != null) {
+                lastCell.setNextCell(newCell);
             }
+            this.lastCell = newCell;
+            if (firstCell == null) {
+                firstCell = newCell;
+            }
+        }
+
+        /**
+         * Return the first cell in the row.
+         *
+         * @return the first cell in the row
+         */
+        TikaExcelCell getFirstCell() {
+            return firstCell;
+        }
+
+        /**
+         * Return the row number.
+         *
+         * @return the row number
+         */
+        int getRow() {
+            return row;
         }
+        
+        /**
+         * Return the next row in the sheet.
+         *
+         * @return the next row in the sheet
+         */
+        TikaExcelRow getNextRow() {
+            return nextRow;
+        }
+
+        /**
+         * Set the next row in the sheet.
+         *
+         * @param nextRow the next row in the sheet
+         */
+        void setNextRow(TikaExcelRow nextRow) {
+            this.nextRow = nextRow;
+        }
+        
+    }
+
+    // ======================================================================
 
+    /**
+     * Tika's excel cell representation. 
+     */
+    private static class TikaExcelCell {
+        private final short column;
+        private String text;
+        private String hyperlink;
+        private TikaExcelCell nextCell;
+
+        /**
+         * Construct a new cell.
+         *
+         * @param column The cell's column number
+         * @param text The cell's text
+         */
+        TikaExcelCell(short column, String text) {
+            this.column = column;
+            this.text = text;
+        }
+
+        /**
+         * Return the cell's column number
+         *
+         * @return the cell's column number
+         */
+        short getColumn() {
+            return column;
+        }
+
+        /**
+         * Return the next cell in the row.
+         *
+         * @return the next cell in the row
+         */
+        TikaExcelCell getNextCell() {
+            return nextCell;
+        }
+
+        /**
+         * Return the cell's text.
+         *
+         * @return the cell's text
+         */
+        String getText() {
+            return text;
+        }
+
+        /**
+         * Return hyperlink address, if any
+         *
+         * @return the hyperlink address
+         */
+        String getHyperlink() {
+            return hyperlink;
+        }
+
+        /**
+         * Set the hyperlink address
+         *
+         * @param hyperlink the hyperlink address to set
+         */
+        void setHyperlink(String hyperlink) {
+            this.hyperlink = hyperlink;
+        }
+
+        /**
+         * Set the next cell in the row.
+         *
+         * @param nextCell next cell in the row
+         */
+        void setNextCell(TikaExcelCell nextCell) {
+            this.nextCell = nextCell;
+        }
+        
     }
 }