You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/12/21 10:17:15 UTC

svn commit: r606141 - /incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java

Author: jukka
Date: Fri Dec 21 01:17:15 2007
New Revision: 606141

URL: http://svn.apache.org/viewvc?rev=606141&view=rev
Log:
TIKA-105 - Excel parser implementation based on POI's Event API
    - New class contributed by Niall Pemberton

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java?rev=606141&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java Fri Dec 21 01:17:15 2007
@@ -0,0 +1,486 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
+import org.apache.poi.hssf.eventusermodel.HSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.BoundSheetRecord;
+import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.CountryRecord;
+import org.apache.poi.hssf.record.DateWindow1904Record;
+import org.apache.poi.hssf.record.EOFRecord;
+import org.apache.poi.hssf.record.ExtendedFormatRecord;
+import org.apache.poi.hssf.record.FormatRecord;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.RKRecord;
+import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.hssf.record.UnicodeString;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * Excel parser implementation which uses POI's Event API
+ * to handle the contents of a Workbook.
+ * <p>
+ * This is an alternative implementation to Tika's
+ * {@link ExcelParser} implementation which uses POI's
+ * <code>HSSFWorkbook</code> to parse excel files.
+ * <p>
+ * The Event API uses a much smaller memory footprint than
+ * <code>HSSFWorkbook</code> when processing excel files
+ * but at the cost of more complexity.
+ * <p>
+ * With the Event API a <i>listener</i> is registered for
+ * specific record types and those records are created,
+ * fired off to the listener and then discarded as the stream
+ * is being processed.
+ *
+ * @see org.apache.poi.hssf.eventusermodel.HSSFListener
+ * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
+ * POI Event API How To</a>
+ */
+public class ExcelEventParser extends OfficeParser implements Serializable {
+
+    /** Logging instance */
+    private static Log log = LogFactory.getLog(ExcelEventParser.class);
+
+    /**
+     * <code>true</code> if the HSSFListener should be registered
+     * to listen for all records or <code>false</code> if the listener
+     * should be configured to only receive specified records.
+     */
+    private final boolean listenForAllRecords;
+
+    /**
+     * Create a parser whose listener is registered only for the
+     * specified record types, i.e. with <code>listenForAllRecords</code>
+     * set to <code>false</code>.
+     */
+    public ExcelEventParser() {
+        this.listenForAllRecords = false;
+    }
+
+    /**
+     * Create an instance specifying whether to listen for all
+     * records or just for the specified few.
+     * <p>
+     * <strong>Note:</strong> this constructor is intended primarily
+     * for testing and debugging - under normal operation
+     * <code>listenForAllRecords</code> should be <code>false</code>.
+     * The flag is fixed for the lifetime of the instance.
+     *
+     * @param listenForAllRecords <code>true</code> if the HSSFListener
+     * should be registered to listen for all records or <code>false</code>
+     * if the listener should be configured to only receive specified records.
+     */
+    public ExcelEventParser(boolean listenForAllRecords) {
+        this.listenForAllRecords = listenForAllRecords;
+    }
+
+    /**
+     * Return the content type handled by this parser.
+     *
+     * @return The content type handled
+     */
+    protected String getContentType() {
+        return "application/vnd.ms-excel";
+    }
+
+    /**
+     * Extracts text from an Excel Workbook writing the extracted content
+     * to the specified {@link Appendable}.
+     * <p>
+     * The "Workbook" document stream is always closed before this method
+     * returns, whether processing succeeds or fails. A write failure that
+     * the listener had to wrap in a <code>RuntimeException</code> is
+     * unwrapped here and rethrown as the original <code>IOException</code>.
+     *
+     * @param filesystem POI file system
+     * @param appendable Where to output the parsed contents
+     * @throws IOException if an error occurs processing the workbook
+     * or writing the extracted content
+     */
+    protected void extractText(final POIFSFileSystem filesystem,
+            final Appendable appendable) throws IOException {
+
+        if (log.isInfoEnabled()) {
+            log.info("Starting listenForAllRecords=" + listenForAllRecords);
+        }
+
+        // Set up listener and register the records we want to process
+        TikaHSSFListener listener = new TikaHSSFListener(appendable);
+        HSSFRequest hssfRequest = new HSSFRequest();
+        if (listenForAllRecords) {
+            hssfRequest.addListenerForAllRecords(listener);
+        } else {
+            hssfRequest.addListener(listener, BOFRecord.sid);
+            hssfRequest.addListener(listener, EOFRecord.sid);
+            hssfRequest.addListener(listener, DateWindow1904Record.sid);
+            hssfRequest.addListener(listener, CountryRecord.sid);
+            hssfRequest.addListener(listener, BoundSheetRecord.sid);
+            hssfRequest.addListener(listener, FormatRecord.sid);
+            hssfRequest.addListener(listener, ExtendedFormatRecord.sid);
+            hssfRequest.addListener(listener, SSTRecord.sid);
+            hssfRequest.addListener(listener, FormulaRecord.sid);
+            hssfRequest.addListener(listener, LabelRecord.sid);
+            hssfRequest.addListener(listener, LabelSSTRecord.sid);
+            hssfRequest.addListener(listener, NumberRecord.sid);
+            hssfRequest.addListener(listener, RKRecord.sid);
+        }
+
+        // Create event factory and process Workbook (fire events)
+        DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
+        try {
+            HSSFEventFactory eventFactory = new HSSFEventFactory();
+            eventFactory.processEvents(hssfRequest, documentInputStream);
+        } catch (RuntimeException e) {
+            // The listener's processRecord() declares no checked exceptions, so
+            // write failures arrive wrapped; restore the documented contract.
+            if (e.getCause() instanceof IOException) {
+                throw (IOException) e.getCause();
+            }
+            throw e;
+        } finally {
+            // Release the stream on both the success and the failure path
+            documentInputStream.close();
+        }
+
+        if (log.isInfoEnabled()) {
+            log.info("Processed " + listener.getRecordCount() + " records");
+        }
+    }
+
+    // ======================================================================
+
+    /**
+     * HSSF Listener implementation which processes the HSSF records.
+     */
+    private static class TikaHSSFListener implements HSSFListener, Serializable {
+
+        /** Logging instance (deliberately uses the outer parser's log category) */
+        private static final Log log = LogFactory.getLog(ExcelEventParser.class);
+
+        /** Destination the extracted text is written to */
+        private final Appendable appendable;
+        /** Number of records passed to processRecord() so far */
+        private int recordCount;
+        /** Most recent shared string table record; resolves LabelSSTRecord indexes */
+        private SSTRecord sstRecord;
+        /** FormatRecord index code -> format string */
+        private final Map<Short, String> formats        = new HashMap<Short, String>();
+        /** Ordinal of a cell-type ExtendedFormatRecord -> its data format index */
+        private final Map<Short, Short> extendedFormats = new HashMap<Short, Short>();
+        /** Sheet names in the order their BoundSheetRecords were received */
+        private final List<String> sheetNames = new ArrayList<String>();
+        /** Type of the most recent BOFRecord; reset to 0 when an EOFRecord is seen */
+        private short bofRecordType;
+        /** Default country from the CountryRecord (only used for debug logging) */
+        private short defualtCountry;
+        /** Current country from the CountryRecord (only used for debug logging) */
+        private short currentCountry;
+        /** Running count of ExtendedFormatRecords received */
+        private short currentXFormatIdx;
+        /** Index of the worksheet being processed; set to -1 at the workbook BOF */
+        private short currentSheetIndex;
+        /** Name of the worksheet being processed, or null if none is known */
+        private String currentSheetName;
+        /** True until the first text value is written; controls the separator */
+        private boolean firstElement = true;
+        /** Whether the workbook declared 1904 date windowing (currently only logged) */
+        private boolean use1904windowing = false;
+
+        /**
+         * Construct a new listener instance outputting parsed data to
+         * the specified Appendable.
+         *
+         * @param appendable Destination to write the parsed output to
+         */
+        private TikaHSSFListener(final Appendable appendable) {
+            this.appendable = appendable;
+        }
+
+        /**
+         * Report how many records have been handed to
+         * {@link #processRecord(Record)} so far.
+         *
+         * @return The number of records processed by this listener
+         */
+        private int getRecordCount() {
+            return this.recordCount;
+        }
+
+        /**
+         * Process a HSSF record.
+         * <p>
+         * Every call increments the record count. Workbook level records
+         * (sheet names, formats, extended formats, shared strings, date
+         * windowing, country) update this listener's state; cell value
+         * records received while inside a worksheet are passed on to
+         * <code>processCellValue</code>. Anything else is only logged
+         * at debug level.
+         *
+         * @param record HSSF Record
+         */
+        public void processRecord(final Record record) {
+            recordCount++;
+            final short sid = record.getSid();
+            switch (sid) {
+
+                /* BOFRecord: indicates start of workbook, worksheet etc. records */
+                case BOFRecord.sid:
+                    BOFRecord bofRecord = (BOFRecord)record;
+                    bofRecordType = bofRecord.getType();
+                    switch (bofRecordType) {
+                        case BOFRecord.TYPE_WORKBOOK:
+                            // So that the first worksheet BOF increments the index to 0
+                            currentSheetIndex = -1;
+                            debug(record, ".Workbook");
+                            break;
+                        case BOFRecord.TYPE_WORKSHEET:
+                            currentSheetIndex++;
+                            currentSheetName = null;
+                            if (currentSheetIndex < sheetNames.size()) {
+                                currentSheetName = sheetNames.get(currentSheetIndex);
+                            }
+                            if (log.isDebugEnabled()) {
+                                debug(record, ".Worksheet[" + currentSheetIndex
+                                        + "], Name=[" + currentSheetName + "]");
+                            }
+                            // The sheet name is part of the extracted text (null is ignored)
+                            addText(currentSheetName);
+                            break;
+                        default:
+                            if (log.isDebugEnabled()) {
+                                debug(record, "[" + bofRecordType + "]");
+                            }
+                            break;
+                    }
+                    break;
+
+                /* EOFRecord: indicates end of workbook, worksheet etc. records */
+                case EOFRecord.sid:
+                    debug(record);
+                    bofRecordType = 0;
+                    break;
+
+                /* Indicates whether to use 1904 Date Windowing or not */
+                case DateWindow1904Record.sid:
+                    DateWindow1904Record dw1904Rec = (DateWindow1904Record)record;
+                    use1904windowing = (dw1904Rec.getWindowing() == 1);
+                    if (log.isDebugEnabled()) {
+                        debug(record, "[" + use1904windowing + "]");
+                    }
+                    break;
+
+                /* CountryRecord: default and current country (only logged here) */
+                case CountryRecord.sid:
+                    CountryRecord countryRecord = (CountryRecord)record;
+                    defualtCountry = countryRecord.getDefaultCountry();
+                    currentCountry = countryRecord.getCurrentCountry();
+                    if (log.isDebugEnabled()) {
+                        debug(record, " default=[" + defualtCountry
+                                + "], current=[" + currentCountry + "]");
+                    }
+                    break;
+
+                /* SSTRecord: holds all the strings for LabelSSTRecords */
+                case SSTRecord.sid:
+                    sstRecord = (SSTRecord)record;
+                    debug(record);
+                    break;
+
+                /* BoundSheetRecord: Worksheet index record */
+                case BoundSheetRecord.sid:
+                    BoundSheetRecord boundSheetRecord = (BoundSheetRecord)record;
+                    String sheetName = boundSheetRecord.getSheetname();
+                    sheetNames.add(sheetName);
+                    if (log.isDebugEnabled()) {
+                        debug(record, "[" + sheetNames.size()
+                                + "], Name=[" + sheetName + "]");
+                    }
+                    break;
+
+                /* FormatRecord: maps a format index code to its format string */
+                case FormatRecord.sid:
+                    FormatRecord formatRecord = (FormatRecord)record;
+                    String dataFormat = formatRecord.getFormatString();
+                    short formatIdx = formatRecord.getIndexCode();
+                    formats.put(formatIdx, dataFormat);
+                    if (log.isDebugEnabled()) {
+                        debug(record, "[" + formatIdx + "]=[" + dataFormat + "]");
+                    }
+                    break;
+
+                /* ExtendedFormatRecord: links a cell's XF index to a data format */
+                case ExtendedFormatRecord.sid:
+                    ExtendedFormatRecord xFormatRecord = (ExtendedFormatRecord)record;
+                    if (xFormatRecord.getXFType() == ExtendedFormatRecord.XF_CELL) {
+                        short dataFormatIdx = xFormatRecord.getFormatIndex();
+                        if (dataFormatIdx > 0) {
+                            extendedFormats.put(currentXFormatIdx, dataFormatIdx);
+                            if (log.isDebugEnabled()) {
+                                debug(record, "[" + currentXFormatIdx
+                                        + "]=FormatRecord[" + dataFormatIdx + "]");
+                            }
+                        }
+                    }
+                    // Counted for every XF record (not only the stored ones) because
+                    // cell records look the entry up by the record's ordinal position
+                    currentXFormatIdx++;
+                    break;
+
+                default:
+                    // Cell values only count while inside a worksheet (between its
+                    // BOF and EOF); everything else is just logged
+                    if (bofRecordType == BOFRecord.TYPE_WORKSHEET
+                            && record instanceof CellValueRecordInterface) {
+                        processCellValue(sid, (CellValueRecordInterface)record);
+                    } else {
+                        debug(record);
+                    }
+                    break;
+            }
+        }
+
+        /**
+         * Process a Cell Value record.
+         * <p>
+         * Formula, label, shared-string label, number and RK records are
+         * converted to text and handed to {@link #addText(String)}; any
+         * other cell value record produces no text. A shared-string label
+         * that cannot be resolved (no SSTRecord received yet, or no string
+         * at its index) is skipped rather than failing the whole parse.
+         *
+         * @param sid record type identifier
+         * @param record The cell value record
+         */
+        private void processCellValue(final short sid,
+                final CellValueRecordInterface record) {
+
+            // Resolve the cell's data format: XF index -> format index -> format string
+            short xfIdx = record.getXFIndex();
+            Short dfIdx = extendedFormats.get(xfIdx);
+            String dataFormat = dfIdx != null ? formats.get(dfIdx) : null;
+            String str = null;
+            switch (sid) {
+
+                /* FormulaRecord: Cell value from a formula */
+                case FormulaRecord.sid:
+                    FormulaRecord formulaRecord = (FormulaRecord)record;
+                    double fmlValue = formulaRecord.getValue();
+                    str = toString(fmlValue, dfIdx, dataFormat);
+                    str = addText(str);
+                    break;
+
+                /* LabelRecord: strings stored directly in the cell */
+                case LabelRecord.sid:
+                    LabelRecord labelRecord = (LabelRecord)record;
+                    str = addText(labelRecord.getValue());
+                    break;
+
+                /* LabelSSTRecord: Ref. a string in the shared string table */
+                case LabelSSTRecord.sid:
+                    LabelSSTRecord labelSSTRecord = (LabelSSTRecord)record;
+                    if (sstRecord == null) {
+                        // No shared string table received, so the reference
+                        // cannot be resolved; skip the cell instead of an NPE
+                        log.warn("LabelSSTRecord received before any SSTRecord"
+                                + " - cell text skipped");
+                        break;
+                    }
+                    int sstIndex = labelSSTRecord.getSSTIndex();
+                    UnicodeString unicodeStr = sstRecord.getString(sstIndex);
+                    if (unicodeStr != null) {
+                        str = addText(unicodeStr.getString());
+                    }
+                    break;
+
+                /* NumberRecord: Contains a numeric cell value */
+                case NumberRecord.sid:
+                    double numValue = ((NumberRecord)record).getValue();
+                    str = toString(numValue, dfIdx, dataFormat);
+                    str = addText(str);
+                    break;
+
+                /* RKRecord: Excel internal number record */
+                case RKRecord.sid:
+                    double rkValue = ((RKRecord)record).getRKNumber();
+                    str = toString(rkValue, dfIdx, dataFormat);
+                    str = addText(str);
+                    break;
+
+                /* Other cell value records contribute no text */
+                default:
+                    break;
+            }
+
+            // =========== Debug Mess: START ===========
+            if (log.isDebugEnabled()) {
+                StringBuilder builder = new StringBuilder();
+                builder.append('[');
+                // builder.append(ExcelUtils.columnIndexToLabel(record.getColumn()));
+                builder.append(record.getColumn());
+                builder.append(":");
+                builder.append((record.getRow() + 1));
+                builder.append(']');
+                if (dfIdx != null) {
+                    builder.append(" xfIdx[");
+                    builder.append(xfIdx).append(']');
+                    builder.append("=dfIdx[");
+                    builder.append(dfIdx);
+                    builder.append(']');
+                    if (dataFormat != null) {
+                        builder.append("=[");
+                        builder.append(dataFormat);
+                        builder.append(']');
+                    }
+                }
+                builder.append(", value=[");
+                if (str != null && str.length() > 0) {
+                    builder.append(str);
+                }
+                builder.append(']');
+                debug((Record)record, builder.toString());
+            }
+            // =========== Debug Mess: END =============
+        }
+
+        /**
+         * Render a numeric excel cell value as text.
+         * <p>
+         * The format arguments are accepted but not used yet: the value
+         * is rendered with <code>Double.toString</code>.
+         *
+         * @param value The cell value
+         * @param dfIdx The data format index
+         * @param dataFormat The data format
+         * @return The value as a string, or <code>null</code> if the
+         * value is NaN
+         */
+        private String toString(double value, Short dfIdx, String dataFormat) {
+            // **** TODO: Data Format parsing ****
+            // return ExcelUtils.format(value, dfIdx, dataFormat, use1904windowing);
+            return Double.isNaN(value) ? null : Double.toString(value);
+        }
+
+        /**
+         * Add a parsed text value to this listners appendable.
+         * <p>
+         * Null and zero length values are ignored.
+         *
+         * @param text The text value
+         * @return the added text
+         */
+        private String addText(String text) {
+            if (text != null) {
+                text = text.trim();
+                if (text.length() > 0) {
+                    try {
+                        if (!firstElement) {
+                            appendable.append(" ");
+                        }
+                        appendable.append(text);
+                    } catch (Exception e) {
+                        throw new RuntimeException(e);
+                    }
+                    firstElement = false;
+                }
+            }
+            return text;
+        }
+
+        /**
+         * Record debugging.
+         *
+         * @param record The Record
+         */
+        private void debug(Record record) {
+            debug(record, "");
+        }
+
+        /**
+         * Record debugging.
+         *
+         * @param record The Record
+         * @param msg Debug Message
+         */
+        private void debug(Record record, String msg) {
+            if (log.isDebugEnabled()) {
+                String className = record.getClass().getSimpleName();
+                String text = (msg == null ? className :  className + msg);
+                if (record.getSid() == BOFRecord.sid ||
+                    record.getSid() == EOFRecord.sid) {
+                    log.debug(text);
+                } else {
+                    log.debug("    " + text);
+                }
+            }
+        }
+    }
+}