You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2010/01/08 17:35:47 UTC

svn commit: r897253 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java

Author: dmeikle
Date: Fri Jan  8 16:35:34 2010
New Revision: 897253

URL: http://svn.apache.org/viewvc?rev=897253&view=rev
Log:
TIKA-103: Addition of POI supported number/date formatting handling within ExcelParser

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=897253&r1=897252&r2=897253&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Fri Jan  8 16:35:34 2010
@@ -27,6 +27,7 @@
 import java.util.SortedMap;
 import java.util.TreeMap;
 
+import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
 import org.apache.poi.hssf.eventusermodel.HSSFListener;
 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
@@ -42,7 +43,6 @@
 import org.apache.poi.hssf.record.HyperlinkRecord;
 import org.apache.poi.hssf.record.TextObjectRecord;
 import org.apache.poi.hssf.record.UnicodeString;
-//import org.apache.poi.hssf.record.HyperlinkRecord;  // FIXME - requires POI release
 import org.apache.poi.hssf.record.LabelRecord;
 import org.apache.poi.hssf.record.LabelSSTRecord;
 import org.apache.poi.hssf.record.NumberRecord;
@@ -116,34 +116,8 @@
     protected void parse(
             POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
             Locale locale) throws IOException, SAXException {
-        // Set up listener and register the records we want to process
-        TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale);
-        HSSFRequest hssfRequest = new HSSFRequest();
-        if (listenForAllRecords) {
-            hssfRequest.addListenerForAllRecords(listener);
-        } else {
-            hssfRequest.addListener(listener, BOFRecord.sid);
-            hssfRequest.addListener(listener, EOFRecord.sid);
-            hssfRequest.addListener(listener, DateWindow1904Record.sid);
-            hssfRequest.addListener(listener, CountryRecord.sid);
-            hssfRequest.addListener(listener, BoundSheetRecord.sid);
-            hssfRequest.addListener(listener, FormatRecord.sid);
-            hssfRequest.addListener(listener, ExtendedFormatRecord.sid);
-            hssfRequest.addListener(listener, SSTRecord.sid);
-            hssfRequest.addListener(listener, FormulaRecord.sid);
-            hssfRequest.addListener(listener, LabelRecord.sid);
-            hssfRequest.addListener(listener, LabelSSTRecord.sid);
-            hssfRequest.addListener(listener, NumberRecord.sid);
-            hssfRequest.addListener(listener, RKRecord.sid);
-            hssfRequest.addListener(listener, HyperlinkRecord.sid);
-            hssfRequest.addListener(listener, TextObjectRecord.sid);
-        }
-
-        // Create event factory and process Workbook (fire events)
-        DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
-        HSSFEventFactory eventFactory = new HSSFEventFactory();
-
-        eventFactory.processEvents(hssfRequest, documentInputStream);
+    	TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale);
+    	listener.processFile(filesystem, isListenForAllRecords());
         listener.throwStoredException();
     }
 
@@ -170,6 +144,12 @@
         private SSTRecord sstRecord;
 
         /**
+         * Internal <code>FormatTrackingHSSFListener</code> to handle cell
+         * formatting within the extraction.
+         */
+        private FormatTrackingHSSFListener formatListener;
+
+        /**
          * List of worksheet names.
          */
         private List<String> sheetNames = new ArrayList<String>();
@@ -206,6 +186,47 @@
         }
 
         /**
+         * Entry point of the listener: starts the processing of a file.
+         *
+         * @param filesystem POI file system.
+         * @param listenForAllRecords whether the listener is registered for
+         * all record types or only for the explicitly listed record types.
+         * @throws IOException on any IO errors.
+         * @throws SAXException on any SAX parsing errors.
+         */
+    	public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
+    		throws IOException,	SAXException {
+
+    		// Wrap this listener in a format tracking listener and register the records we want to process
+    		formatListener = new FormatTrackingHSSFListener(this);
+            HSSFRequest hssfRequest = new HSSFRequest();
+            if (listenForAllRecords) {
+                hssfRequest.addListenerForAllRecords(formatListener);
+            } else {
+                hssfRequest.addListener(formatListener, BOFRecord.sid);
+                hssfRequest.addListener(formatListener, EOFRecord.sid);
+                hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
+                hssfRequest.addListener(formatListener, CountryRecord.sid);
+                hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
+                hssfRequest.addListener(formatListener, SSTRecord.sid);
+                hssfRequest.addListener(formatListener, FormulaRecord.sid);
+                hssfRequest.addListener(formatListener, LabelRecord.sid);
+                hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
+                hssfRequest.addListener(formatListener, NumberRecord.sid);
+                hssfRequest.addListener(formatListener, RKRecord.sid);
+                hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
+                hssfRequest.addListener(formatListener, TextObjectRecord.sid);
+                hssfRequest.addListener(formatListener, FormatRecord.sid);
+                hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
+            }
+
+            // Create the event factory and process the "Workbook" document stream (fires the events)
+            DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
+            HSSFEventFactory eventFactory = new HSSFEventFactory();
+            eventFactory.processEvents(hssfRequest, documentInputStream);
+    	}
+
+        /**
          * Process a HSSF record.
          *
          * @param record HSSF Record
@@ -273,7 +294,7 @@
 
             case NumberRecord.sid: // Contains a numeric cell value
                 NumberRecord number = (NumberRecord) record;
-                addCell(record, new NumberCell(number.getValue(), format));
+                addTextCell(record, formatListener.formatNumberDateCell(number));
                 break;
 
             case RKRecord.sid: // Excel internal number record
@@ -330,7 +351,7 @@
          *
          * @param record record that holds the text value
          * @param text text content, may be <code>null</code>
-         * @throws SAXException 
+         * @throws SAXException
          */
         private void addTextCell(Record record, String text) throws SAXException {
             if (text != null) {
@@ -380,12 +401,13 @@
             }
             handler.endElement("td");
             handler.endElement("tr");
-            
+
             // Sheet End
             handler.endElement("tbody");
             handler.endElement("table");
             handler.endElement("div");
         }
+
     }
 
     /**

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=897253&r1=897252&r2=897253&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Fri Jan  8 16:35:34 2010
@@ -52,4 +52,67 @@
         }
     }
 
+    public void testExcelParserFormatting() throws Exception {
+        InputStream input = ExcelParserTest.class.getResourceAsStream(
+                "/test-documents/testEXCEL-formats.xls");
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/vnd.ms-excel",
+                    metadata.get(Metadata.CONTENT_TYPE));
+
+            String content = handler.toString();
+
+            // Number format: #,##0.00
+            assertTrue(content.contains("1,599.99"));
+            assertTrue(content.contains("-1,599.99"));
+
+            // Currency format: $#,##0.00;[Red]($#,##0.00)
+            assertTrue(content.contains("$1,599.99"));
+            assertTrue(content.contains("($1,599.99)"));
+
+            // Scientific format: 0.00E+00 (NOTE(review): asserted text has no '+' in the exponent - confirm)
+            assertTrue(content.contains("1.98E08"));
+            assertTrue(content.contains("-1.98E08"));
+
+            // Percentage format
+            assertTrue(content.contains("2%"));
+            assertTrue(content.contains("2.50%"));
+
+            // Time Format: h:mm
+            assertTrue(content.contains("6:15"));
+            assertTrue(content.contains("18:15"));
+
+            // Date Format: d-mmm-yy
+            assertTrue(content.contains("17-May-07"));
+
+            // The assertions below cover outstanding formatting issues. They are kept
+            // here, commented out, so the issues can be progressed with the Apache POI
+            // team - see TIKA-103.
+
+            /*************************************************************************
+            // Date Format: m/d/yy
+            assertTrue(content.contains("03/10/2009"));
+
+            // Date/Time Format
+            assertTrue(content.contains("19/01/2008 04:35"));
+
+            // Custom Number (0 "dollars and" .00 "cents")
+            assertTrue(content.contains("19 dollars and .99 cents"));
+
+            // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+            assertTrue(content.contains("At 4:20 AM on Thursday May 17, 2007"));
+
+            // Fraction (2.5): # ?/?
+            assertTrue(content.contains("2 1 / 2"));
+            **************************************************************************/
+
+        } finally {
+            input.close();
+        }
+    }
+
 }