You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2010/01/08 17:35:47 UTC
svn commit: r897253 - in /lucene/tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
Author: dmeikle
Date: Fri Jan 8 16:35:34 2010
New Revision: 897253
URL: http://svn.apache.org/viewvc?rev=897253&view=rev
Log:
TIKA-103: Addition of POI supported number/date formatting handling within ExcelParser
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=897253&r1=897252&r2=897253&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Fri Jan 8 16:35:34 2010
@@ -27,6 +27,7 @@
import java.util.SortedMap;
import java.util.TreeMap;
+import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
@@ -42,7 +43,6 @@
import org.apache.poi.hssf.record.HyperlinkRecord;
import org.apache.poi.hssf.record.TextObjectRecord;
import org.apache.poi.hssf.record.UnicodeString;
-//import org.apache.poi.hssf.record.HyperlinkRecord; // FIXME - requires POI release
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NumberRecord;
@@ -116,34 +116,8 @@
protected void parse(
POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
Locale locale) throws IOException, SAXException {
- // Set up listener and register the records we want to process
- TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale);
- HSSFRequest hssfRequest = new HSSFRequest();
- if (listenForAllRecords) {
- hssfRequest.addListenerForAllRecords(listener);
- } else {
- hssfRequest.addListener(listener, BOFRecord.sid);
- hssfRequest.addListener(listener, EOFRecord.sid);
- hssfRequest.addListener(listener, DateWindow1904Record.sid);
- hssfRequest.addListener(listener, CountryRecord.sid);
- hssfRequest.addListener(listener, BoundSheetRecord.sid);
- hssfRequest.addListener(listener, FormatRecord.sid);
- hssfRequest.addListener(listener, ExtendedFormatRecord.sid);
- hssfRequest.addListener(listener, SSTRecord.sid);
- hssfRequest.addListener(listener, FormulaRecord.sid);
- hssfRequest.addListener(listener, LabelRecord.sid);
- hssfRequest.addListener(listener, LabelSSTRecord.sid);
- hssfRequest.addListener(listener, NumberRecord.sid);
- hssfRequest.addListener(listener, RKRecord.sid);
- hssfRequest.addListener(listener, HyperlinkRecord.sid);
- hssfRequest.addListener(listener, TextObjectRecord.sid);
- }
-
- // Create event factory and process Workbook (fire events)
- DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
- HSSFEventFactory eventFactory = new HSSFEventFactory();
-
- eventFactory.processEvents(hssfRequest, documentInputStream);
+ TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale);
+ listener.processFile(filesystem, isListenForAllRecords());
listener.throwStoredException();
}
@@ -170,6 +144,12 @@
private SSTRecord sstRecord;
/**
+ * Internal <code>FormatTrackingHSSFListener</code> to handle cell
+ * formatting within the extraction.
+ */
+ private FormatTrackingHSSFListener formatListener;
+
+ /**
* List of worksheet names.
*/
private List<String> sheetNames = new ArrayList<String>();
@@ -206,6 +186,47 @@
}
/**
+ * Entry point to listener to start the processing of a file.
+ *
+ * @param filesystem POI file system.
+ * @param listenForAllRecords sets whether the listener is configured to listen
+ * for all record types or not.
+ * @throws IOException on any IO errors.
+ * @throws SAXException on any SAX parsing errors.
+ */
+ public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
+ throws IOException, SAXException {
+
+ // Set up listener and register the records we want to process
+ formatListener = new FormatTrackingHSSFListener(this);
+ HSSFRequest hssfRequest = new HSSFRequest();
+ if (listenForAllRecords) {
+ hssfRequest.addListenerForAllRecords(formatListener);
+ } else {
+ hssfRequest.addListener(formatListener, BOFRecord.sid);
+ hssfRequest.addListener(formatListener, EOFRecord.sid);
+ hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
+ hssfRequest.addListener(formatListener, CountryRecord.sid);
+ hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
+ hssfRequest.addListener(formatListener, SSTRecord.sid);
+ hssfRequest.addListener(formatListener, FormulaRecord.sid);
+ hssfRequest.addListener(formatListener, LabelRecord.sid);
+ hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
+ hssfRequest.addListener(formatListener, NumberRecord.sid);
+ hssfRequest.addListener(formatListener, RKRecord.sid);
+ hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
+ hssfRequest.addListener(formatListener, TextObjectRecord.sid);
+ hssfRequest.addListener(formatListener, FormatRecord.sid);
+ hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
+ }
+
+ // Create event factory and process Workbook (fire events)
+ DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
+ HSSFEventFactory eventFactory = new HSSFEventFactory();
+ eventFactory.processEvents(hssfRequest, documentInputStream);
+ }
+
+ /**
* Process a HSSF record.
*
* @param record HSSF Record
@@ -273,7 +294,7 @@
case NumberRecord.sid: // Contains a numeric cell value
NumberRecord number = (NumberRecord) record;
- addCell(record, new NumberCell(number.getValue(), format));
+ addTextCell(record, formatListener.formatNumberDateCell(number));
break;
case RKRecord.sid: // Excel internal number record
@@ -330,7 +351,7 @@
*
* @param record record that holds the text value
* @param text text content, may be <code>null</code>
- * @throws SAXException
+ * @throws SAXException
*/
private void addTextCell(Record record, String text) throws SAXException {
if (text != null) {
@@ -380,12 +401,13 @@
}
handler.endElement("td");
handler.endElement("tr");
-
+
// Sheet End
handler.endElement("tbody");
handler.endElement("table");
handler.endElement("div");
}
+
}
/**
Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=897253&r1=897252&r2=897253&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Fri Jan 8 16:35:34 2010
@@ -52,4 +52,67 @@
}
}
+ public void testExcelParserFormatting() throws Exception {
+ InputStream input = ExcelParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL-formats.xls");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new OfficeParser().parse(input, handler, metadata);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+
+ // Number #,##0.00
+ assertTrue(content.contains("1,599.99"));
+ assertTrue(content.contains("-1,599.99"));
+
+ // Currency $#,##0.00;[Red]($#,##0.00)
+ assertTrue(content.contains("$1,599.99"));
+ assertTrue(content.contains("($1,599.99)"));
+
+ // Scientific 0.00E+00
+ assertTrue(content.contains("1.98E08"));
+ assertTrue(content.contains("-1.98E08"));
+
+ // Percentage
+ assertTrue(content.contains("2%"));
+ assertTrue(content.contains("2.50%"));
+
+ // Time Format: h:mm
+ assertTrue(content.contains("6:15"));
+ assertTrue(content.contains("18:15"));
+
+ // Date Format: d-mmm-yy
+ assertTrue(content.contains("17-May-07"));
+
+ // The assertions below represent outstanding formatting issues to be addressed;
+ // they are included to allow the issues to be progressed with the Apache POI
+ // team - see TIKA-103.
+
+ /*************************************************************************
+ // Date Format: m/d/yy
+ assertTrue(content.contains("03/10/2009"));
+
+ // Date/Time Format
+ assertTrue(content.contains("19/01/2008 04:35"));
+
+ // Custom Number (0 "dollars and" .00 "cents")
+ assertTrue(content.contains("19 dollars and .99 cents"));
+
+ // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+ assertTrue(content.contains("At 4:20 AM on Thursday May 17, 2007"));
+
+ // Fraction (2.5): # ?/?
+ assertTrue(content.contains("2 1 / 2"));
+ **************************************************************************/
+
+ } finally {
+ input.close();
+ }
+ }
+
}