You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 02:02:09 UTC

svn commit: r891092 - in /lucene/tika/trunk/tika-parsers: ./ src/main/java/org/apache/tika/parser/microsoft/ooxml/

Author: jukka
Date: Wed Dec 16 01:02:09 2009
New Revision: 891092

URL: http://svn.apache.org/viewvc?rev=891092&view=rev
Log:
TIKA-353: Upgrade to POI 3.6

Modified:
    lucene/tika/trunk/tika-parsers/pom.xml
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java

Modified: lucene/tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/pom.xml?rev=891092&r1=891091&r2=891092&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/pom.xml (original)
+++ lucene/tika/trunk/tika-parsers/pom.xml Wed Dec 16 01:02:09 2009
@@ -35,7 +35,7 @@
   <url>http://lucene.apache.org/tika/</url>
 
   <properties>
-    <poi.version>3.5-FINAL</poi.version>
+    <poi.version>3.6</poi.version>
   </properties>
 
   <dependencies>

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=891092&r1=891091&r2=891092&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Wed Dec 16 01:02:09 2009
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.util.Locale;
+
 import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.xslf.XSLFSlideShow;
@@ -31,7 +33,8 @@
  */
 public class OOXMLExtractorFactory {
 
-    public static OOXMLExtractor createExtractor(POIXMLTextExtractor extractor) {
+    public static OOXMLExtractor createExtractor(
+            POIXMLTextExtractor extractor, Locale locale) {
         POIXMLDocument document = extractor.getDocument();
 
         if (document instanceof XSLFSlideShow) {
@@ -39,7 +42,7 @@
                     (XSLFPowerPointExtractor) extractor);
         } else if (document instanceof XSSFWorkbook) {
             return new XSSFExcelExtractorDecorator(
-                    (XSSFExcelExtractor) extractor);
+                    (XSSFExcelExtractor) extractor, locale);
         } else if (document instanceof XWPFDocument) {
             return new XWPFWordExtractorDecorator((XWPFWordExtractor) extractor);
         } else {

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=891092&r1=891091&r2=891092&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Wed Dec 16 01:02:09 2009
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Locale;
 
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.extractor.ExtractorFactory;
@@ -42,9 +43,10 @@
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         try {
-            OOXMLExtractor extractor = OOXMLExtractorFactory
-                    .createExtractor((POIXMLTextExtractor) ExtractorFactory
-                            .createExtractor(stream));
+            Locale locale = context.get(Locale.class, Locale.getDefault());
+            OOXMLExtractor extractor = OOXMLExtractorFactory.createExtractor(
+                    (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream),
+                    locale);
             extractor.getMetadataExtractor().extract(metadata);
             extractor.getXHTML(handler, metadata);
         } catch (InvalidFormatException e) {

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=891092&r1=891091&r2=891092&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Wed Dec 16 01:02:09 2009
@@ -17,7 +17,9 @@
 package org.apache.tika.parser.microsoft.ooxml;
 
 import java.io.IOException;
+import java.text.NumberFormat;
 import java.util.Iterator;
+import java.util.Locale;
 
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.ss.usermodel.Cell;
@@ -34,8 +36,18 @@
 
 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
 
-    public XSSFExcelExtractorDecorator(XSSFExcelExtractor extractor) {
+    /**
+     * Format for rendering numbers in the worksheet. Numbers are formatted
+     * with the default number format of the locale given to the constructor.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
+     */
+    private final NumberFormat format;
+
+    public XSSFExcelExtractorDecorator(
+            XSSFExcelExtractor extractor, Locale locale) {
         super(extractor, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+        this.format = NumberFormat.getInstance(locale);
     }
 
     /**
@@ -67,10 +79,16 @@
                     xhtml.startElement("td");
                     Cell cell = ri.next();
 
-                    if (cell.getCellType() == Cell.CELL_TYPE_FORMULA
-                            || cell.getCellType() == Cell.CELL_TYPE_STRING) {
+                    int type = cell.getCellType();
+                    if (type == Cell.CELL_TYPE_FORMULA) {
+                        type = cell.getCachedFormulaResultType();
+                    }
+                    if (type == Cell.CELL_TYPE_STRING) {
                         xhtml.characters(cell.getRichStringCellValue()
                                 .getString());
+                    } else if (type == Cell.CELL_TYPE_NUMERIC) {
+                        xhtml.characters(
+                                format.format(cell.getNumericCellValue()));
                     } else {
                         XSSFCell xc = (XSSFCell) cell;
                         String rawValue = xc.getRawValue();