You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/02 23:09:24 UTC

[tika] 03/03: TIKA-3042 -- allow custom overriding of date formats in .xls and .xlsx

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5276459dd3d52a41a4aaafe2d074c306d2d11b36
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 2 16:20:37 2020 -0500

    TIKA-3042 -- allow custom overriding of date formats in .xls and .xlsx
---
 .../parser/microsoft/AbstractOfficeParser.java     |   5 ++++
 .../tika/parser/microsoft/ExcelExtractor.java      |  18 ++++++------
 .../tika/parser/microsoft/OfficeParserConfig.java  |  18 ++++++++++++
 .../parser/microsoft/TikaExcelDataFormatter.java   |  18 ++++++++++++
 .../ooxml/XSSFExcelExtractorDecorator.java         |   4 +++
 .../tika/parser/microsoft/ExcelParserTest.java     |  15 ++++++++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  11 +++++++
 .../ooxml/tika-config-custom-date-override.xml     |  32 +++++++++++++++++++++
 .../microsoft/tika-config-custom-date-override.xml |  32 +++++++++++++++++++++
 .../test-documents/testEXCEL_dateFormats.xls       | Bin 0 -> 6144 bytes
 .../test-documents/testEXCEL_dateFormats.xlsx      | Bin 0 -> 8766 bytes
 11 files changed, 142 insertions(+), 11 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index 83d8a64..3c51afa 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -145,4 +145,9 @@ public abstract class AbstractOfficeParser extends AbstractParser {
     public void setByteArrayMaxOverride(int maxOverride) {
         IOUtils.setByteArrayMaxOverride(maxOverride);
     }
+
+    @Field
+    public void setDateFormatOverride(String format) {
+        defaultOfficeParserConfig.setDateOverrideFormat(format);
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 3ccd019..ddf6cd4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -66,6 +66,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
@@ -285,6 +286,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             this.formatListener = new TikaFormatTrackingHSSFListener(this, locale);
             this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale);
             this.officeParserConfig = officeParserConfig;
+
+            this.tikaExcelDataFormatter.setDateFormatOverride(officeParserConfig.getDateFormatOverride());
         }
 
         /**
@@ -676,12 +679,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             }
         }
         private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener {
-            //TIKA-2025 -- use this to preserve large numbers in "General" format
-            //against the MS spec.
-            final TikaExcelGeneralFormat generalFormat;
             public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) {
                 super(childListener, locale);
-                generalFormat = new TikaExcelGeneralFormat(locale);
             }
 
             @Override
@@ -693,9 +692,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             @Override
             public String formatNumberDateCell(CellValueRecordInterface cell) {
                 String formatString = this.getFormatString(cell);
-                if (formatString != null && ! formatString.equals("General")) {
-                    return super.formatNumberDateCell(cell);
-                }
 
                 double value;
                 if(cell instanceof NumberRecord) {
@@ -704,10 +700,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                     if(!(cell instanceof FormulaRecord)) {
                         throw new IllegalArgumentException("Unsupported CellValue Record passed in " + cell);
                     }
-
                     value = ((FormulaRecord)cell).getValue();
                 }
-                return generalFormat.format(value);
+                if (DateUtil.isADateFormat(getFormatIndex(cell), formatString)) {
+                    return tikaExcelDataFormatter.formatRawCellContents(value, getFormatIndex(cell), formatString, false);
+                } else if ("general".equalsIgnoreCase(formatString)) {
+                    return tikaExcelDataFormatter.formatRawCellContents(value, getFormatIndex(cell), formatString, false);
+                }
+                return super.formatNumberDateCell(cell);
             }
         }
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index d2bc790..53498b1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -36,6 +36,8 @@ public class OfficeParserConfig implements Serializable {
     private boolean useSAXPptxExtractor = false;
     private boolean extractAllAlternativesFromMSG;
 
+    private String dateOverrideFormat = null;
+
     /**
      * Sets whether or not MSOffice parsers should extract macros.
      * As of Tika 1.15, the default is <code>false</code>.
@@ -244,6 +246,22 @@ public class OfficeParserConfig implements Serializable {
     public void setIncludeSlideMasterContent(boolean includeSlideMasterContent) {
         this.includeSlideMasterContent = includeSlideMasterContent;
     }
+
+    public String getDateFormatOverride() {
+        return dateOverrideFormat;
+    }
+
+    /**
+     * A user may wish to override the date formats in xls and xlsx files.
+     * For example, a user might prefer 'yyyy-mm-dd' to 'mm/dd/yy'.
+     *
+     * Note: these formats are "Excel formats", not Java's SimpleDateFormat patterns.
+     *
+     * @param format the Excel-style format string to store as the date override
+     */
+    public void setDateOverrideFormat(String format) {
+        this.dateOverrideFormat = format;
+    }
 }
 
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
index 7144d73..4192381 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft;
 import java.util.Locale;
 
 import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.poi.util.LocaleUtil;
 
 /**
@@ -28,6 +29,8 @@ import org.apache.poi.util.LocaleUtil;
  */
 public class TikaExcelDataFormatter extends DataFormatter {
 
+    private String dateOverrideFormatString;
+
     public TikaExcelDataFormatter() {
         this(LocaleUtil.getUserLocale());
     }
@@ -38,4 +41,19 @@ public class TikaExcelDataFormatter extends DataFormatter {
         addFormat("general", new TikaExcelGeneralFormat(locale));
     }
 
+    @Override
+    public String formatRawCellContents(double value, int formatIndex, String formatString, boolean use1904Windowing) {
+        // only a cell whose own format is a date format may be switched to the configured override
+        String effectiveFormat = formatString;
+        if (DateUtil.isADateFormat(formatIndex, formatString) && dateOverrideFormatString != null) {
+            effectiveFormat = dateOverrideFormatString;
+        }
+        return super.formatRawCellContents(value, formatIndex, effectiveFormat, use1904Windowing);
+    }
+
+    public void setDateFormatOverride(String dateOverrideFormat) {
+        // null or blank input is ignored, so a previously stored override stays in effect
+        boolean hasText = dateOverrideFormat != null && !dateOverrideFormat.trim().isEmpty();
+        this.dateOverrideFormatString = hasText ? dateOverrideFormat : this.dateOverrideFormatString;
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index a9f747a..d78240e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -99,6 +99,10 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
         } else {
             formatter = new TikaExcelDataFormatter(locale);
         }
+        OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);
+        if (officeParserConfig != null) {
+            ((TikaExcelDataFormatter)formatter).setDateFormatOverride(officeParserConfig.getDateFormatOverride());
+        }
     }
 
     protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index c6b79b6..741398f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -38,6 +38,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
@@ -492,8 +493,8 @@ public class ExcelParserTest extends TikaTest {
         Locale locale = LocaleUtil.getUserLocale();
         DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
         //16 digit number is treated as scientific notation as is the 16 digit formula
-        assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E15</td>\t"+
-                "<td>1"+symbols.getDecimalSeparator()+"23456789012345E15", xml);
+        assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E+15</td>\t"+
+                "<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml);
     }
 
     @Test
@@ -579,4 +580,14 @@ public class ExcelParserTest extends TikaTest {
         String xml = getXML("testEXCEL_WORKBOOK_in_capitals.xls").xml;
         assertContains("Inventarliste", xml);
     }
+
+    @Test
+    public void testDateFormat() throws Exception {
+        // the custom config sets dateFormatOverride to yyyy-mm-dd for the Office parsers
+        Parser parser = new AutoDetectParser(new TikaConfig(
+                this.getClass().getResourceAsStream("tika-config-custom-date-override.xml")));
+        String content = getXML("testEXCEL_dateFormats.xls", parser).xml;
+        assertContains("2018-09-20", content);
+        assertContains("1996-08-10", content);
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 3fb3f98..09b0d25 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1776,6 +1776,17 @@ public class OOXMLParserTest extends TikaTest {
         pc.set(OfficeParserConfig.class, c);
         getRecursiveMetadata("testWORD_truncated.docx", pc);
     }
+
+    @Test
+    public void testDateFormat() throws Exception {
+        // the custom config sets dateFormatOverride to yyyy-mm-dd for the Office parsers
+        TikaConfig tikaConfig = new TikaConfig(
+                this.getClass().getResourceAsStream("tika-config-custom-date-override.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+        String xml = getXML("testEXCEL_dateFormats.xlsx", p).xml;
+        assertContains("2018-09-20", xml);
+        assertContains("1996-08-10", xml);
+    }
 }
 
 
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
new file mode 100644
index 0000000..b36054e
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
new file mode 100644
index 0000000..b36054e
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls
new file mode 100644
index 0000000..d2f4a52
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls differ
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx
new file mode 100644
index 0000000..9c2e223
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx differ