You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/02 23:09:21 UTC

[tika] branch branch_1x updated (11d99c3 -> 5276459)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 11d99c3  TIKA-3055 -- add an optional Preflight parser for PDFs -- fix bundle
     new d645ae7  TIKA-3058 process page-level xmp if it exists
     new 1c3ceac  TIKA-3035 -- tika-app's -z option should report progress to stdout not stderr
     new 5276459  TIKA-3042 -- allow custom overriding of date formats in .xls and .xlsx

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   2 +-
 .../test/java/org/apache/tika/cli/TikaCLITest.java |   2 +-
 .../main/java/org/apache/tika/metadata/PDF.java    |   6 +++
 .../parser/microsoft/AbstractOfficeParser.java     |   5 ++
 .../tika/parser/microsoft/ExcelExtractor.java      |  18 +++----
 .../tika/parser/microsoft/OfficeParserConfig.java  |  18 +++++++
 .../parser/microsoft/TikaExcelDataFormatter.java   |  18 +++++++
 .../ooxml/XSSFExcelExtractorDecorator.java         |   4 ++
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  52 +++++++++++++++------
 .../tika/parser/microsoft/ExcelParserTest.java     |  15 +++++-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  11 +++++
 .../tika-config-custom-date-override.xml}          |   4 +-
 ...ic.xml => tika-config-custom-date-override.xml} |   4 +-
 .../test-documents/testEXCEL_dateFormats.xls       | Bin 0 -> 6144 bytes
 .../test-documents/testEXCEL_dateFormats.xlsx      | Bin 0 -> 8766 bytes
 15 files changed, 128 insertions(+), 31 deletions(-)
 copy tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/{tika-config-exclude-phonetic.xml => ooxml/tika-config-custom-date-override.xml} (88%)
 copy tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/{tika-config-exclude-phonetic.xml => tika-config-custom-date-override.xml} (88%)
 create mode 100644 tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls
 create mode 100644 tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx


[tika] 02/03: TIKA-3035 -- tika-app's -z option should report progress to stdout not stderr

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1c3ceac4bfb966a61bd08bc7f22b4fb8e43f6499
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 2 13:47:59 2020 -0500

    TIKA-3035 -- tika-app's -z option should report progress to stdout not stderr
---
 tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java     | 2 +-
 tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index f837e3c..8077114 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -1066,7 +1066,7 @@ public class TikaCLI {
                     throw new IOException("unable to create directory \"" + parent + "\"");
                 }
             }
-            System.err.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);
+            System.out.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);
 
             try (FileOutputStream os = new FileOutputStream(outputFile)) {
                 if (inputStream instanceof TikaInputStream) {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 2fd3eaf..2790690 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -408,7 +408,7 @@ public class TikaCLITest {
         new File("subdir/foo.txt").delete();
         new File("subdir").delete();
         TikaCLI.main(params);
-        String content = errContent.toString(UTF_8.name());
+        String content = outContent.toString(UTF_8.name());
         assertTrue(content.contains("Extracting 'subdir/foo.txt'"));
         // clean up. TODO: These should be in target.
         new File("target/subdir/foo.txt").delete();


[tika] 01/03: TIKA-3058 process page-level xmp if it exists

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d645ae723b4d3df5a3bc559f420e58a8f3be8c49
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 2 13:42:07 2020 -0500

    TIKA-3058 process page-level xmp if it exists
---
 .../main/java/org/apache/tika/metadata/PDF.java    |  6 +++
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 52 ++++++++++++++++------
 2 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 0220948..608d5df 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -89,6 +89,12 @@ public interface PDF {
     Property HAS_XMP = Property.internalBoolean(PDF_PREFIX+"hasXMP");
 
     /**
+     * If xmp is extracted by, e.g. the XMLProfiler, where did it come from?
+     * The document catalog or a specific page...or?
+     */
+    Property XMP_LOCATION = Property.internalText(PDF_PREFIX+"xmpLocation");
+
+    /**
      * Has > 0 AcroForm fields
      */
     Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX+"hasAcroFormFields");
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 2e58123..c2eb77f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -100,6 +100,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.parser.sas.SAS7BDATParser;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
@@ -146,6 +147,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
     private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
 
+    public static final String XMP_DOCUMENT_CATALOG_LOCATION = "documentCatalog";
+    public static final String XMP_PAGE_LOCATION_PREFIX = "page ";
+
     /**
      * Format used for signature dates
      * TODO Make this thread-safe
@@ -202,25 +206,26 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             supportedTypes = embeddedParser.getSupportedTypes(context);
         }
 
-        if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
-            Metadata xmpMetadata = new Metadata();
-            xmpMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
-            xmpMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
-            if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata) &&
-                    supportedTypes.contains(XMP_MEDIA_TYPE)) {
-                InputStream is = null;
-                try {
-                    is = pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata();
+        if (supportedTypes.contains(XMP_MEDIA_TYPE)) {
+            //try the main metadata
+            if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
+                try (InputStream is = pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
+                    extractXMPAsEmbeddedFile(is, XMP_DOCUMENT_CATALOG_LOCATION);
                 } catch (IOException e) {
                     EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                 }
-                if (is != null) {
-                    try {
-                        parseMetadata(is, xmpMetadata);
-                    } finally {
-                        org.apache.tika.io.IOUtils.closeQuietly(is);
+            }
+            //now iterate through the pages
+            int pageNumber = 1;
+            for (PDPage page : pdfDocument.getPages()) {
+                if (page.getMetadata() != null) {
+                    try (InputStream is = page.getMetadata().exportXMPMetadata()) {
+                        extractXMPAsEmbeddedFile(is, XMP_PAGE_LOCATION_PREFIX+pageNumber);
+                    } catch (IOException e) {
+                        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                     }
                 }
+                pageNumber++;
             }
         }
 
@@ -248,6 +253,24 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    private void extractXMPAsEmbeddedFile(InputStream is, String location) throws IOException, SAXException {
+        if (is == null) {
+            return;
+        }
+        Metadata xmpMetadata = new Metadata();
+        xmpMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
+        xmpMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
+        xmpMetadata.set(PDF.XMP_LOCATION, location);
+        if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata)) {
+            try {
+                parseMetadata(is, xmpMetadata);
+            } finally {
+                org.apache.tika.io.IOUtils.closeQuietly(is);
+            }
+        }
+
+    }
+
     private void parseMetadata(InputStream stream, Metadata embeddedMetadata) throws IOException, SAXException {
         try {
             embeddedDocumentExtractor.parseEmbedded(
@@ -441,6 +464,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
                 unmappedUnicodeCharsPerPage);
 
+
         try {
             for (PDAnnotation annotation : page.getAnnotations()) {
 


[tika] 03/03: TIKA-3042 -- allow custom overriding of date formats in .xls and .xlsx

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5276459dd3d52a41a4aaafe2d074c306d2d11b36
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 2 16:20:37 2020 -0500

    TIKA-3042 -- allow custom overriding of date formats in .xls and .xlsx
---
 .../parser/microsoft/AbstractOfficeParser.java     |   5 ++++
 .../tika/parser/microsoft/ExcelExtractor.java      |  18 ++++++------
 .../tika/parser/microsoft/OfficeParserConfig.java  |  18 ++++++++++++
 .../parser/microsoft/TikaExcelDataFormatter.java   |  18 ++++++++++++
 .../ooxml/XSSFExcelExtractorDecorator.java         |   4 +++
 .../tika/parser/microsoft/ExcelParserTest.java     |  15 ++++++++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  11 +++++++
 .../ooxml/tika-config-custom-date-override.xml     |  32 +++++++++++++++++++++
 .../microsoft/tika-config-custom-date-override.xml |  32 +++++++++++++++++++++
 .../test-documents/testEXCEL_dateFormats.xls       | Bin 0 -> 6144 bytes
 .../test-documents/testEXCEL_dateFormats.xlsx      | Bin 0 -> 8766 bytes
 11 files changed, 142 insertions(+), 11 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index 83d8a64..3c51afa 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -145,4 +145,9 @@ public abstract class AbstractOfficeParser extends AbstractParser {
     public void setByteArrayMaxOverride(int maxOverride) {
         IOUtils.setByteArrayMaxOverride(maxOverride);
     }
+
+    @Field
+    public void setDateFormatOverride(String format) {
+        defaultOfficeParserConfig.setDateOverrideFormat(format);
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 3ccd019..ddf6cd4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -66,6 +66,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
@@ -285,6 +286,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             this.formatListener = new TikaFormatTrackingHSSFListener(this, locale);
             this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale);
             this.officeParserConfig = officeParserConfig;
+
+            this.tikaExcelDataFormatter.setDateFormatOverride(officeParserConfig.getDateFormatOverride());
         }
 
         /**
@@ -676,12 +679,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             }
         }
         private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener {
-            //TIKA-2025 -- use this to preserve large numbers in "General" format
-            //against the MS spec.
-            final TikaExcelGeneralFormat generalFormat;
             public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) {
                 super(childListener, locale);
-                generalFormat = new TikaExcelGeneralFormat(locale);
             }
 
             @Override
@@ -693,9 +692,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             @Override
             public String formatNumberDateCell(CellValueRecordInterface cell) {
                 String formatString = this.getFormatString(cell);
-                if (formatString != null && ! formatString.equals("General")) {
-                    return super.formatNumberDateCell(cell);
-                }
 
                 double value;
                 if(cell instanceof NumberRecord) {
@@ -704,10 +700,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                     if(!(cell instanceof FormulaRecord)) {
                         throw new IllegalArgumentException("Unsupported CellValue Record passed in " + cell);
                     }
-
                     value = ((FormulaRecord)cell).getValue();
                 }
-                return generalFormat.format(value);
+                if (DateUtil.isADateFormat(getFormatIndex(cell), formatString)) {
+                    return tikaExcelDataFormatter.formatRawCellContents(value, getFormatIndex(cell), formatString, false);
+                } else if ("general".equalsIgnoreCase(formatString)) {
+                    return tikaExcelDataFormatter.formatRawCellContents(value, getFormatIndex(cell), formatString, false);
+                }
+                return super.formatNumberDateCell(cell);
             }
         }
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index d2bc790..53498b1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -36,6 +36,8 @@ public class OfficeParserConfig implements Serializable {
     private boolean useSAXPptxExtractor = false;
     private boolean extractAllAlternativesFromMSG;
 
+    private String dateOverrideFormat = null;
+
     /**
      * Sets whether or not MSOffice parsers should extract macros.
      * As of Tika 1.15, the default is <code>false</code>.
@@ -244,6 +246,22 @@ public class OfficeParserConfig implements Serializable {
     public void setIncludeSlideMasterContent(boolean includeSlideMasterContent) {
         this.includeSlideMasterContent = includeSlideMasterContent;
     }
+
+    public String getDateFormatOverride() {
+        return dateOverrideFormat;
+    }
+
+    /**
+     * A user may wish to override the date formats in xls and xlsx files.
+     * For example, a user might prefer 'yyyy-mm-dd' to 'mm/dd/yy'.
+     *
+     * Note: these formats are "Excel formats" not Java's SimpleDateFormat
+     *
+     * @param format
+     */
+    public void setDateOverrideFormat(String format) {
+        this.dateOverrideFormat = format;
+    }
 }
 
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
index 7144d73..4192381 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft;
 import java.util.Locale;
 
 import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.poi.util.LocaleUtil;
 
 /**
@@ -28,6 +29,8 @@ import org.apache.poi.util.LocaleUtil;
  */
 public class TikaExcelDataFormatter extends DataFormatter {
 
+    private String dateOverrideFormatString;
+
     public TikaExcelDataFormatter() {
         this(LocaleUtil.getUserLocale());
     }
@@ -38,4 +41,19 @@ public class TikaExcelDataFormatter extends DataFormatter {
         addFormat("general", new TikaExcelGeneralFormat(locale));
     }
 
+    @Override
+    public String formatRawCellContents(double value, int formatIndex, String formatString, boolean use1904Windowing) {
+        if (DateUtil.isADateFormat(formatIndex, formatString)) {
+            String activeDateFormatString = (dateOverrideFormatString == null) ? formatString : dateOverrideFormatString;
+            return super.formatRawCellContents(value, formatIndex, activeDateFormatString, use1904Windowing);
+        } else {
+            return super.formatRawCellContents(value, formatIndex, formatString, use1904Windowing);
+        }
+    }
+
+    public void setDateFormatOverride(String dateOverrideFormat) {
+        if (dateOverrideFormat != null && dateOverrideFormat.trim().length() > 0) {
+            this.dateOverrideFormatString = dateOverrideFormat;
+        }
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index a9f747a..d78240e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -99,6 +99,10 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
         } else {
             formatter = new TikaExcelDataFormatter(locale);
         }
+        OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);
+        if (officeParserConfig != null) {
+            ((TikaExcelDataFormatter)formatter).setDateFormatOverride(officeParserConfig.getDateFormatOverride());
+        }
     }
 
     protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index c6b79b6..741398f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -38,6 +38,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
@@ -492,8 +493,8 @@ public class ExcelParserTest extends TikaTest {
         Locale locale = LocaleUtil.getUserLocale();
         DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
         //16 digit number is treated as scientific notation as is the 16 digit formula
-        assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E15</td>\t"+
-                "<td>1"+symbols.getDecimalSeparator()+"23456789012345E15", xml);
+        assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E+15</td>\t"+
+                "<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml);
     }
 
     @Test
@@ -579,4 +580,14 @@ public class ExcelParserTest extends TikaTest {
         String xml = getXML("testEXCEL_WORKBOOK_in_capitals.xls").xml;
         assertContains("Inventarliste", xml);
     }
+
+    @Test
+    public void testDateFormat() throws Exception {
+        TikaConfig tikaConfig = new TikaConfig(
+                this.getClass().getResourceAsStream("tika-config-custom-date-override.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+        String xml = getXML("testEXCEL_dateFormats.xls", p).xml;
+        assertContains("2018-09-20", xml);
+        assertContains("1996-08-10", xml);
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 3fb3f98..09b0d25 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1776,6 +1776,17 @@ public class OOXMLParserTest extends TikaTest {
         pc.set(OfficeParserConfig.class, c);
         getRecursiveMetadata("testWORD_truncated.docx", pc);
     }
+
+    @Test
+    public void testDateFormat() throws Exception {
+        TikaConfig tikaConfig = new TikaConfig(
+                this.getClass().getResourceAsStream("tika-config-custom-date-override.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+        String xml = getXML("testEXCEL_dateFormats.xlsx", p).xml;
+        System.out.println(xml);
+        assertContains("2018-09-20", xml);
+        assertContains("1996-08-10", xml);
+    }
 }
 
 
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
new file mode 100644
index 0000000..b36054e
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
new file mode 100644
index 0000000..b36054e
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls
new file mode 100644
index 0000000..d2f4a52
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls differ
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx
new file mode 100644
index 0000000..9c2e223
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx differ