You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/02 21:21:03 UTC
[tika] 02/02: TIKA-3042 -- allow custom overriding of date formats
in .xls and .xlsx
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0cdbbcb45e2fde2963bcf65d485cd7c414779ae9
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 2 16:20:37 2020 -0500
TIKA-3042 -- allow custom overriding of date formats in .xls and .xlsx
---
.../parser/microsoft/AbstractOfficeParser.java | 5 ++++
.../tika/parser/microsoft/ExcelExtractor.java | 18 ++++++------
.../tika/parser/microsoft/OfficeParserConfig.java | 18 ++++++++++++
.../parser/microsoft/TikaExcelDataFormatter.java | 18 ++++++++++++
.../ooxml/XSSFExcelExtractorDecorator.java | 4 +++
.../tika/parser/microsoft/ExcelParserTest.java | 15 ++++++++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 11 +++++++
.../ooxml/tika-config-custom-date-override.xml | 32 +++++++++++++++++++++
.../microsoft/tika-config-custom-date-override.xml | 32 +++++++++++++++++++++
.../test-documents/testEXCEL_dateFormats.xls | Bin 0 -> 6144 bytes
.../test-documents/testEXCEL_dateFormats.xlsx | Bin 0 -> 8766 bytes
11 files changed, 142 insertions(+), 11 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index 83d8a64..3c51afa 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -145,4 +145,9 @@ public abstract class AbstractOfficeParser extends AbstractParser {
public void setByteArrayMaxOverride(int maxOverride) {
IOUtils.setByteArrayMaxOverride(maxOverride);
}
+ /** Stores the given Excel date format override on {@code defaultOfficeParserConfig}. */
+ @Field
+ public void setDateFormatOverride(String format) {
+ defaultOfficeParserConfig.setDateOverrideFormat(format);
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 3ccd019..ddf6cd4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -66,6 +66,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -285,6 +286,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
this.formatListener = new TikaFormatTrackingHSSFListener(this, locale);
this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale);
this.officeParserConfig = officeParserConfig;
+
+ this.tikaExcelDataFormatter.setDateFormatOverride(officeParserConfig.getDateFormatOverride());
}
/**
@@ -676,12 +679,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
}
}
private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener {
- //TIKA-2025 -- use this to preserve large numbers in "General" format
- //against the MS spec.
- final TikaExcelGeneralFormat generalFormat;
public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) {
super(childListener, locale);
- generalFormat = new TikaExcelGeneralFormat(locale);
}
@Override
@@ -693,9 +692,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
@Override
public String formatNumberDateCell(CellValueRecordInterface cell) {
String formatString = this.getFormatString(cell);
- if (formatString != null && ! formatString.equals("General")) {
- return super.formatNumberDateCell(cell);
- }
double value;
if(cell instanceof NumberRecord) {
@@ -704,10 +700,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
if(!(cell instanceof FormulaRecord)) {
throw new IllegalArgumentException("Unsupported CellValue Record passed in " + cell);
}
-
value = ((FormulaRecord)cell).getValue();
}
- return generalFormat.format(value);
+ if (DateUtil.isADateFormat(getFormatIndex(cell), formatString)) {
+ return tikaExcelDataFormatter.formatRawCellContents(value, getFormatIndex(cell), formatString, false);
+ } else if ("general".equalsIgnoreCase(formatString)) {
+ return tikaExcelDataFormatter.formatRawCellContents(value, getFormatIndex(cell), formatString, false);
+ }
+ return super.formatNumberDateCell(cell);
}
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index d2bc790..53498b1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -36,6 +36,8 @@ public class OfficeParserConfig implements Serializable {
private boolean useSAXPptxExtractor = false;
private boolean extractAllAlternativesFromMSG;
+ private String dateOverrideFormat = null;
+
/**
* Sets whether or not MSOffice parsers should extract macros.
* As of Tika 1.15, the default is <code>false</code>.
@@ -244,6 +246,22 @@ public class OfficeParserConfig implements Serializable {
public void setIncludeSlideMasterContent(boolean includeSlideMasterContent) {
this.includeSlideMasterContent = includeSlideMasterContent;
}
+ /** @return the date format override stored by setDateOverrideFormat, or null if none was set */
+ public String getDateFormatOverride() {
+ return dateOverrideFormat;
+ }
+
+ /**
+ * Sets a format that replaces the date formats found in xls and xlsx files.
+ * For example, a user might prefer 'yyyy-mm-dd' to 'mm/dd/yy'.
+ *
+ * Note: the value is an "Excel format" (e.g. 'mm' is the month in 'yyyy-mm-dd'),
+ * not a Java SimpleDateFormat pattern. Read it back with {@link #getDateFormatOverride()}.
+ * @param format the Excel date format to use, or <code>null</code> (the default) for no override
+ */
+ public void setDateOverrideFormat(String format) {
+ this.dateOverrideFormat = format;
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
index 7144d73..4192381 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft;
import java.util.Locale;
import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.util.LocaleUtil;
/**
@@ -28,6 +29,8 @@ import org.apache.poi.util.LocaleUtil;
*/
public class TikaExcelDataFormatter extends DataFormatter {
+ private String dateOverrideFormatString;
+
public TikaExcelDataFormatter() {
this(LocaleUtil.getUserLocale());
}
@@ -38,4 +41,19 @@ public class TikaExcelDataFormatter extends DataFormatter {
addFormat("general", new TikaExcelGeneralFormat(locale));
}
+ /** Formats a numeric cell, substituting the configured override when the cell's own format is a date format. */
+ @Override
+ public String formatRawCellContents(double value, int formatIndex, String formatString, boolean use1904Windowing) {
+     String effectiveFormat = formatString;
+     if (DateUtil.isADateFormat(formatIndex, formatString) && dateOverrideFormatString != null) {
+         effectiveFormat = dateOverrideFormatString;
+     }
+     return super.formatRawCellContents(value, formatIndex, effectiveFormat, use1904Windowing);
+ }
+
+ /** Installs a date format override; null or whitespace-only values are ignored and change nothing. */
+ public void setDateFormatOverride(String dateOverrideFormat) {
+     if (dateOverrideFormat == null || dateOverrideFormat.trim().isEmpty()) { return; }
+     this.dateOverrideFormatString = dateOverrideFormat;
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 3b88522..1f906ba 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -98,6 +98,10 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
} else {
formatter = new TikaExcelDataFormatter(locale);
}
+ OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);
+ if (officeParserConfig != null) {
+ ((TikaExcelDataFormatter)formatter).setDateFormatOverride(officeParserConfig.getDateFormatOverride());
+ }
}
protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 88afba3..7fb8fb8 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -38,6 +38,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
@@ -488,8 +489,8 @@ public class ExcelParserTest extends TikaTest {
Locale locale = LocaleUtil.getUserLocale();
DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
//16 digit number is treated as scientific notation as is the 16 digit formula
- assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E15</td>\t"+
- "<td>1"+symbols.getDecimalSeparator()+"23456789012345E15", xml);
+ assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E+15</td>\t"+
+ "<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml);
}
@Test
@@ -575,4 +576,14 @@ public class ExcelParserTest extends TikaTest {
String xml = getXML("testEXCEL_WORKBOOK_in_capitals.xls").xml;
assertContains("Inventarliste", xml);
}
+
+ @Test
+ public void testDateFormat() throws Exception {
+     // tika-config-custom-date-override.xml sets dateFormatOverride to yyyy-mm-dd for the Office parsers
+     Parser overridingParser = new AutoDetectParser(new TikaConfig(
+             getClass().getResourceAsStream("tika-config-custom-date-override.xml")));
+     String extracted = getXML("testEXCEL_dateFormats.xls", overridingParser).xml;
+     assertContains("2018-09-20", extracted);
+     assertContains("1996-08-10", extracted);
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 0cdb4dc..66eb680 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1768,6 +1768,17 @@ public class OOXMLParserTest extends TikaTest {
pc.set(OfficeParserConfig.class, c);
getRecursiveMetadata("testWORD_truncated.docx", pc);
}
+
+ @Test
+ public void testDateFormat() throws Exception {
+     // tika-config-custom-date-override.xml sets dateFormatOverride to yyyy-mm-dd for the OOXML parser
+     TikaConfig tikaConfig = new TikaConfig(
+             this.getClass().getResourceAsStream("tika-config-custom-date-override.xml"));
+     Parser p = new AutoDetectParser(tikaConfig);
+     String xml = getXML("testEXCEL_dateFormats.xlsx", p).xml;
+     assertContains("2018-09-20", xml);
+     assertContains("1996-08-10", xml);
+ }
}
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
new file mode 100644
index 0000000..b36054e
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
new file mode 100644
index 0000000..b36054e
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls
new file mode 100644
index 0000000..d2f4a52
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls differ
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx
new file mode 100644
index 0000000..9c2e223
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx differ