You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/02 21:21:01 UTC

[tika] branch master updated (5b27a99 -> 0cdbbcb)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 5b27a99  TIKA-3058 process page-level xmp if it exists
     new 5a093aa  TIKA-3035 -- tika-app's -z option should report progress to stdout not stderr
     new 0cdbbcb  TIKA-3042 -- allow custom overriding of date formats in .xls and .xlsx

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../src/main/java/org/apache/tika/cli/TikaCLI.java    |   2 +-
 .../test/java/org/apache/tika/cli/TikaCLITest.java    |   2 +-
 .../tika/parser/microsoft/AbstractOfficeParser.java   |   5 +++++
 .../apache/tika/parser/microsoft/ExcelExtractor.java  |  18 +++++++++---------
 .../tika/parser/microsoft/OfficeParserConfig.java     |  18 ++++++++++++++++++
 .../tika/parser/microsoft/TikaExcelDataFormatter.java |  18 ++++++++++++++++++
 .../microsoft/ooxml/XSSFExcelExtractorDecorator.java  |   4 ++++
 .../apache/tika/parser/microsoft/ExcelParserTest.java |  15 +++++++++++++--
 .../tika/parser/microsoft/ooxml/OOXMLParserTest.java  |  11 +++++++++++
 .../tika-config-custom-date-override.xml}             |   4 ++--
 ...netic.xml => tika-config-custom-date-override.xml} |   4 ++--
 .../test-documents/testEXCEL_dateFormats.xls          | Bin 0 -> 6144 bytes
 .../test-documents/testEXCEL_dateFormats.xlsx         | Bin 0 -> 8766 bytes
 13 files changed, 84 insertions(+), 17 deletions(-)
 copy tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/{tika-config-exclude-phonetic.xml => ooxml/tika-config-custom-date-override.xml} (88%)
 copy tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/{tika-config-exclude-phonetic.xml => tika-config-custom-date-override.xml} (88%)
 create mode 100644 tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls
 create mode 100644 tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx


[tika] 02/02: TIKA-3042 -- allow custom overriding of date formats in .xls and .xlsx

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 0cdbbcb45e2fde2963bcf65d485cd7c414779ae9
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 2 16:20:37 2020 -0500

    TIKA-3042 -- allow custom overriding of date formats in .xls and .xlsx
---
 .../parser/microsoft/AbstractOfficeParser.java     |   5 ++++
 .../tika/parser/microsoft/ExcelExtractor.java      |  18 ++++++------
 .../tika/parser/microsoft/OfficeParserConfig.java  |  18 ++++++++++++
 .../parser/microsoft/TikaExcelDataFormatter.java   |  18 ++++++++++++
 .../ooxml/XSSFExcelExtractorDecorator.java         |   4 +++
 .../tika/parser/microsoft/ExcelParserTest.java     |  15 ++++++++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  11 +++++++
 .../ooxml/tika-config-custom-date-override.xml     |  32 +++++++++++++++++++++
 .../microsoft/tika-config-custom-date-override.xml |  32 +++++++++++++++++++++
 .../test-documents/testEXCEL_dateFormats.xls       | Bin 0 -> 6144 bytes
 .../test-documents/testEXCEL_dateFormats.xlsx      | Bin 0 -> 8766 bytes
 11 files changed, 142 insertions(+), 11 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index 83d8a64..3c51afa 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -145,4 +145,9 @@ public abstract class AbstractOfficeParser extends AbstractParser {
     public void setByteArrayMaxOverride(int maxOverride) {
         IOUtils.setByteArrayMaxOverride(maxOverride);
     }
+
+    @Field
+    public void setDateFormatOverride(String format) {
+        defaultOfficeParserConfig.setDateOverrideFormat(format);
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 3ccd019..ddf6cd4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -66,6 +66,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
@@ -285,6 +286,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             this.formatListener = new TikaFormatTrackingHSSFListener(this, locale);
             this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale);
             this.officeParserConfig = officeParserConfig;
+
+            this.tikaExcelDataFormatter.setDateFormatOverride(officeParserConfig.getDateFormatOverride());
         }
 
         /**
@@ -676,12 +679,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             }
         }
         private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener {
-            //TIKA-2025 -- use this to preserve large numbers in "General" format
-            //against the MS spec.
-            final TikaExcelGeneralFormat generalFormat;
             public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) {
                 super(childListener, locale);
-                generalFormat = new TikaExcelGeneralFormat(locale);
             }
 
             @Override
@@ -693,9 +692,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             @Override
             public String formatNumberDateCell(CellValueRecordInterface cell) {
                 String formatString = this.getFormatString(cell);
-                if (formatString != null && ! formatString.equals("General")) {
-                    return super.formatNumberDateCell(cell);
-                }
 
                 double value;
                 if(cell instanceof NumberRecord) {
@@ -704,10 +700,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                     if(!(cell instanceof FormulaRecord)) {
                         throw new IllegalArgumentException("Unsupported CellValue Record passed in " + cell);
                     }
-
                     value = ((FormulaRecord)cell).getValue();
                 }
-                return generalFormat.format(value);
+                if (DateUtil.isADateFormat(getFormatIndex(cell), formatString)) {
+                    return tikaExcelDataFormatter.formatRawCellContents(value, getFormatIndex(cell), formatString, false);
+                } else if ("general".equalsIgnoreCase(formatString)) {
+                    return tikaExcelDataFormatter.formatRawCellContents(value, getFormatIndex(cell), formatString, false);
+                }
+                return super.formatNumberDateCell(cell);
             }
         }
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index d2bc790..53498b1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -36,6 +36,8 @@ public class OfficeParserConfig implements Serializable {
     private boolean useSAXPptxExtractor = false;
     private boolean extractAllAlternativesFromMSG;
 
+    private String dateOverrideFormat = null;
+
     /**
      * Sets whether or not MSOffice parsers should extract macros.
      * As of Tika 1.15, the default is <code>false</code>.
@@ -244,6 +246,22 @@ public class OfficeParserConfig implements Serializable {
     public void setIncludeSlideMasterContent(boolean includeSlideMasterContent) {
         this.includeSlideMasterContent = includeSlideMasterContent;
     }
+
+    public String getDateFormatOverride() {
+        return dateOverrideFormat;
+    }
+
+    /**
+     * A user may wish to override the date formats in xls and xlsx files.
+     * For example, a user might prefer 'yyyy-mm-dd' to 'mm/dd/yy'.
+     *
+     * Note: these formats are "Excel formats" not Java's SimpleDateFormat
+     *
+     * @param format
+     */
+    public void setDateOverrideFormat(String format) {
+        this.dateOverrideFormat = format;
+    }
 }
 
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
index 7144d73..4192381 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft;
 import java.util.Locale;
 
 import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.poi.util.LocaleUtil;
 
 /**
@@ -28,6 +29,8 @@ import org.apache.poi.util.LocaleUtil;
  */
 public class TikaExcelDataFormatter extends DataFormatter {
 
+    private String dateOverrideFormatString;
+
     public TikaExcelDataFormatter() {
         this(LocaleUtil.getUserLocale());
     }
@@ -38,4 +41,19 @@ public class TikaExcelDataFormatter extends DataFormatter {
         addFormat("general", new TikaExcelGeneralFormat(locale));
     }
 
+    @Override
+    public String formatRawCellContents(double value, int formatIndex, String formatString, boolean use1904Windowing) {
+        if (DateUtil.isADateFormat(formatIndex, formatString)) {
+            String activeDateFormatString = (dateOverrideFormatString == null) ? formatString : dateOverrideFormatString;
+            return super.formatRawCellContents(value, formatIndex, activeDateFormatString, use1904Windowing);
+        } else {
+            return super.formatRawCellContents(value, formatIndex, formatString, use1904Windowing);
+        }
+    }
+
+    public void setDateFormatOverride(String dateOverrideFormat) {
+        if (dateOverrideFormat != null && dateOverrideFormat.trim().length() > 0) {
+            this.dateOverrideFormatString = dateOverrideFormat;
+        }
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 3b88522..1f906ba 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -98,6 +98,10 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
         } else {
             formatter = new TikaExcelDataFormatter(locale);
         }
+        OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);
+        if (officeParserConfig != null) {
+            ((TikaExcelDataFormatter)formatter).setDateFormatOverride(officeParserConfig.getDateFormatOverride());
+        }
     }
 
     protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 88afba3..7fb8fb8 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -38,6 +38,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
@@ -488,8 +489,8 @@ public class ExcelParserTest extends TikaTest {
         Locale locale = LocaleUtil.getUserLocale();
         DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
         //16 digit number is treated as scientific notation as is the 16 digit formula
-        assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E15</td>\t"+
-                "<td>1"+symbols.getDecimalSeparator()+"23456789012345E15", xml);
+        assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E+15</td>\t"+
+                "<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml);
     }
 
     @Test
@@ -575,4 +576,14 @@ public class ExcelParserTest extends TikaTest {
         String xml = getXML("testEXCEL_WORKBOOK_in_capitals.xls").xml;
         assertContains("Inventarliste", xml);
     }
+
+    @Test
+    public void testDateFormat() throws Exception {
+        TikaConfig tikaConfig = new TikaConfig(
+                this.getClass().getResourceAsStream("tika-config-custom-date-override.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+        String xml = getXML("testEXCEL_dateFormats.xls", p).xml;
+        assertContains("2018-09-20", xml);
+        assertContains("1996-08-10", xml);
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 0cdb4dc..66eb680 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1768,6 +1768,17 @@ public class OOXMLParserTest extends TikaTest {
         pc.set(OfficeParserConfig.class, c);
         getRecursiveMetadata("testWORD_truncated.docx", pc);
     }
+
+    @Test
+    public void testDateFormat() throws Exception {
+        TikaConfig tikaConfig = new TikaConfig(
+                this.getClass().getResourceAsStream("tika-config-custom-date-override.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+        String xml = getXML("testEXCEL_dateFormats.xlsx", p).xml;
+        System.out.println(xml);
+        assertContains("2018-09-20", xml);
+        assertContains("1996-08-10", xml);
+    }
 }
 
 
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
new file mode 100644
index 0000000..b36054e
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
new file mode 100644
index 0000000..b36054e
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="dateFormatOverride" type="string">yyyy-mm-dd</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls
new file mode 100644
index 0000000..d2f4a52
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xls differ
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx
new file mode 100644
index 0000000..9c2e223
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_dateFormats.xlsx differ


[tika] 01/02: TIKA-3035 -- tika-app's -z option should report progress to stdout not stderr

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5a093aa34a0abfb158f2801e69245d3501bbdee5
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 2 13:47:59 2020 -0500

    TIKA-3035 -- tika-app's -z option should report progress to stdout not stderr
---
 tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java     | 2 +-
 tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index bb3f91a..60cb05e 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -1051,7 +1051,7 @@ public class TikaCLI {
                     throw new IOException("unable to create directory \"" + parent + "\"");
                 }
             }
-            System.err.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);
+            System.out.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);
 
             try (FileOutputStream os = new FileOutputStream(outputFile)) {
                 if (inputStream instanceof TikaInputStream) {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index ab17b68..6f598f5 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -408,7 +408,7 @@ public class TikaCLITest {
         new File("subdir/foo.txt").delete();
         new File("subdir").delete();
         TikaCLI.main(params);
-        String content = errContent.toString(UTF_8.name());
+        String content = outContent.toString(UTF_8.name());
         assertTrue(content.contains("Extracting 'subdir/foo.txt'"));
         // clean up. TODO: These should be in target.
         new File("target/subdir/foo.txt").delete();