You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/09 13:14:58 UTC

[tika] branch TIKA-2552 updated: TIKA-2552 -- Add back the 1.x metadata variants

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-2552
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-2552 by this push:
     new 6fbc59a  TIKA-2552 -- Add back the 1.x metadata variants
6fbc59a is described below

commit 6fbc59a696b8254fde57e79130daa54837e0f136
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Aug 9 09:14:44 2018 -0400

    TIKA-2552 -- Add back the 1.x metadata variants
---
 .../test/java/org/apache/tika/cli/TikaCLITest.java |  4 +-
 .../parser/microsoft/ooxml/MetadataExtractor.java  | 83 +++++++++++++++-------
 .../parser/microsoft/PowerPointParserTest.java     |  6 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    | 17 +++--
 .../parser/microsoft/ooxml/SXSLFExtractorTest.java |  6 +-
 .../parser/microsoft/ooxml/SXWPFExtractorTest.java | 10 ++-
 .../tika/server/RecursiveMetadataResourceTest.java |  2 +-
 7 files changed, 85 insertions(+), 43 deletions(-)

diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 4ac6074..8804b0c 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -471,12 +471,12 @@ public class TikaCLITest {
         String[] params = new String[]{"-m", "-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
         TikaCLI.main(params);
         String content = outContent.toString(UTF_8.name());
-       /* assertTrue(content.contains("[\n" +
+        assertTrue(content.contains("[\n" +
                 "  {\n" +
                 "    \"Application-Name\": \"Microsoft Office Word\",\n" +
                 "    \"Application-Version\": \"15.0000\",\n" +
                 "    \"Character Count\": \"28\",\n" +
-                "    \"Character-Count-With-Spaces\": \"31\","));*/
+                "    \"Character-Count-With-Spaces\": \"31\","));
         assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"/embed1.zip\""));
         assertFalse(content.contains("X-TIKA:content"));
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 09556b0..e5da8ce 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -16,17 +16,12 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
-import java.math.BigDecimal;
-import java.util.Date;
-import java.util.Optional;
-
 import org.apache.poi.ooxml.POIXMLProperties;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
-import org.apache.poi.openxml4j.util.Nullable;
-import org.apache.poi.ss.formula.functions.T;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -42,6 +37,10 @@ import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
 import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
 import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
 
+import java.math.BigDecimal;
+import java.util.Date;
+import java.util.Optional;
+
 /**
  * OOXML metadata extractor.
  * <p/>
@@ -87,7 +86,7 @@ public class MetadataExtractor {
                 .getIdentifierProperty());
         addProperty(metadata, OfficeOpenXMLCore.SUBJECT,
                 propsHolder.getSubjectProperty());
-        addProperty(metadata, Office.KEYWORDS, propsHolder
+        addProperty(metadata, TikaCoreProperties.KEYWORDS, propsHolder
                 .getKeywordsProperty());
         setProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder
                 .getLanguageProperty());
@@ -99,10 +98,20 @@ public class MetadataExtractor {
                 .getModifiedProperty());
         setProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder
                 .getRevisionProperty());
-
+        // TODO: Move to OO subject in Tika 2.0
+        setProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
+                propsHolder.getSubjectProperty());
         setProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty());
         setProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty());
 
+        // Legacy Tika-1.0 style stats
+        // TODO Remove these in Tika 2.0
+        setProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty());
+        setProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
+                .getContentStatusProperty());
+        setProperty(metadata, Metadata.REVISION_NUMBER, propsHolder
+                .getRevisionProperty());
+        setProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty());
     }
 
     private void extractMetadata(POIXMLProperties.ExtendedProperties properties,
@@ -143,6 +152,23 @@ public class MetadataExtractor {
         setProperty(metadata, Office.WORD_COUNT, propsHolder.getWords());
         setProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters());
         setProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
+
+        // Legacy Tika-1.0 style stats
+        // TODO Remove these in Tika 2.0
+        setProperty(metadata, Metadata.APPLICATION_NAME, propsHolder.getApplication());
+        setProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder.getAppVersion());
+        setProperty(metadata, Metadata.MANAGER, propsHolder.getManager());
+        setProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
+        setProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
+        setProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
+        setProperty(metadata, Metadata.TOTAL_TIME, totalTime);
+        setProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages());
+        setProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides());
+        setProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs());
+        setProperty(metadata, MSOffice.LINE_COUNT, propsHolder.getLines());
+        setProperty(metadata, MSOffice.WORD_COUNT, propsHolder.getWords());
+        setProperty(metadata, MSOffice.CHARACTER_COUNT, propsHolder.getCharacters());
+        setProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
     }
 
     private void extractMetadata(POIXMLProperties.CustomProperties properties,
@@ -230,19 +256,19 @@ public class MetadataExtractor {
     }
 
     private <T> void setProperty(Metadata metadata, Property property, Optional<T> nullableValue) {
-        if (! nullableValue.isPresent()) {
+        if (!nullableValue.isPresent()) {
             return;
         }
         T value = nullableValue.get();
-            if (value instanceof Date) {
-                metadata.set(property, (Date) value);
-            } else if (value instanceof String) {
-                metadata.set(property, (String) value);
-            } else if (value instanceof Integer) {
-                metadata.set(property, (Integer) value);
-            } else if (value instanceof Double) {
-                metadata.set(property, (Double) value);
-            }
+        if (value instanceof Date) {
+            metadata.set(property, (Date) value);
+        } else if (value instanceof String) {
+            metadata.set(property, (String) value);
+        } else if (value instanceof Integer) {
+            metadata.set(property, (Integer) value);
+        } else if (value instanceof Double) {
+            metadata.set(property, (Double) value);
+        }
     }
 
     private <T> void addProperty(Metadata metadata, Property property, Optional<T> nullableValue) {
@@ -250,16 +276,19 @@ public class MetadataExtractor {
             return;
         }
         T value = nullableValue.get();
-            if (value instanceof String) {
-                metadata.add(property, (String) value);
-            } else {
-                throw new IllegalArgumentException("Can't add property of class: "+nullableValue.getClass());
-            }
+        if (value instanceof String) {
+            metadata.add(property, (String) value);
+        } else {
+            throw new IllegalArgumentException("Can't add property of class: " + nullableValue.getClass());
+        }
     }
-    private void setProperty(Metadata metadata, String name, Nullable<?> value) {
-        if (value.getValue() != null) {
-            setProperty(metadata, name, value.getValue().toString());
+
+    private void setProperty(Metadata metadata, String property, Optional<String> nullableValue) {
+        if (!nullableValue.isPresent()) {
+            return;
         }
+        String value = nullableValue.get();
+        metadata.set(property, value);
     }
 
     private void setProperty(Metadata metadata, Property property, String value) {
@@ -287,7 +316,7 @@ public class MetadataExtractor {
     }
 
     private void addMultiProperty(Metadata metadata, Property property, Optional<String> value) {
-        if (! value.isPresent()) {
+        if (!value.isPresent()) {
             return;
         }
         SummaryExtractor.addMulti(metadata, property, value.get());
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 45a7cb4..1d62efd 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -137,7 +137,7 @@ public class PowerPointParserTest extends TikaTest {
         // Make sure boilerplate text didn't come through:
         assertEquals(-1, content.indexOf("Click to edit Master"));
 
-        //TIKA-1171
+        //TIKA-1171, POI-62591
         //assertEquals(-1, content.indexOf("*"));
     }
 
@@ -161,7 +161,7 @@ public class PowerPointParserTest extends TikaTest {
         // Make sure boilerplate text didn't come through:
         assertEquals(-1, content.indexOf("Click to edit Master"));
 
-        //TIKA-1171
+        //TIKA-1171, POI-62591
         //assertEquals(-1, content.indexOf("*"));
     }
 
@@ -180,7 +180,7 @@ public class PowerPointParserTest extends TikaTest {
 
         // Make sure boilerplate text didn't come through:
         assertEquals(-1, content.indexOf("Click to edit Master"));
-        //TIKA-1171
+        //TIKA-1171, POI-62591
         //assertEquals(-1, content.indexOf("*"));
     }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 35e702f..a2eb625 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -679,11 +679,12 @@ public class OOXMLParserTest extends TikaTest {
 
         assertContains("Keyword1 Keyword2", content);
         assertEquals("Keyword1 Keyword2",
-                metadata.get(Office.KEYWORDS));
+                metadata.get(Metadata.KEYWORDS));
 
         assertContains("Subject is here", content);
         // TODO: Remove subject in Tika 2.0
-
+        assertEquals("Subject is here",
+                metadata.get(Metadata.SUBJECT));
         assertEquals("Subject is here",
                 metadata.get(OfficeOpenXMLCore.SUBJECT));
 
@@ -749,9 +750,12 @@ public class OOXMLParserTest extends TikaTest {
 
         assertContains("Keyword1 Keyword2", xml);
         assertEquals("Keyword1 Keyword2",
-                metadata.get(Office.KEYWORDS));
+                metadata.get(Metadata.KEYWORDS));
 
         assertContains("Subject is here", xml);
+        // TODO: Remove subject in Tika 2.0
+        assertEquals("Subject is here",
+                metadata.get(Metadata.SUBJECT));
         assertEquals("Subject is here",
                 metadata.get(OfficeOpenXMLCore.SUBJECT));
 
@@ -855,6 +859,7 @@ public class OOXMLParserTest extends TikaTest {
         assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.LAST_MODIFIED));
         assertEquals("2011-08-22T14:24:38Z", metadata.get(TikaCoreProperties.MODIFIED));
         assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.DATE));
+        assertEquals("Microsoft Excel", metadata.get(Metadata.APPLICATION_NAME));
         assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
         assertEquals("true", metadata.get("custom:myCustomBoolean"));
         assertEquals("3", metadata.get("custom:myCustomNumber"));
@@ -885,14 +890,16 @@ public class OOXMLParserTest extends TikaTest {
         assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
         assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
         assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+        assertEquals("Microsoft Office Word", metadata.get(Metadata.APPLICATION_NAME));
         assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
         assertEquals("1", metadata.get(Office.PAGE_COUNT));
         assertEquals("2", metadata.get(Office.WORD_COUNT));
         assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("My Keyword", metadata.get(Office.KEYWORDS));
+        assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
         assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
         // TODO: Remove subject in Tika 2.0
-//        assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+        assertEquals("My subject", metadata.get(Metadata.SUBJECT));
         assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
         assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
         assertEquals("true", metadata.get("custom:myCustomBoolean"));
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index 25fdcde..cb935c5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -337,10 +337,12 @@ public class SXSLFExtractorTest extends TikaTest {
 
         assertContains("Keyword1 Keyword2", xml);
         assertEquals("Keyword1 Keyword2",
-                metadata.get(Office.KEYWORDS));
+                metadata.get(Metadata.KEYWORDS));
 
         assertContains("Subject is here", xml);
-
+        // TODO: Remove subject in Tika 2.0
+        assertEquals("Subject is here",
+                metadata.get(Metadata.SUBJECT));
         assertEquals("Subject is here",
                 metadata.get(OfficeOpenXMLCore.SUBJECT));
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 34fb286..063a062 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -383,11 +383,13 @@ public class SXWPFExtractorTest extends TikaTest {
 
         assertContains("Keyword1 Keyword2", content);
         assertEquals("Keyword1 Keyword2",
-                metadata.get(Office.KEYWORDS));
+                metadata.get(Metadata.KEYWORDS));
 
         assertContains("Subject is here", content);
         // TODO: Remove subject in Tika 2.0
         assertEquals("Subject is here",
+                metadata.get(Metadata.SUBJECT));
+        assertEquals("Subject is here",
                 metadata.get(OfficeOpenXMLCore.SUBJECT));
 
         assertContains("Suddenly some Japanese text:", content);
@@ -422,14 +424,16 @@ public class SXWPFExtractorTest extends TikaTest {
         assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
         assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
         assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+        assertEquals("Microsoft Office Word", metadata.get(Metadata.APPLICATION_NAME));
         assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
         assertEquals("1", metadata.get(Office.PAGE_COUNT));
         assertEquals("2", metadata.get(Office.WORD_COUNT));
         assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("My Keyword", metadata.get(Office.KEYWORDS));
+        assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
         assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
         // TODO: Remove subject in Tika 2.0
-//        assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+        assertEquals("My subject", metadata.get(Metadata.SUBJECT));
         assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
         assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
         assertEquals("true", metadata.get("custom:myCustomBoolean"));
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index 5cf1fe7..0acd02b 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -81,7 +81,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
 
         assertEquals(12, metadataList.size());
-       // assertEquals("Microsoft Office Word", metadataList.get(0).get("Application-Name"));
+        assertEquals("Microsoft Office Word", metadataList.get(0).get("Application-Name"));
         assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
 
         assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5"));