You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/09 13:14:58 UTC
[tika] branch TIKA-2552 updated: TIKA-2552 -- Add back the 1.x metadata variants
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-2552
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-2552 by this push:
new 6fbc59a TIKA-2552 -- Add back the 1.x metadata variants
6fbc59a is described below
commit 6fbc59a696b8254fde57e79130daa54837e0f136
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Aug 9 09:14:44 2018 -0400
TIKA-2552 -- Add back the 1.x metadata variants
---
.../test/java/org/apache/tika/cli/TikaCLITest.java | 4 +-
.../parser/microsoft/ooxml/MetadataExtractor.java | 83 +++++++++++++++-------
.../parser/microsoft/PowerPointParserTest.java | 6 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 17 +++--
.../parser/microsoft/ooxml/SXSLFExtractorTest.java | 6 +-
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 10 ++-
.../tika/server/RecursiveMetadataResourceTest.java | 2 +-
7 files changed, 85 insertions(+), 43 deletions(-)
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 4ac6074..8804b0c 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -471,12 +471,12 @@ public class TikaCLITest {
String[] params = new String[]{"-m", "-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
String content = outContent.toString(UTF_8.name());
- /* assertTrue(content.contains("[\n" +
+ assertTrue(content.contains("[\n" +
" {\n" +
" \"Application-Name\": \"Microsoft Office Word\",\n" +
" \"Application-Version\": \"15.0000\",\n" +
" \"Character Count\": \"28\",\n" +
- " \"Character-Count-With-Spaces\": \"31\","));*/
+ " \"Character-Count-With-Spaces\": \"31\","));
assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"/embed1.zip\""));
assertFalse(content.contains("X-TIKA:content"));
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 09556b0..e5da8ce 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -16,17 +16,12 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import java.math.BigDecimal;
-import java.util.Date;
-import java.util.Optional;
-
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
-import org.apache.poi.openxml4j.util.Nullable;
-import org.apache.poi.ss.formula.functions.T;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -42,6 +37,10 @@ import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+import java.math.BigDecimal;
+import java.util.Date;
+import java.util.Optional;
+
/**
* OOXML metadata extractor.
* <p/>
@@ -87,7 +86,7 @@ public class MetadataExtractor {
.getIdentifierProperty());
addProperty(metadata, OfficeOpenXMLCore.SUBJECT,
propsHolder.getSubjectProperty());
- addProperty(metadata, Office.KEYWORDS, propsHolder
+ addProperty(metadata, TikaCoreProperties.KEYWORDS, propsHolder
.getKeywordsProperty());
setProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder
.getLanguageProperty());
@@ -99,10 +98,20 @@ public class MetadataExtractor {
.getModifiedProperty());
setProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder
.getRevisionProperty());
-
+ // TODO: Move to OO subject in Tika 2.0
+ setProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
+ propsHolder.getSubjectProperty());
setProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty());
setProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty());
+ // Legacy Tika-1.0 style stats
+ // TODO Remove these in Tika 2.0
+ setProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty());
+ setProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
+ .getContentStatusProperty());
+ setProperty(metadata, Metadata.REVISION_NUMBER, propsHolder
+ .getRevisionProperty());
+ setProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty());
}
private void extractMetadata(POIXMLProperties.ExtendedProperties properties,
@@ -143,6 +152,23 @@ public class MetadataExtractor {
setProperty(metadata, Office.WORD_COUNT, propsHolder.getWords());
setProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters());
setProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
+
+ // Legacy Tika-1.0 style stats
+ // TODO Remove these in Tika 2.0
+ setProperty(metadata, Metadata.APPLICATION_NAME, propsHolder.getApplication());
+ setProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder.getAppVersion());
+ setProperty(metadata, Metadata.MANAGER, propsHolder.getManager());
+ setProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
+ setProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
+ setProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
+ setProperty(metadata, Metadata.TOTAL_TIME, totalTime);
+ setProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages());
+ setProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides());
+ setProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs());
+ setProperty(metadata, MSOffice.LINE_COUNT, propsHolder.getLines());
+ setProperty(metadata, MSOffice.WORD_COUNT, propsHolder.getWords());
+ setProperty(metadata, MSOffice.CHARACTER_COUNT, propsHolder.getCharacters());
+ setProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
}
private void extractMetadata(POIXMLProperties.CustomProperties properties,
@@ -230,19 +256,19 @@ public class MetadataExtractor {
}
private <T> void setProperty(Metadata metadata, Property property, Optional<T> nullableValue) {
- if (! nullableValue.isPresent()) {
+ if (!nullableValue.isPresent()) {
return;
}
T value = nullableValue.get();
- if (value instanceof Date) {
- metadata.set(property, (Date) value);
- } else if (value instanceof String) {
- metadata.set(property, (String) value);
- } else if (value instanceof Integer) {
- metadata.set(property, (Integer) value);
- } else if (value instanceof Double) {
- metadata.set(property, (Double) value);
- }
+ if (value instanceof Date) {
+ metadata.set(property, (Date) value);
+ } else if (value instanceof String) {
+ metadata.set(property, (String) value);
+ } else if (value instanceof Integer) {
+ metadata.set(property, (Integer) value);
+ } else if (value instanceof Double) {
+ metadata.set(property, (Double) value);
+ }
}
private <T> void addProperty(Metadata metadata, Property property, Optional<T> nullableValue) {
@@ -250,16 +276,19 @@ public class MetadataExtractor {
return;
}
T value = nullableValue.get();
- if (value instanceof String) {
- metadata.add(property, (String) value);
- } else {
- throw new IllegalArgumentException("Can't add property of class: "+nullableValue.getClass());
- }
+ if (value instanceof String) {
+ metadata.add(property, (String) value);
+ } else {
+ throw new IllegalArgumentException("Can't add property of class: " + nullableValue.getClass());
+ }
}
- private void setProperty(Metadata metadata, String name, Nullable<?> value) {
- if (value.getValue() != null) {
- setProperty(metadata, name, value.getValue().toString());
+
+ private void setProperty(Metadata metadata, String property, Optional<String> nullableValue) {
+ if (!nullableValue.isPresent()) {
+ return;
}
+ String value = nullableValue.get();
+ metadata.set(property, value);
}
private void setProperty(Metadata metadata, Property property, String value) {
@@ -287,7 +316,7 @@ public class MetadataExtractor {
}
private void addMultiProperty(Metadata metadata, Property property, Optional<String> value) {
- if (! value.isPresent()) {
+ if (!value.isPresent()) {
return;
}
SummaryExtractor.addMulti(metadata, property, value.get());
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 45a7cb4..1d62efd 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -137,7 +137,7 @@ public class PowerPointParserTest extends TikaTest {
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
- //TIKA-1171
+ //TIKA-1171, POI-62591
//assertEquals(-1, content.indexOf("*"));
}
@@ -161,7 +161,7 @@ public class PowerPointParserTest extends TikaTest {
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
- //TIKA-1171
+ //TIKA-1171, POI-62591
//assertEquals(-1, content.indexOf("*"));
}
@@ -180,7 +180,7 @@ public class PowerPointParserTest extends TikaTest {
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
- //TIKA-1171
+ //TIKA-1171, POI-62591
//assertEquals(-1, content.indexOf("*"));
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 35e702f..a2eb625 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -679,11 +679,12 @@ public class OOXMLParserTest extends TikaTest {
assertContains("Keyword1 Keyword2", content);
assertEquals("Keyword1 Keyword2",
- metadata.get(Office.KEYWORDS));
+ metadata.get(Metadata.KEYWORDS));
assertContains("Subject is here", content);
// TODO: Remove subject in Tika 2.0
-
+ assertEquals("Subject is here",
+ metadata.get(Metadata.SUBJECT));
assertEquals("Subject is here",
metadata.get(OfficeOpenXMLCore.SUBJECT));
@@ -749,9 +750,12 @@ public class OOXMLParserTest extends TikaTest {
assertContains("Keyword1 Keyword2", xml);
assertEquals("Keyword1 Keyword2",
- metadata.get(Office.KEYWORDS));
+ metadata.get(Metadata.KEYWORDS));
assertContains("Subject is here", xml);
+ // TODO: Remove subject in Tika 2.0
+ assertEquals("Subject is here",
+ metadata.get(Metadata.SUBJECT));
assertEquals("Subject is here",
metadata.get(OfficeOpenXMLCore.SUBJECT));
@@ -855,6 +859,7 @@ public class OOXMLParserTest extends TikaTest {
assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.LAST_MODIFIED));
assertEquals("2011-08-22T14:24:38Z", metadata.get(TikaCoreProperties.MODIFIED));
assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.DATE));
+ assertEquals("Microsoft Excel", metadata.get(Metadata.APPLICATION_NAME));
assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
assertEquals("true", metadata.get("custom:myCustomBoolean"));
assertEquals("3", metadata.get("custom:myCustomNumber"));
@@ -885,14 +890,16 @@ public class OOXMLParserTest extends TikaTest {
assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+ assertEquals("Microsoft Office Word", metadata.get(Metadata.APPLICATION_NAME));
assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
assertEquals("1", metadata.get(Office.PAGE_COUNT));
assertEquals("2", metadata.get(Office.WORD_COUNT));
assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("My Keyword", metadata.get(Office.KEYWORDS));
+ assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+ assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
// TODO: Remove subject in Tika 2.0
-// assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+ assertEquals("My subject", metadata.get(Metadata.SUBJECT));
assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
assertEquals("true", metadata.get("custom:myCustomBoolean"));
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index 25fdcde..cb935c5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -337,10 +337,12 @@ public class SXSLFExtractorTest extends TikaTest {
assertContains("Keyword1 Keyword2", xml);
assertEquals("Keyword1 Keyword2",
- metadata.get(Office.KEYWORDS));
+ metadata.get(Metadata.KEYWORDS));
assertContains("Subject is here", xml);
-
+ // TODO: Remove subject in Tika 2.0
+ assertEquals("Subject is here",
+ metadata.get(Metadata.SUBJECT));
assertEquals("Subject is here",
metadata.get(OfficeOpenXMLCore.SUBJECT));
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 34fb286..063a062 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -383,11 +383,13 @@ public class SXWPFExtractorTest extends TikaTest {
assertContains("Keyword1 Keyword2", content);
assertEquals("Keyword1 Keyword2",
- metadata.get(Office.KEYWORDS));
+ metadata.get(Metadata.KEYWORDS));
assertContains("Subject is here", content);
// TODO: Remove subject in Tika 2.0
assertEquals("Subject is here",
+ metadata.get(Metadata.SUBJECT));
+ assertEquals("Subject is here",
metadata.get(OfficeOpenXMLCore.SUBJECT));
assertContains("Suddenly some Japanese text:", content);
@@ -422,14 +424,16 @@ public class SXWPFExtractorTest extends TikaTest {
assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+ assertEquals("Microsoft Office Word", metadata.get(Metadata.APPLICATION_NAME));
assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
assertEquals("1", metadata.get(Office.PAGE_COUNT));
assertEquals("2", metadata.get(Office.WORD_COUNT));
assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("My Keyword", metadata.get(Office.KEYWORDS));
+ assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+ assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
// TODO: Remove subject in Tika 2.0
-// assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+ assertEquals("My subject", metadata.get(Metadata.SUBJECT));
assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
assertEquals("true", metadata.get("custom:myCustomBoolean"));
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index 5cf1fe7..0acd02b 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -81,7 +81,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
- // assertEquals("Microsoft Office Word", metadataList.get(0).get("Application-Name"));
+ assertEquals("Microsoft Office Word", metadataList.get(0).get("Application-Name"));
assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5"));