You are viewing a plain text version of this content. The link to the canonical version is not preserved in this plain-text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/15 18:59:37 UTC

[tika] branch branch_1x updated (9b42784 -> 941a150)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 9b42784  TIKA-3104 -- add detection and parsing for xml based plist files
     new 7d749cf  TIKA-3130 -- add ICC prefix
     new 5ecb3b9  TIKA-3135 -- no need to spool the file for the metadata extractor's HeifParser
     new a11314f  writeLimit and maxEmbeddedResources for recursive parsing - add header (#326)
     new e57c832  TIKA-3134 -- fix bug and add unit tests
     new 941a150  fix merge conflicts and unit test

The 5 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../org/apache/tika/parser/image/HeifParser.java   |  8 +--
 .../tika/parser/image/ImageMetadataExtractor.java  | 14 +++--
 .../apache/tika/parser/jpeg/JpegParserTest.java    |  6 +++
 .../server/resource/RecursiveMetadataResource.java | 12 ++++-
 .../tika/server/RecursiveMetadataResourceTest.java | 61 ++++++++++++++++++++--
 5 files changed, 84 insertions(+), 17 deletions(-)


[tika] 05/05: fix merge conflicts and unit test

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 941a1505fb4cfc87cfdbd2a1305c02173055c6cb
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jul 15 14:59:06 2020 -0400

    fix merge conflicts and unit test
---
 .../main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java  | 2 +-
 .../test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 9e1bec3..e4f24cd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -80,7 +80,7 @@ public class ImageMetadataExtractor {
     private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
     private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6 dp seems to be reasonable
 
-    private static final String ICC_NS = "ICC" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    private static final String ICC_NS = "ICC" + Metadata.NAMESPACE_PREFIX_DELIMITER;
 
     private final Metadata metadata;
     private DirectoryHandler[] handlers;
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index a65efdc..544a602 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -370,7 +370,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         metadataList = JsonMetadataList.fromJson(reader);
         assertEquals(12, metadataList.size());
         assertEquals("true", metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
-        assertContains("When in the Course of human events it becomes necessary for one people",
+        assertContains("When in the Course of human events it becomes",// necessary for one people"
                 metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
         assertNotContained("to dissolve",
                 metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));


[tika] 03/05: writeLimit and maxEmbeddedResources for recursive parsing - add header (#326)

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a11314f8ab395074bc4d5e8a4ba35ef5c8c2012d
Author: Nicholas DiPiazza <ni...@lucidworks.com>
AuthorDate: Wed Jul 15 12:36:48 2020 -0500

    writeLimit and maxEmbeddedResources for recursive parsing - add header (#326)
    
    Adds HTTP header parameters so that these limits can be customized.
---
 .../tika/server/resource/RecursiveMetadataResource.java    | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index 524759e..15aca64 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -139,10 +139,20 @@ public class RecursiveMetadataResource {
 		TikaResource.fillParseContext(context, httpHeaders, null);
 		TikaResource.logRequest(LOG, info, metadata);
 
-        BasicContentHandlerFactory.HANDLER_TYPE type =
+    int writeLimit = -1;
+    if (httpHeaders.containsKey("writeLimit")) {
+      writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
+    }
+
+    int maxEmbeddedResources = -1;
+    if (httpHeaders.containsKey("maxEmbeddedResources")) {
+      writeLimit = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
+    }
+
+    BasicContentHandlerFactory.HANDLER_TYPE type =
                 BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
 		RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-		        new BasicContentHandlerFactory(type, -1), -1);
+		        new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources);
 		try {
             TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context);
         } catch (SecurityException e) {


[tika] 01/05: TIKA-3130 -- add ICC prefix

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7d749cf0a8434a696fce8dd33a911e76520cd35a
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jul 15 13:27:18 2020 -0400

    TIKA-3130 -- add ICC prefix
---
 .../java/org/apache/tika/parser/image/ImageMetadataExtractor.java   | 6 ++++++
 .../src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java   | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 622f48a..1a57ce2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -49,6 +49,7 @@ import com.drew.metadata.exif.ExifReader;
 import com.drew.metadata.exif.ExifSubIFDDirectory;
 import com.drew.metadata.exif.ExifThumbnailDirectory;
 import com.drew.metadata.exif.GpsDirectory;
+import com.drew.metadata.icc.IccDirectory;
 import com.drew.metadata.iptc.IptcDirectory;
 import com.drew.metadata.jpeg.JpegCommentDirectory;
 import com.drew.metadata.jpeg.JpegDirectory;
@@ -77,6 +78,9 @@ public class ImageMetadataExtractor {
     //TODO: add this to the signatures from the actual parse
     private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
     private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6 dp seems to be reasonable
+
+    private static final String ICC_NS = "ICC" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
     private final Metadata metadata;
     private DirectoryHandler[] handlers;
 
@@ -308,6 +312,8 @@ public class ImageMetadataExtractor {
                         }
                         if (directory instanceof ExifDirectoryBase) {
                             metadata.set(directory.getName() + ":" + name, value);
+                        } else if (directory instanceof IccDirectory) {
+                            metadata.set(ICC_NS+name, value);
                         } else {
                             metadata.set(name, value);
                         }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
index dd0d234..d39c2fb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.jpeg;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
 import java.io.InputStream;
@@ -204,6 +205,11 @@ public class JpegParserTest {
         assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
         assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
         assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+
+        //ICC
+        assertEquals("IEC", metadata.get("ICC:Device manufacturer").trim());
+        assertNull(metadata.get("Device manufacturer"));
+
     }
 
     @Test


[tika] 02/05: TIKA-3135 -- no need to spool the file for the metadata extractor's HeifParser

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5ecb3b925a6afa472b2ad44ab2b32ae253faa3fe
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jul 15 13:34:54 2020 -0400

    TIKA-3135 -- no need to spool the file for the metadata extractor's HeifParser
---
 .../src/main/java/org/apache/tika/parser/image/HeifParser.java    | 8 +-------
 .../java/org/apache/tika/parser/image/ImageMetadataExtractor.java | 8 +++-----
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java
index 9880d3c..8931c50 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java
@@ -52,13 +52,7 @@ public class HeifParser extends AbstractParser {
 
     @Override
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
-        TemporaryResources tmp = new TemporaryResources();
-        try {
-            TikaInputStream tis = TikaInputStream.get(stream, tmp);
-            new ImageMetadataExtractor(metadata).parseHeif(tis.getFile());
-        } finally {
-            tmp.dispose();
-        }
+        new ImageMetadataExtractor(metadata).parseHeif(stream);
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 1a57ce2..9e1bec3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -56,6 +56,7 @@ import com.drew.metadata.jpeg.JpegDirectory;
 import org.apache.jempbox.xmp.XMPMetadata;
 import org.apache.poi.util.IOUtils;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.IPTC;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
@@ -156,13 +157,10 @@ public class ImageMetadataExtractor {
         }
     }
 
-    public void parseHeif(File file) throws IOException, TikaException {
+    public void parseHeif(InputStream is) throws IOException, TikaException {
         try {
-            com.drew.metadata.Metadata heifMetadata = new com.drew.metadata.Metadata();
-            heifMetadata = HeifMetadataReader.readMetadata(new FileInputStream(file));
+            com.drew.metadata.Metadata heifMetadata = HeifMetadataReader.readMetadata(is);
             handle(heifMetadata);
-        } catch (IOException e) {
-            throw e;
         } catch (MetadataException e) {
             throw new TikaException("Can't process Heif data", e);
         }


[tika] 04/05: TIKA-3134 -- fix bug and add unit tests

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e57c832a56b7917ff6da01af129c909aaa2ccf69
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jul 15 14:27:28 2020 -0400

    TIKA-3134 -- fix bug and add unit tests
---
 .../server/resource/RecursiveMetadataResource.java | 18 +++----
 .../tika/server/RecursiveMetadataResourceTest.java | 61 ++++++++++++++++++++--
 2 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index 15aca64..07d20c5 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -139,17 +139,17 @@ public class RecursiveMetadataResource {
 		TikaResource.fillParseContext(context, httpHeaders, null);
 		TikaResource.logRequest(LOG, info, metadata);
 
-    int writeLimit = -1;
-    if (httpHeaders.containsKey("writeLimit")) {
-      writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
-    }
+        int writeLimit = -1;
+        if (httpHeaders.containsKey("writeLimit")) {
+            writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
+        }
 
-    int maxEmbeddedResources = -1;
-    if (httpHeaders.containsKey("maxEmbeddedResources")) {
-      writeLimit = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
-    }
+        int maxEmbeddedResources = -1;
+        if (httpHeaders.containsKey("maxEmbeddedResources")) {
+        maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
+        }
 
-    BasicContentHandlerFactory.HANDLER_TYPE type =
+        BasicContentHandlerFactory.HANDLER_TYPE type =
                 BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
 		RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
 		        new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources);
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index d43b741..a65efdc 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -18,6 +18,7 @@
 package org.apache.tika.server;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.TikaTest.assertNotContained;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
@@ -25,8 +26,6 @@ import static org.junit.Assert.assertTrue;
 
 import javax.ws.rs.core.Response;
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
@@ -34,12 +33,10 @@ import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
-import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
-import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -323,4 +320,60 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         assertNull(metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
     }
 
+    @Test
+    public void testEmbeddedResourceLimit() throws Exception {
+        for (int i : new int[]{0,1,5}) {
+            Response response = WebClient
+                    .create(endPoint + META_PATH)
+                    .accept("application/json")
+                    .header("maxEmbeddedResources", Integer.toString(i))
+                    .put(ClassLoader
+                            .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+            assertEquals(200, response.getStatus());
+            // Check results
+            Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+            List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+            assertEquals(i+1, metadataList.size());
+        }
+    }
+
+    @Test
+    public void testWriteLimit() throws Exception {
+        int writeLimit = 10;
+        Response response = WebClient
+                .create(endPoint + META_PATH)
+                .accept("application/json")
+                .header("writeLimit", Integer.toString(writeLimit))
+                .put(ClassLoader
+                        .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        assertEquals(200, response.getStatus());
+        // Check results
+        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(1, metadataList.size());
+        assertEquals("true", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
+
+        //now try with a write limit of 100
+        writeLimit = 100;
+        response = WebClient
+                .create(endPoint + META_PATH)
+                .accept("application/json")
+                .header("writeLimit", Integer.toString(writeLimit))
+                .put(ClassLoader
+                        .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        assertEquals(200, response.getStatus());
+        // Check results
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        assertEquals("true", metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
+        assertContains("When in the Course of human events it becomes necessary for one people",
+                metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertNotContained("to dissolve",
+                metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
+    }
 }