You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/13 20:17:02 UTC

[tika] branch master updated: TIKA-2687

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new c0fb57d  TIKA-2687
c0fb57d is described below

commit c0fb57d9d20e8eb7cb77bce8742e4566a18f5db8
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Jul 13 16:16:50 2018 -0400

    TIKA-2687
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  82 +++++++++---
 .../test/java/org/apache/tika/cli/TikaCLITest.java | 145 ++++++++++++++++++---
 .../resources/test-data/testZip_absolutePath.zip   | Bin 0 -> 334 bytes
 .../test-data/testZip_overlappingNames.zip         | Bin 0 -> 276 bytes
 .../test/resources/test-data/testZip_relative.zip  | Bin 0 -> 192 bytes
 .../test/resources/test-data/testZip_zeroByte.zip  | Bin 0 -> 154 bytes
 .../apache/tika/example/ExtractEmbeddedFiles.java  |  12 +-
 .../tika/server/resource/UnpackerResource.java     |  26 +++-
 8 files changed, 224 insertions(+), 41 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 399152d..b712296 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -54,6 +54,7 @@ import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.UUID;
 
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
@@ -441,7 +442,13 @@ public class TikaCLI {
         } else if (arg.equals("-d") || arg.equals("--detect")) {
             type = DETECT;
         } else if (arg.startsWith("--extract-dir=")) {
-            extractDir = new File(arg.substring("--extract-dir=".length()));
+            String dirPath = arg.substring("--extract-dir=".length());
+            //if the user accidentally doesn't include
+            //a directory, set the directory to the cwd
+            if (dirPath.length() == 0) {
+                dirPath = ".";
+            }
+            extractDir = new File(dirPath);
         } else if (arg.equals("-z") || arg.equals("--extract")) {
             extractInlineImagesFromPDFs();
             type = NO_OUTPUT;
@@ -1027,31 +1034,20 @@ public class TikaCLI {
         }
 
         public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
-            String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
 
-            if (name == null) {
-                name = "file" + count++;
-            }
             if (! inputStream.markSupported()) {
                 inputStream = TikaInputStream.get(inputStream);
             }
             MediaType contentType = detector.detect(inputStream, metadata);
 
-            if (name.indexOf('.')==-1 && contentType!=null) {
-                try {
-                    name += config.getMimeRepository().forName(
-                            contentType.toString()).getExtension();
-                } catch (MimeTypeException e) {
-                    e.printStackTrace();
-                }
+            String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+            File outputFile = null;
+            if (name == null) {
+                name = "file" + count++;
             }
+            outputFile = getOutputFile(name, metadata, contentType);
 
-            String relID = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
-            if (relID != null && !name.startsWith(relID)) {
-                name = relID + "_" + name;
-            }
 
-            File outputFile = new File(extractDir, FilenameUtils.normalize(name));
             File parent = outputFile.getParentFile();
             if (!parent.exists()) {
                 if (!parent.mkdirs()) {
@@ -1088,6 +1084,58 @@ public class TikaCLI {
             }
         }
 
+        /**
+         * Maps an embedded resource name to a file under extractDir.
+         */
+        private File getOutputFile(String name, Metadata metadata, MediaType contentType) {
+            String ext = getExtension(contentType);
+            boolean lacksDot = name.indexOf('.') < 0;
+            if (lacksDot && contentType != null) {
+                name = name + ext;
+            }
+            String relID = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
+            if (relID != null && !name.startsWith(relID)) {
+                name = relID + "_" + name;
+            }
+            //swap out NUL up front so that FilenameUtils.normalize does not throw
+            name = name.replaceAll("\u0000", " ");
+            String safeName = FilenameUtils.normalize(name);
+            if (safeName == null) {
+                //normalize gave no usable path: fall back to the bare file name
+                safeName = FilenameUtils.getName(name);
+            }
+            if (safeName == null) {
+                safeName = "file"+count++ +ext;
+            }
+            //drop a leading C:/ or ~/ or / so the name is relative to extractDir
+            int prefixLength = FilenameUtils.getPrefixLength(safeName);
+            if (prefixLength > -1) {
+                safeName = safeName.substring(prefixLength);
+            }
+            File candidate = new File(extractDir, safeName);
+            if (!candidate.exists()) {
+                return candidate;
+            }
+            //name is taken: use a uuid-prefixed file name directly under extractDir
+            return new File(extractDir, UUID.randomUUID().toString()+"-"+FilenameUtils.getName(safeName));
+        }
+
+        //looks up the file extension for the type in the mime repository; ".bin" if unavailable
+        private String getExtension(MediaType contentType) {
+            if (contentType == null) {
+                //the caller asks for the extension before its own null check on the type
+                return ".bin";
+            }
+            try {
+                String ext = config.getMimeRepository().forName(
+                        contentType.toString()).getExtension();
+                return ext == null ? ".bin" : ext;
+            } catch (MimeTypeException e) {
+                e.printStackTrace();
+                return ".bin";
+            }
+        }
+
         protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
                 throws IOException {
             for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 5d102dc..b877edf 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -17,15 +17,25 @@
 package org.apache.tika.cli;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayOutputStream;
 import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InvalidObjectException;
+import java.io.OutputStream;
 import java.io.PrintStream;
 import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipOutputStream;
 
 import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.FilenameUtils;
 import org.apache.tika.exception.TikaException;
 import org.junit.After;
 import org.junit.Before;
@@ -245,39 +255,74 @@ public class TikaCLITest {
     }
 
     @Test
-    public void testExtract() throws Exception {
+    public void testExtractSimple() throws Exception {
+        String[] expectedChildren = new String[]{
+                "MBD002B040A.cdx",
+                "file4.png",
+                "MBD002B0FA6_file5.bin",
+                "MBD00262FE3.txt",
+                "file0.emf"
+        };
+        testExtract("/coffee.xls", expectedChildren, 8);
+    }
+
+    @Test
+    public void testExtractAbsolute() throws Exception {
+        //touch.pl must end up below the extract dir, which is
+        //expected to hold 2 top-level children afterwards
+        testExtract("testZip_absolutePath.zip",
+                new String[]{"dangerous/dont/touch.pl"}, 2);
+    }
+
+    @Test
+    public void testExtractRelative() throws Exception {
+        //a single child named touch.pl is expected
+        //directly inside the extract dir
+        testExtract("testZip_relative.zip",
+                new String[]{"touch.pl"});
+    }
+
+    @Test
+    public void testExtractOverlapping() throws Exception {
+        //two children are expected: f1.txt itself plus a second
+        //file whose name is f1.txt with a uuid prepended
+        String[] expectedChildren = {"f1.txt"};
+        testExtract("testZip_overlappingNames.zip",
+                expectedChildren, 2);
+    }
+
+    @Test
+    public void testExtract0x00() throws Exception {
+        //the extracted file name is expected to contain
+        //a space between "dang" and "erous"
+        testExtract("testZip_zeroByte.zip",
+                new String[]{"dang erous.pl"});
+    }
+
+    private void testExtract(String targetFile, String[] expectedChildrenFileNames) throws Exception {
+        testExtract(targetFile, expectedChildrenFileNames, expectedChildrenFileNames.length);
+    }
+    private void testExtract(String targetFile, String[] expectedChildrenFileNames, int expectedLength) throws Exception {
         File tempFile = File.createTempFile("tika-test-", "");
         tempFile.delete();
-        tempFile.mkdir(); // not really good method for production usage, but ok for tests
-        // google guava library has better solution
+        tempFile.mkdir();
 
         try {
-            String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/coffee.xls"};
+            String[] params = {"--extract-dir=" + tempFile.getAbsolutePath(), "-z", resourcePrefix + "/"+targetFile};
 
             TikaCLI.main(params);
 
             StringBuffer allFiles = new StringBuffer();
+            assertEquals(expectedLength, tempFile.list().length);
             for (String f : tempFile.list()) {
+
                 if (allFiles.length() > 0) allFiles.append(" : ");
                 allFiles.append(f);
             }
 
-            // ChemDraw file
-            File expectedCDX = new File(tempFile, "MBD002B040A.cdx");
-            // Image of the ChemDraw molecule
-            File expectedIMG = new File(tempFile, "file4.png");
-            // OLE10Native
-            File expectedOLE10 = new File(tempFile, "MBD002B0FA6_file5.bin");
-            // Something that really isnt a text file... Not sure what it is???
-            File expected262FE3 = new File(tempFile, "MBD00262FE3.txt");
-            // Image of one of the embedded resources
-            File expectedEMF = new File(tempFile, "file0.emf");
-
-            assertExtracted(expectedCDX, allFiles.toString());
-            assertExtracted(expectedIMG, allFiles.toString());
-            assertExtracted(expectedOLE10, allFiles.toString());
-            assertExtracted(expected262FE3, allFiles.toString());
-            assertExtracted(expectedEMF, allFiles.toString());
+            for (String expectedChildName : expectedChildrenFileNames) {
+                assertExtracted(new File(tempFile, expectedChildName), allFiles.toString());
+            }
         } finally {
             FileUtils.deleteDirectory(tempFile);
         }
@@ -508,5 +553,63 @@ public class TikaCLITest {
         assertFalse(content.contains("org.apache.tika.parser.executable.Executable"));
     }
 
-
+    /**
+     * Builds zip archives whose entry names are hostile (embedded NUL, absolute
+     * paths, ../ traversal, colliding names), under the same file names as the
+     * testZip_*.zip fixtures referenced by the extraction tests in this class.
+     * The archives are written to a scratch directory that is deleted afterwards,
+     * so the test does not depend on a fixed local path such as C:/data.
+     */
+    @Test
+    public void testFileNameNormalization() throws Exception {
+        File scratchDir = File.createTempFile("tika-test-zips-", "");
+        scratchDir.delete();
+        scratchDir.mkdir();
+        try {
+            //NUL character inside the entry name
+            File z = writeZip(scratchDir, "testZip_zeroByte.zip",
+                    "dang\u0000erous.pl");
+            assertTrue(z.isFile() && z.length() > 0);
+
+            //windows and unix style absolute paths
+            z = writeZip(scratchDir, "testZip_absolutePath.zip",
+                    "C:/dangerous/dont/touch.pl", "/dangerous/dont/touch.pl");
+            assertTrue(z.isFile() && z.length() > 0);
+
+            //relative path with leading ../ segments
+            z = writeZip(scratchDir, "testZip_relative.zip",
+                    "../../../dangerous/dont/touch.pl");
+            assertTrue(z.isFile() && z.length() > 0);
+
+            //two entries that share the file name f1.txt
+            z = writeZip(scratchDir, "testZip_overlappingNames.zip",
+                    "f1.txt", "../../../f1.txt");
+            assertTrue(z.isFile() && z.length() > 0);
+        } finally {
+            FileUtils.deleteDirectory(scratchDir);
+        }
+    }
+
+    /**
+     * Writes a zip file containing one small UTF-8 text entry per name.
+     *
+     * @param dir directory to create the archive in
+     * @param zipName file name of the archive
+     * @param entryNames entry names, written in the given order
+     * @return the archive that was written
+     * @throws IOException if the archive cannot be written
+     */
+    private static File writeZip(File dir, String zipName, String... entryNames) throws IOException {
+        File zip = new File(dir, zipName);
+        //try-with-resources closes the stream even if writing an entry fails
+        try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(zip))) {
+            int i = 0;
+            for (String entryName : entryNames) {
+                zos.putNextEntry(new ZipEntry(entryName));
+                zos.write(("hello world" + (++i)).getBytes(StandardCharsets.UTF_8));
+                zos.closeEntry();
+            }
+        }
+        return zip;
+    }
 }
diff --git a/tika-app/src/test/resources/test-data/testZip_absolutePath.zip b/tika-app/src/test/resources/test-data/testZip_absolutePath.zip
new file mode 100644
index 0000000..29affae
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testZip_absolutePath.zip differ
diff --git a/tika-app/src/test/resources/test-data/testZip_overlappingNames.zip b/tika-app/src/test/resources/test-data/testZip_overlappingNames.zip
new file mode 100644
index 0000000..62707c7
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testZip_overlappingNames.zip differ
diff --git a/tika-app/src/test/resources/test-data/testZip_relative.zip b/tika-app/src/test/resources/test-data/testZip_relative.zip
new file mode 100644
index 0000000..e48ed88
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testZip_relative.zip differ
diff --git a/tika-app/src/test/resources/test-data/testZip_zeroByte.zip b/tika-app/src/test/resources/test-data/testZip_zeroByte.zip
new file mode 100644
index 0000000..00dc42c
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testZip_zeroByte.zip differ
diff --git a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
index 773b9df..90acba5 100644
--- a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
+++ b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.UUID;
 
 import org.apache.commons.io.FilenameUtils;
 import org.apache.tika.config.TikaConfig;
@@ -83,6 +84,11 @@ public class ExtractEmbeddedFiles {
                 //make sure to select only the file name (not any directory paths
                 //that might be included in the name) and make sure
                 //to normalize the name
+                name = name.replaceAll("\u0000", " ");
+                int prefix = FilenameUtils.getPrefixLength(name);
+                if (prefix > -1) {
+                    name = name.substring(prefix);
+                }
                 name = FilenameUtils.normalize(FilenameUtils.getName(name));
             }
 
@@ -97,9 +103,11 @@ public class ExtractEmbeddedFiles {
                     e.printStackTrace();
                 }
             }
-            //should add check to make sure that you aren't overwriting a file
+
             Path outputFile = outputDir.resolve(name);
-            //do a better job than this of checking
+            if (Files.exists(outputFile)) {
+                outputFile = outputDir.resolve(UUID.randomUUID().toString()+"-"+name);
+            }
             Files.createDirectories(outputFile.getParent());
             Files.copy(stream, outputFile);
         }
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
index 3189237..df06607 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
@@ -37,8 +37,10 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.UUID;
 
 import au.com.bytecode.opencsv.CSVWriter;
+import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.lang.mutable.MutableInt;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -220,7 +222,7 @@ public class UnpackerResource {
                 }
             }
 
-            final String finalName = name;
+            final String finalName = getFinalName(name, zout);
 
             if (data.length > 0) {
                 zout.put(finalName, data);
@@ -243,6 +245,28 @@ public class UnpackerResource {
             }
         }
 
+        //cleans up the entry name and avoids reusing a name that is already a key in zout
+        private String getFinalName(String name, Map<String, byte[]> zout) {
+            name = name.replaceAll("\u0000", " ");
+            String candidate = FilenameUtils.normalize(name);
+            if (candidate == null) {
+                //normalize gave no usable path: fall back to the bare file name
+                candidate = FilenameUtils.getName(name);
+            }
+            if (candidate == null) {
+                candidate = count.toString();
+            }
+            //strip off initial C:/ or ~/ or /
+            int prefixLength = FilenameUtils.getPrefixLength(candidate);
+            if (prefixLength >= 0) {
+                candidate = candidate.substring(prefixLength);
+            }
+            //a name that is already present in zout gets a random uuid prepended
+            return zout.containsKey(candidate)
+                    ? UUID.randomUUID().toString()+"-"+candidate
+                    : candidate;
+        }
+
         protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
                 throws IOException {
             for (Entry entry : sourceDir) {