You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/30 20:22:08 UTC

[tika] branch master updated: improve file mangling

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 547c5b6  improve file mangling
547c5b6 is described below

commit 547c5b65461037fd8c76594f09f93cff7f8c0d7a
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 30 16:21:46 2020 -0400

    improve file mangling
---
 .../java/org/apache/tika/TestCorruptedFiles.java   | 143 ++++++++++++++++-----
 1 file changed, 110 insertions(+), 33 deletions(-)

diff --git a/tika-parsers/src/test/java/org/apache/tika/TestCorruptedFiles.java b/tika-parsers/src/test/java/org/apache/tika/TestCorruptedFiles.java
index 08d2976..13bf8da 100644
--- a/tika-parsers/src/test/java/org/apache/tika/TestCorruptedFiles.java
+++ b/tika-parsers/src/test/java/org/apache/tika/TestCorruptedFiles.java
@@ -28,7 +28,11 @@ import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Random;
 import java.util.concurrent.Callable;
@@ -47,7 +51,6 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
-import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -64,7 +67,7 @@ import org.xml.sax.helpers.DefaultHandler;
  * unearth a large number of bugs.
  * </p>
  */
-@Ignore
+//@Ignore
 public class TestCorruptedFiles extends TikaTest {
 
     //I did the per_10000, because I wasn't able to reproduce
@@ -74,20 +77,23 @@ public class TestCorruptedFiles extends TikaTest {
     /**
      *  per 10,000 bytes, how many should be corrupted
      */
-    private static final int PER_10000_CORRUPTED = 100;
+    private static final int PER_10000_CORRUPTED = 1000;
 
     /**
      * per 10,000 iterations, how many should be truncated instead of corrupted
      */
-    private static final double PER_10000_TRUNCATED = 10;
+    private static final double PER_10000_TRUNCATED = 1000;
 
     /**
      * per 10,000 iterations, how many should have random bytes concatenated
      */
-    private static final double PER_10000_AUGMENTED = 10;
-
+    private static final double PER_10000_AUGMENTED = 1000;
 
     /**
+     * per 10,000 iterations, how many should have segments swapped
+     */
+    private static final double PER_10000_SWAPPED = 1000;
+    /**
      * how much time to allow for the parse
      */
     private static final int MAX_ALLOWABLE_TIME_MILLIS = 20000;
@@ -99,13 +105,14 @@ public class TestCorruptedFiles extends TikaTest {
 
     private static boolean HANDLE_EMBEDDED_DOCS_INDIVIDUALLY = true;
 
-    private static Random randomSeedGenerator = new Random();
+    private static Random RANDOM_SEED_GENERATOR = new Random();
     private static Path CORRUPTED;
     private static boolean FAILED;
 
     @BeforeClass
     public static void setUp() throws IOException {
         CORRUPTED = Files.createTempFile("tika-corrupted-",".tmp");
+        System.out.println("corrupted file: " + CORRUPTED);
     }
 
     @AfterClass
@@ -120,13 +127,18 @@ public class TestCorruptedFiles extends TikaTest {
     @Test
     public void testExtension() throws Throwable {
         Random r = new Random();
-        for (File f : getResourceAsFile("/test-documents").listFiles()) {
-            if (! f.isDirectory()) {
-                System.out.println("testing: "+f);
-                long seed = r.nextLong();
+        long seed = r.nextLong();
+        Random rand = new Random(seed);
+        List<File> files = Arrays.asList(getResourceAsFile("/test-documents").listFiles());
+        Collections.shuffle(files);
+        for (File f : files) {
+            if (! f.isDirectory()) {//&& f.getName().endsWith(".one")) {
                 for (int i = 0; i < NUM_ITERATIONS; i++) {
                     try {
-                        testSingleFile(getBytes(f.getName()), new Random(seed));
+                        FAILED = true;
+                        System.out.println("testing: "+f + " : "+i);
+                        testSingleFile(getBytes(f.getName()), rand);
+                        FAILED = false;
                     } catch (Throwable t) {
                         t.printStackTrace();
                         fail("error "+f.getName()+ " seed: "+seed + " : "+CORRUPTED);
@@ -142,7 +154,7 @@ public class TestCorruptedFiles extends TikaTest {
         long seed = 7850890625037579255l;
         try {
             for (int i = 0; i < NUM_ITERATIONS; i++) {
-                seed = randomSeedGenerator.nextLong();
+                seed = RANDOM_SEED_GENERATOR.nextLong();
                 FAILED = true;
                 testSingleFile(getBytes(fileName), new Random(seed));
                 FAILED = false;
@@ -160,7 +172,7 @@ public class TestCorruptedFiles extends TikaTest {
         long seed = 0;
         for (int i = 0; i < NUM_ITERATIONS; i++) {
             for (Map.Entry<String, byte[]> e : embedded.entrySet()) {
-                seed = randomSeedGenerator.nextLong();
+                seed = RANDOM_SEED_GENERATOR.nextLong();
                 try{
                     FAILED = true;
                     testSingleFile(e.getValue(), new Random(seed));
@@ -187,10 +199,54 @@ public class TestCorruptedFiles extends TikaTest {
         }
     }
 
+    @Test
+    public void testAllTruncated() throws Throwable {
+        Random r = new Random();
+        for (String fileName : new String[] {
+                "testOneNote1.one", "testOneNote2.one", "testOneNote3.one",
+                "testOneNote2007OrEarlier1.one", "testOneNote2007OrEarlier2.one"
+        }) {
+            byte[] bytes = getBytes(fileName);
+            int len = bytes.length;
+            for (int i = len; i > -1; i -= r.nextInt(1000)) {
+                if (i < 0) {
+                    break;
+                }
+                byte[] truncated = new byte[i];
+                System.arraycopy(bytes, 0, truncated, 0, i);
+                System.out.println("testing length: "+truncated.length + ": "+fileName);
+                try {
+                    FAILED = true;
+                    testSingleFile(truncated);
+                    FAILED = false;
+                } finally {
+
+                }
+
+            }
+        }
+    }
+
+    public void testSingleFile(byte[] bytes) throws Throwable {
+        ExecutorService executorService = Executors.newSingleThreadExecutor();
+        ExecutorCompletionService executorCompletionService = new ExecutorCompletionService(executorService);
+        executorCompletionService.submit(new ParseTask(bytes));
+        Future<Boolean> future = executorCompletionService.poll(MAX_ALLOWABLE_TIME_MILLIS, TimeUnit.MILLISECONDS);
+        if (future == null) {
+            throw new TimeoutException("timed out: "+CORRUPTED);
+        }
+
+        //if the exception isn't caught, it will be thrown here
+        Boolean result = future.get(1, TimeUnit.SECONDS);
+        if (result == null) {
+            throw new TimeoutException("timed out: " + CORRUPTED);
+        }
+    }
+
     public void testSingleFile(byte[] bytes, Random random) throws Throwable {
         ExecutorService executorService = Executors.newSingleThreadExecutor();
         ExecutorCompletionService executorCompletionService = new ExecutorCompletionService(executorService);
-        executorCompletionService.submit(new ParseTask(bytes, random));
+        executorCompletionService.submit(new CorruptAndParseTask(bytes, random));
         Future<Boolean> future = executorCompletionService.poll(MAX_ALLOWABLE_TIME_MILLIS, TimeUnit.MILLISECONDS);
         if (future == null) {
             throw new TimeoutException("timed out: "+CORRUPTED);
@@ -204,30 +260,36 @@ public class TestCorruptedFiles extends TikaTest {
     }
 
     private class ParseTask implements Callable<Boolean> {
-        private byte[] corrupted = null;
-        ParseTask(byte[] original, Random random) throws IOException {
-            corrupted = corrupt(new ByteArrayInputStream(original), random);
-            Files.delete(CORRUPTED);
-            OutputStream os = Files.newOutputStream(CORRUPTED, StandardOpenOption.CREATE);
-            IOUtils.copy(new ByteArrayInputStream(corrupted), os);
-            os.flush();
-            os.close();
+        protected byte[] bytes;
+        ParseTask(byte[] bytes) {
+            this.bytes = bytes;
         }
 
-
         @Override
         public Boolean call() throws Exception {
             try {
-                AUTO_DETECT_PARSER.parse(new ByteArrayInputStream(corrupted), new DefaultHandler(),
+                AUTO_DETECT_PARSER.parse(new ByteArrayInputStream(bytes), new DefaultHandler(),
                         new Metadata(), new ParseContext());
-            } catch (SAXException|TikaException|IOException e) {
-
+                //TODO: what else do we want to ignore?
+            } catch (SAXException|TikaException|IOException|AssertionError|IllegalArgumentException e) {
             }
             return true;
         }
+
+    }
+
+    private class CorruptAndParseTask extends ParseTask {
+        CorruptAndParseTask(byte[] original, Random random) throws IOException {
+            super(corrupt(new ByteArrayInputStream(original), random));
+            Files.delete(CORRUPTED);
+            OutputStream os = Files.newOutputStream(CORRUPTED, StandardOpenOption.CREATE);
+            IOUtils.copy(new ByteArrayInputStream(bytes), os);
+            os.flush();
+            os.close();
+        }
     }
 
-    private byte[] corrupt(InputStream is, Random random) throws IOException {
+    private static byte[] corrupt(InputStream is, Random random) throws IOException {
         ByteArrayOutputStream bos = new ByteArrayOutputStream();
         IOUtils.copy(is, bos);
         byte[] bytes = bos.toByteArray();
@@ -248,15 +310,30 @@ public class TestCorruptedFiles extends TikaTest {
                 corrupted[i] = (byte) random.nextInt(255);
             }
             return corrupted;
-        } else {
+        } else if (random.nextInt(10000) <= PER_10000_SWAPPED) {
+            int srcStart = random.nextInt(bytes.length);
+            int destStart = random.nextInt(bytes.length);
+            int len = random.nextInt((int)((double)bytes.length/(double)4));
+            len = Math.max(srcStart, destStart) + len >= bytes.length ?
+                    bytes.length-Math.max(srcStart, destStart)-1 : len;
+
             byte[] corrupted = new byte[bytes.length];
-            for (int i = 0; i < bytes.length; i++) {
-                byte c = (random.nextInt(10000) < PER_10000_CORRUPTED) ?
-                        (byte) random.nextInt(255) : bytes[i];
-                corrupted[i] = c;
+            //first copy everything
+            System.arraycopy(bytes, 0, corrupted, 0, bytes.length);
+            System.arraycopy(bytes, srcStart, corrupted, destStart, len);
+            if (Arrays.equals(bytes, corrupted)) {
+                System.err.println("tried to swap, but bytes are identical");
             }
             return corrupted;
         }
+        byte[] corrupted = new byte[bytes.length];
+        for (int i = 0; i < bytes.length; i++) {
+            byte c = (random.nextInt(10000) < PER_10000_CORRUPTED) ?
+                    (byte) random.nextInt(255) : bytes[i];
+            corrupted[i] = c;
+        }
+        return corrupted;
+
     }