You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/06 19:41:58 UTC

[tika] 03/03: TIKA-3062 -- improve alignment of attachments based on digests

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit dc23e88e431cfa00616b41791b7b5e0df873dd4e
Author: tallison <ta...@apache.org>
AuthorDate: Fri Mar 6 14:41:33 2020 -0500

    TIKA-3062 -- improve alignment of attachments based on digests
---
 bin/jhove-apps-1.22.0.jar                          | Bin 0 -> 321216 bytes
 .../org/apache/tika/eval/AbstractProfiler.java     |   8 +-
 .../java/org/apache/tika/eval/ExtractComparer.java | 121 ++++++++++++++-------
 .../eval/textstats/ContentLengthCalculator.java    |   2 +-
 .../org/apache/tika/eval/SimpleComparerTest.java   |   3 +-
 .../extractsA/file14_diffAttachOrder.json          |  27 ++---
 .../extractsB/file14_diffAttachOrder.json          |  26 +++--
 7 files changed, 116 insertions(+), 71 deletions(-)

diff --git a/bin/jhove-apps-1.22.0.jar b/bin/jhove-apps-1.22.0.jar
new file mode 100644
index 0000000..a90b104
Binary files /dev/null and b/bin/jhove-apps-1.22.0.jar differ
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index e23361b..b54b242 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -390,9 +390,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         Map<Cols, String> data = new HashMap<>();
         data.put(Cols.ID, fileId);
         if (textStats.containsKey(ContentLengthCalculator.class)) {
-            data.put(Cols.CONTENT_LENGTH, Integer.toString((Integer) textStats.get(ContentLengthCalculator.class)));
-        } else {
-            data.put(Cols.CONTENT_LENGTH, "0");
+            int length = (int)textStats.get(ContentLengthCalculator.class);
+            if (length == 0) {
+                return;
+            }
+            data.put(Cols.CONTENT_LENGTH, Integer.toString(length));
         }
         langid(textStats, data);
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index 39342ef..8eca1c9 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -277,6 +277,8 @@ public class ExtractComparer extends AbstractProfiler {
         }
         List<Integer> numAttachmentsA = countAttachments(metadataListA);
         List<Integer> numAttachmentsB = countAttachments(metadataListB);
+
+        String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB);
         Map<Class, Object> tokenStatsA = null;
         Map<Class, Object> tokenStatsB = null;
         //now get that metadata
@@ -294,7 +296,8 @@ public class ExtractComparer extends AbstractProfiler {
 
                 writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
                 writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
-                int matchIndex = getMatch(i, metadataListA, metadataListB);
+                int matchIndex = getMatch(i, sharedDigestKey,
+                        handledB, metadataListA, metadataListB);
 
                 if (matchIndex > -1 && ! handledB.contains(matchIndex)) {
                     metadataB = metadataListB.get(matchIndex);
@@ -310,28 +313,33 @@ public class ExtractComparer extends AbstractProfiler {
                 //write content
                 try {
                     tokenStatsA = calcTextStats(contentTagsA);
-                    tokenStatsB = calcTextStats(contentTagsB);
                     writeContentData(fileId, tokenStatsA, CONTENTS_TABLE_A);
-                    writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
+                    tokenStatsB = calcTextStats(contentTagsB);
+                    if (metadataB != null) {
+                        writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
+                    }
                 } catch (IOException e) {
                     throw new RuntimeException(e);
                 }
-
-                TokenCounts tokenCountsA = (TokenCounts)tokenStatsA.get(BasicTokenCountStatsCalculator.class);
-                TokenCounts tokenCountsB = (TokenCounts)tokenStatsB.get(BasicTokenCountStatsCalculator.class);
-                //now run comparisons
-                if (tokenCountsA.getTotalTokens() > 0
-                        && tokenCountsB.getTotalTokens() > 0) {
-                    Map<Cols, String> data = new HashMap<>();
-                    data.put(Cols.ID, fileId);
-
-                    ContrastStatistics contrastStatistics =
-                            tokenContraster.calculateContrastStatistics(
-                            tokenCountsA,
-                            tokenCountsB);
-
-                    writeContrasts(data, contrastStatistics);
-                    writer.writeRow(CONTENT_COMPARISONS, data);
+                if (metadataB != null) {
+                    TokenCounts tokenCountsA = (TokenCounts) tokenStatsA.get(BasicTokenCountStatsCalculator.class);
+                    TokenCounts tokenCountsB = (TokenCounts) tokenStatsB.get(BasicTokenCountStatsCalculator.class);
+                    //arbitrary decision...only run the comparisons if there are > 10 tokens total
+                    //We may want to bump that value a bit higher?
+                    //now run comparisons
+                    if (tokenCountsA.getTotalTokens()
+                            + tokenCountsB.getTotalTokens() > 10) {
+                        Map<Cols, String> data = new HashMap<>();
+                        data.put(Cols.ID, fileId);
+
+                        ContrastStatistics contrastStatistics =
+                                tokenContraster.calculateContrastStatistics(
+                                        tokenCountsA,
+                                        tokenCountsB);
+
+                        writeContrasts(data, contrastStatistics);
+                        writer.writeRow(CONTENT_COMPARISONS, data);
+                    }
                 }
             }
         }
@@ -362,6 +370,34 @@ public class ExtractComparer extends AbstractProfiler {
         }
     }
 
+    /**
+     * Checks only the first item in each list. Returns the first
+     * digest key shared by both, if it exists, null otherwise.
+     * @param metadataListA metadata list for A; may be null or empty
+     * @param metadataListB metadata list for B; may be null or empty
+     * @return shared key that starts with DIGEST_KEY_PREFIX, or null if none
+     */
+    private String findSharedDigestKey(List<Metadata> metadataListA, List<Metadata> metadataListB) {
+        if (metadataListB == null || metadataListB.size() == 0) {
+            return null;
+        }
+        Set<String> digestA = new HashSet<>();
+        if (metadataListA != null && metadataListA.size() > 0) { // get(0) throws on an empty list
+            for (String n : metadataListA.get(0).names()) {
+                if (n.startsWith(DIGEST_KEY_PREFIX)) {
+                    digestA.add(n);
+                }
+            }
+        }
+        Metadata bMain = metadataListB.get(0);
+        for (String n : bMain.names()) {
+            if (digestA.contains(n)) {
+                return n;
+            }
+        }
+        return null;
+    }
+
     private void writeEmbeddedFilePathData(int i, String fileId, Metadata mA, Metadata mB) {
         //container file, don't write anything
         if (i == 0) {
@@ -411,12 +447,12 @@ public class ExtractComparer extends AbstractProfiler {
      * Try to find the matching metadata based on the AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH
      * If you can't find it, return -1;
      *
-     * @param i                index for match in metadataListA
+     * @param aIndex                index for match in metadataListA
      * @param metadataListA
      * @param metadataListB
      * @return
      */
-    private int getMatch(int i,
+    private int getMatch(int aIndex, String sharedDigestKey, Set<Integer> handledB,
                          List<Metadata> metadataListA,
                          List<Metadata> metadataListB) {
         //TODO: could make this more robust
@@ -424,19 +460,19 @@ public class ExtractComparer extends AbstractProfiler {
             return -1;
         }
         //assume first is always the container file
-        if (i == 0) {
+        if (aIndex == 0) {
             return 0;
         }
 
-        //first try to find matching digests
-        //this does not elegantly handle multiple matching digests
-        int match = findMatchingDigests(metadataListA.get(i), metadataListB);
-        if (match > -1) {
-            return match;
+        if (sharedDigestKey != null) {
+            //first try to find matching digests
+            //this does not elegantly handle multiple matching digests
+            return findMatchingDigests(sharedDigestKey, handledB,
+                    metadataListA.get(aIndex), metadataListB);
         }
 
         //assume same embedded resource path.  Not always true!
-        Metadata thisMetadata = metadataListA.get(i);
+        Metadata thisMetadata = metadataListA.get(aIndex);
         String embeddedPath = thisMetadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
         if (embeddedPath != null) {
             for (int j = 0; j < metadataListB.size(); j++) {
@@ -451,22 +487,27 @@ public class ExtractComparer extends AbstractProfiler {
         //last resort, if lists are same size, guess the same index
         if (metadataListA.size() == metadataListB.size()) {
             //assume no rearrangments if lists are the same size
-            return i;
+            return aIndex;
         }
         return -1;
     }
 
-    private int findMatchingDigests(Metadata metadata, List<Metadata> metadataListB) {
-        Set<String> digestKeys = new HashSet<>();
-        for (String n : metadata.names()) {
-            if (n.startsWith(DIGEST_KEY_PREFIX)) {
-                String digestA = metadata.get(n);
-                for (int i = 0; i < metadataListB.size(); i++) {
-                    String digestB = metadataListB.get(i).get(n);
-                    if (digestA != null && digestA.equals(digestB)) {
-                        return i;
-                    }
-                }
+    private int findMatchingDigests(String sharedDigestKey,
+                                    Set<Integer> handledB,
+                                    Metadata metadata, List<Metadata> metadataListB) {
+        String digestA = metadata.get(sharedDigestKey);
+        if (digestA == null) {
+            return -1;
+        }
+
+        for (int i = 0; i < metadataListB.size(); i++)  {
+            if (handledB.contains(i)) {
+                continue;
+            }
+            Metadata mB = metadataListB.get(i);
+            String digestB = mB.get(sharedDigestKey);
+            if (digestA.equalsIgnoreCase(digestB)) {
+                return i;
             }
         }
         return -1;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/ContentLengthCalculator.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/ContentLengthCalculator.java
index 3eb0f23..27c335a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/ContentLengthCalculator.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/ContentLengthCalculator.java
@@ -19,6 +19,6 @@ package org.apache.tika.eval.textstats;
 public class ContentLengthCalculator implements StringStatsCalculator<Integer> {
     @Override
     public Integer calculate(String txt) {
-        return txt.length();
+        return txt.trim().length();
     }
 }
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 78b8b42..bae5792 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -281,7 +281,7 @@ public class SimpleComparerTest extends TikaTest {
         List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENT_COMPARISONS);
         assertEquals(3, tableInfos.size());
         for (int i = 0; i < tableInfos.size(); i++) {
-            assertEquals("1.0", tableInfos.get(i).get(Cols.OVERLAP));
+            assertEquals("problem with "+i, "1.0", tableInfos.get(i).get(Cols.OVERLAP));
         }
     }
 
@@ -439,6 +439,5 @@ public class SimpleComparerTest extends TikaTest {
                 ExtractComparer.CONTENT_COMPARISONS}) {
             debugPrintTable(t);
         }
-
     }
 }
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json b/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
index fb16381..7801ab8 100644
--- a/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
@@ -1,18 +1,19 @@
-[{
-  "Content-Type":"text/plain",
-  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
-},
+[
   {
-    "Content-Type":"text/plain",
-    "X-TIKA:embedded_resource_path":"/0",
-    "X-TIKA:content":"a b c d",
-    "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8354"
+    "Content-Type": "text/plain",
+    "X-TIKA:content": "the quick brown fox fox fox jumped over the lazy lazy dog",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8351"
   },
   {
-    "Content-Type":"text/plain",
-    "X-TIKA:embedded_resource_path":"/1",
-    "X-TIKA:content":"e f g",
-    "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8353"
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "/0",
+    "X-TIKA:content": "a b c d e f g h i j k l m n",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354"
+  },
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "/1",
+    "X-TIKA:content": "o p q r s t u v w x y z",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353"
   }
-
 ]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json b/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
index edd0a69..e28bace 100644
--- a/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
+++ b/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
@@ -1,17 +1,19 @@
-[{
-  "Content-Type":"text/plain",
-  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
-},
+[
   {
-    "Content-Type":"text/plain",
-    "X-TIKA:embedded_resource_path":"inner2.txt",
-    "X-TIKA:content":"e f g",
-    "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8353"
+    "Content-Type": "text/plain",
+    "X-TIKA:content": "the quick brown fox fox fox jumped over the lazy lazy dog",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8351"
   },
   {
-    "Content-Type":"text/plain",
-    "X-TIKA:embedded_resource_path":"inner1.txt",
-    "X-TIKA:content":"a b c d",
-    "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8354"
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "inner2.txt",
+    "X-TIKA:content": "o p q r s t u v w x y z",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353"
+  },
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "inner1.txt",
+    "X-TIKA:content": "a b c d e f g h i j k l m n",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354"
   }
 ]
\ No newline at end of file