You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/01 16:18:48 UTC

[tika] branch 2.x updated: TIKA-2349 -- try to match embedded docs by digest in tika-eval "Compare"

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/2.x by this push:
       new  c67e622   TIKA-2349 -- try to match embedded docs by digest in tika-eval "Compare"
       new  7c42589   Merge remote-tracking branch 'origin/2.x' into 2.x
c67e622 is described below

commit c67e6223650eb8f157865293679e6f3af920b826
Author: tballison <ta...@mitre.org>
AuthorDate: Mon May 1 12:18:20 2017 -0400

    TIKA-2349 -- try to match embedded docs by digest in tika-eval "Compare"
---
 .../java/org/apache/tika/eval/ExtractComparer.java | 49 +++++++++++++++++-----
 .../org/apache/tika/eval/SimpleComparerTest.java   | 17 ++++++++
 .../extractsA/file14_diffAttachOrder.json          | 18 ++++++++
 .../extractsB/file14_diffAttachOrder.json          | 17 ++++++++
 4 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index 7b006df..366cf38 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -43,6 +43,7 @@ import org.apache.tika.eval.tokens.ContrastStatistics;
 import org.apache.tika.eval.tokens.TokenContraster;
 import org.apache.tika.eval.tokens.TokenIntPair;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.RecursiveParserWrapper;
 
 public class ExtractComparer extends AbstractProfiler {
@@ -97,6 +98,9 @@ public class ExtractComparer extends AbstractProfiler {
                 "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
     }
 
+    private static final String DIGEST_KEY_PREFIX = TikaCoreProperties.TIKA_META_PREFIX+
+            "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
     private final static String FIELD_A = "fa";
     private final static String FIELD_B = "fb";
 
@@ -402,27 +406,50 @@ public class ExtractComparer extends AbstractProfiler {
         if (metadataListB == null || metadataListB.size() == 0) {
             return -1;
         }
+        //assume first is always the container file
         if (i == 0) {
             return 0;
         }
-        if (metadataListA.size() == metadataListB.size()) {
-            //assume no rearrangments if lists are the same size
-            return i;
+
+        //first try to find matching digests
+        //this does not elegantly handle multiple matching digests
+        int match = findMatchingDigests(metadataListA.get(i), metadataListB);
+        if (match > -1) {
+            return match;
         }
 
+        //assume same embedded resource path.  Not always true!
         Metadata thisMetadata = metadataListA.get(i);
         String embeddedPath = thisMetadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
-        if (embeddedPath == null) {
-            return -1;
+        if (embeddedPath != null) {
+            for (int j = 0; j < metadataListB.size(); j++) {
+                String thatEmbeddedPath = metadataListB.get(j).get(
+                        RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+                if (embeddedPath.equals(thatEmbeddedPath)) {
+                    return j;
+                }
+            }
         }
-        if (i < metadataListB.size()) {
+
+        //last resort, if lists are same size, guess the same index
+        if (metadataListA.size() == metadataListB.size()) {
+            //assume no rearrangements if lists are the same size
+            return i;
         }
+        return -1;
+    }
 
-        for (int j = 0; j < metadataListB.size(); j++) {
-            String thatEmbeddedPath = metadataListB.get(j).get(
-                    RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
-            if (embeddedPath.equals(thatEmbeddedPath)) {
-                return j;
+    private int findMatchingDigests(Metadata metadata, List<Metadata> metadataListB) {
+        Set<String> digestKeys = new HashSet<>();
+        for (String n : metadata.names()) {
+            if (n.startsWith(DIGEST_KEY_PREFIX)) {
+                String digestA = metadata.get(n);
+                for (int i = 0; i < metadataListB.size(); i++) {
+                    String digestB = metadataListB.get(i).get(n);
+                    if (digestA != null && digestA.equals(digestB)) {
+                        return i;
+                    }
+                }
             }
         }
         return -1;
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 257a607..ea516c6 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -264,6 +264,23 @@ public class SimpleComparerTest extends TikaTest {
         assertEquals(expected, counts);
     }
 
+    @Test
+    public void testDifferentlyOrderedAttachments() throws Exception {
+        EvalFilePaths fpsA = new EvalFilePaths(
+                Paths.get("file14_diffAttachOrder.json"),
+                getResourceAsFile("/test-dirs/extractsA/file14_diffAttachOrder.json").toPath()
+        );
+        EvalFilePaths fpsB = new EvalFilePaths(
+                Paths.get("file14_diffAttachOrder.json"), //same relative name as side A
+                getResourceAsFile("/test-dirs/extractsB/file14_diffAttachOrder.json").toPath()
+        );
+        comparer.compareFiles(fpsA, fpsB);
+        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
+        assertEquals(3, tableInfos.size()); //each fixture holds a container plus two attachments
+        for (int i = 0; i < tableInfos.size(); i++) {
+            assertEquals("1.0", tableInfos.get(i).get(Cols.OVERLAP)); //B lists the attachments in swapped order under different paths
+        }
+    }
 
     @Test
     @Ignore
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json b/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
new file mode 100644
index 0000000..fb16381
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
@@ -0,0 +1,18 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+},
+  {
+    "Content-Type":"text/plain",
+    "X-TIKA:embedded_resource_path":"/0",
+    "X-TIKA:content":"a b c d",
+    "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8354"
+  },
+  {
+    "Content-Type":"text/plain",
+    "X-TIKA:embedded_resource_path":"/1",
+    "X-TIKA:content":"e f g",
+    "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8353"
+  }
+
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json b/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
new file mode 100644
index 0000000..edd0a69
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
@@ -0,0 +1,17 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+},
+  {
+    "Content-Type":"text/plain",
+    "X-TIKA:embedded_resource_path":"inner2.txt",
+    "X-TIKA:content":"e f g",
+    "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8353"
+  },
+  {
+    "Content-Type":"text/plain",
+    "X-TIKA:embedded_resource_path":"inner1.txt",
+    "X-TIKA:content":"a b c d",
+    "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8354"
+  }
+]
\ No newline at end of file

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].