You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/01 16:18:48 UTC
[tika] branch 2.x updated: TIKA-2349 -- try to match embedded docs by digest in tika-eval "Compare"
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/2.x by this push:
new c67e622 TIKA-2349 -- try to match embedded docs by digest in tika-eval "Compare"
new 7c42589 Merge remote-tracking branch 'origin/2.x' into 2.x
c67e622 is described below
commit c67e6223650eb8f157865293679e6f3af920b826
Author: tballison <ta...@mitre.org>
AuthorDate: Mon May 1 12:18:20 2017 -0400
TIKA-2349 -- try to match embedded docs by digest in tika-eval "Compare"
---
.../java/org/apache/tika/eval/ExtractComparer.java | 49 +++++++++++++++++-----
.../org/apache/tika/eval/SimpleComparerTest.java | 17 ++++++++
.../extractsA/file14_diffAttachOrder.json | 18 ++++++++
.../extractsB/file14_diffAttachOrder.json | 17 ++++++++
4 files changed, 90 insertions(+), 11 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index 7b006df..366cf38 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -43,6 +43,7 @@ import org.apache.tika.eval.tokens.ContrastStatistics;
import org.apache.tika.eval.tokens.TokenContraster;
import org.apache.tika.eval.tokens.TokenIntPair;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.RecursiveParserWrapper;
public class ExtractComparer extends AbstractProfiler {
@@ -97,6 +98,9 @@ public class ExtractComparer extends AbstractProfiler {
"Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
}
+ private static final String DIGEST_KEY_PREFIX = TikaCoreProperties.TIKA_META_PREFIX+
+ "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
private final static String FIELD_A = "fa";
private final static String FIELD_B = "fb";
@@ -402,27 +406,50 @@ public class ExtractComparer extends AbstractProfiler {
if (metadataListB == null || metadataListB.size() == 0) {
return -1;
}
+ //assume first is always the container file
if (i == 0) {
return 0;
}
- if (metadataListA.size() == metadataListB.size()) {
- //assume no rearrangments if lists are the same size
- return i;
+
+ //first try to find matching digests
+ //this does not elegantly handle multiple matching digests
+ int match = findMatchingDigests(metadataListA.get(i), metadataListB);
+ if (match > -1) {
+ return match;
}
+ //assume same embedded resource path. Not always true!
Metadata thisMetadata = metadataListA.get(i);
String embeddedPath = thisMetadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
- if (embeddedPath == null) {
- return -1;
+ if (embeddedPath != null) {
+ for (int j = 0; j < metadataListB.size(); j++) {
+ String thatEmbeddedPath = metadataListB.get(j).get(
+ RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+ if (embeddedPath.equals(thatEmbeddedPath)) {
+ return j;
+ }
+ }
}
- if (i < metadataListB.size()) {
+
+ //last resort, if lists are same size, guess the same index
+ if (metadataListA.size() == metadataListB.size()) {
+ //assume no rearrangments if lists are the same size
+ return i;
}
+ return -1;
+ }
- for (int j = 0; j < metadataListB.size(); j++) {
- String thatEmbeddedPath = metadataListB.get(j).get(
- RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
- if (embeddedPath.equals(thatEmbeddedPath)) {
- return j;
+ private int findMatchingDigests(Metadata metadata, List<Metadata> metadataListB) {
+ Set<String> digestKeys = new HashSet<>();
+ for (String n : metadata.names()) {
+ if (n.startsWith(DIGEST_KEY_PREFIX)) {
+ String digestA = metadata.get(n);
+ for (int i = 0; i < metadataListB.size(); i++) {
+ String digestB = metadataListB.get(i).get(n);
+ if (digestA != null && digestA.equals(digestB)) {
+ return i;
+ }
+ }
}
}
return -1;
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 257a607..ea516c6 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -264,6 +264,23 @@ public class SimpleComparerTest extends TikaTest {
assertEquals(expected, counts);
}
+ @Test
+ public void testDifferentlyOrderedAttachments() throws Exception {
+ EvalFilePaths fpsA = new EvalFilePaths(
+ Paths.get("file14_diffAttachOrder.json"),
+ getResourceAsFile("/test-dirs/extractsA/file14_diffAttachOrder.json").toPath()
+ );
+ EvalFilePaths fpsB = new EvalFilePaths(
+ Paths.get("file6_accessEx.pdf.json"),
+ getResourceAsFile("/test-dirs/extractsB/file14_diffAttachOrder.json").toPath()
+ );
+ comparer.compareFiles(fpsA, fpsB);
+ List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
+ assertEquals(3, tableInfos.size());
+ for (int i = 0; i < tableInfos.size(); i++) {
+ assertEquals("1.0", tableInfos.get(i).get(Cols.OVERLAP));
+ }
+ }
@Test
@Ignore
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json b/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
new file mode 100644
index 0000000..fb16381
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
@@ -0,0 +1,18 @@
+[{
+ "Content-Type":"text/plain",
+ "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+},
+ {
+ "Content-Type":"text/plain",
+ "X-TIKA:embedded_resource_path":"/0",
+ "X-TIKA:content":"a b c d",
+ "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8354"
+ },
+ {
+ "Content-Type":"text/plain",
+ "X-TIKA:embedded_resource_path":"/1",
+ "X-TIKA:content":"e f g",
+ "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8353"
+ }
+
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json b/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
new file mode 100644
index 0000000..edd0a69
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
@@ -0,0 +1,17 @@
+[{
+ "Content-Type":"text/plain",
+ "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+},
+ {
+ "Content-Type":"text/plain",
+ "X-TIKA:embedded_resource_path":"inner2.txt",
+ "X-TIKA:content":"e f g",
+ "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8353"
+ },
+ {
+ "Content-Type":"text/plain",
+ "X-TIKA:embedded_resource_path":"inner1.txt",
+ "X-TIKA:content":"a b c d",
+ "X-TIKA:digest:MD5":"471d98383e9f40444e5ecf821f2c8354"
+ }
+]
\ No newline at end of file
--
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].