You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/01/07 19:55:19 UTC
[tika] 02/02: TIKA-2810 -- handle bad tags more robustly
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 517adc9d7056e90f48d132e27bfe3f44b8453338
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jan 7 14:55:05 2019 -0500
TIKA-2810 -- handle bad tags more robustly
---
CHANGES.txt | 3 +++
.../src/main/java/org/apache/tika/eval/AbstractProfiler.java | 11 +++++++++--
.../src/main/java/org/apache/tika/eval/util/ContentTags.java | 6 +++++-
.../test/java/org/apache/tika/eval/SimpleComparerTest.java | 10 ++++++++++
4 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 938e0ea..df2559b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,9 @@ Release 2.0.0 - ???
Release 1.21 - ????
+
+ * Handle bad tags in tika-eval more robustly (TIKA-2810).
+
* Add reports for tags in tika-eval (TIKA-2809).
* Extract text from SDT element within textboxes in .docx files (TIKA-2807).
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index e15dc13..3a633f7 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -847,9 +847,16 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
try {
return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
} catch (TikaException|IOException|SAXException e) {
- LOG.warn("Problem parsing xhtml in {}; backing off to treat string as text",
+ LOG.warn("Problem parsing xhtml in {}; backing off to html parser",
evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
-
+ try {
+ ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+ contentTags.setParseException(true);
+ return contentTags;
+ } catch (IOException|SAXException e2) {
+ LOG.warn("Problem parsing html in {}; backing off to treat string as text",
+ evalFilePaths.getExtractFile().toAbsolutePath().toString(), e2);
+ }
return new ContentTags(s, true);
}
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
index 115976f..3f8c9a5 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
@@ -24,7 +24,7 @@ public class ContentTags {
public static final ContentTags EMPTY_CONTENT_TAGS = new ContentTags();
final Map<String, Integer> tags;
final String content;
- final boolean parseException;
+ boolean parseException;
private ContentTags() {
this("", Collections.EMPTY_MAP, false);
@@ -60,4 +60,8 @@ public class ContentTags {
public boolean getParseException() {
return parseException;
}
+
+ public void setParseException(boolean parseException) {
+ this.parseException = parseException;
+ }
}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index d54d41c..96286be 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -346,6 +346,16 @@ public class SimpleComparerTest extends TikaTest {
assertEquals(1, tableInfosA.size());
Map<Cols, String> tableInfoA = tableInfosA.get(0);
assertEquals("true", tableInfoA.get(Cols.TAGS_PARSE_EXCEPTION));
+
+ //confirm that backoff to html parser worked
+ List<Map<Cols, String>> contentsA = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
+ assertEquals(1, contentsA.size());
+ Map<Cols, String> contentsARow1 = contentsA.get(0);
+ String topN = contentsARow1.get(Cols.TOP_N_TOKENS);
+ assertNotContained("content:", topN);
+ assertNotContained(" p: ", topN);
+ assertContains("apache: 12", topN);
+
}
@Test