You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/08/04 14:35:20 UTC
[tika] 01/02: TIKA-2242 -- fix buggy xhtml element ordering/tags in
ODT parser
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6dfd3f6a3e043d020f3ca1f6880dd9a28f3c5a7d
Author: tallison <ta...@apache.org>
AuthorDate: Wed Aug 4 10:21:55 2021 -0400
TIKA-2242 -- fix buggy xhtml element ordering/tags in ODT parser
---
CHANGES.txt | 2 +
.../tika/parser/odf/OpenDocumentBodyHandler.java | 40 ++++++++----
.../org/apache/tika/parser/odf/ODFParserTest.java | 39 ++++++++++-
.../parser/odf/OpenDocumentContentParserTest.java | 72 +++++++++++++++++++++
.../test-documents/testODTBodyListOpenClose.xml.gz | Bin 0 -> 6490 bytes
5 files changed, 140 insertions(+), 13 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 6898dee..041e27f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 2.0.1 - ???
+ * Fix markup ordering errors in xhtml output for ODT files (TIKA-2242).
+
* Fix serialization of embedded docs in OpenSearch emitter
and fix embedded documents not being indexed in some use
cases in the Solr emitter (TIKA-3490).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 88be59b..792bcaa 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -61,6 +61,8 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
protected static final char[] TAB = new char[]{'\t'};
private static final String BINARY_DATA = "binary-data";
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+ private static final NullListStyle NULL_LIST_STYLE = new NullListStyle();
+
/**
* Mappings between ODF tag names and XHTML tag names
* (including attributes). All other tag names/attributes are ignored
@@ -219,18 +221,20 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
}
private void startList(String name) throws SAXException {
- String elementName = "ul";
- if (name != null) {
- ListStyle style = listStyleMap.get(name);
- elementName = style != null ? style.getTag() : "ul";
- listStyleStack.push(style);
+ ListStyle style = null;
+ if (name == null || ! listStyleMap.containsKey(name)) {
+ style = NULL_LIST_STYLE;
+ } else {
+ style = listStyleMap.get(name);
}
+ String elementName = style.getTag();
+ listStyleStack.push(style);
handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
}
private void endList() throws SAXException {
String elementName = "ul";
- if (!listStyleStack.isEmpty()) {
+ if (! listStyleStack.isEmpty()) {
ListStyle style = listStyleStack.pop();
elementName = style != null ? style.getTag() : "ul";
}
@@ -426,13 +430,13 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
handler.characters(SPACE, 0, 1);
} else if ("annotation".equals(localName)) {
closeStyleTags();
- handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
+ handler.startElement(XHTML, "p", "p", ANNOTATION_ATTRIBUTES);
} else if ("note".equals(localName)) {
closeStyleTags();
- handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
+ handler.startElement(XHTML, "p", "p", NOTE_ATTRIBUTES);
} else if ("notes".equals(localName)) {
closeStyleTags();
- handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
+ handler.startElement(XHTML, "p", "p", NOTES_ATTRIBUTES);
} else {
super.startElement(namespaceURI, localName, qName, attrs);
}
@@ -471,6 +475,7 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
// to incoming handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
final String el = headingStack.pop();
+ closeStyleTags();
handler.endElement(namespaceURI, el, el);
} else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
endList();
@@ -482,7 +487,10 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
} else if ("annotation".equals(localName) || "note".equals(localName) ||
"notes".equals(localName)) {
closeStyleTags();
- handler.endElement(namespaceURI, localName, localName);
+ handler.endElement(namespaceURI, "p", "p");
+ } else if ("a".equals(localName)) {
+ closeStyleTags();
+ super.endElement(namespaceURI, localName, qName);
} else {
super.endElement(namespaceURI, localName, qName);
}
@@ -554,11 +562,21 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
private static class ListStyle implements Style {
public boolean ordered;
-
public String getTag() {
return ordered ? "ol" : "ul";
}
}
+ private static class NullListStyle extends ListStyle {
+ NullListStyle() {
+ ordered = false;
+ }
+
+ @Override
+ public String getTag() {
+ return "ul";
+ }
+ }
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index d433465..27c3848 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -19,9 +19,12 @@ package org.apache.tika.parser.odf;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
@@ -33,6 +36,7 @@ import java.util.concurrent.Future;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.TikaTest;
import org.apache.tika.exception.EncryptedDocumentException;
@@ -47,6 +51,7 @@ import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
public class ODFParserTest extends TikaTest {
/**
@@ -348,8 +353,8 @@ public class ODFParserTest extends TikaTest {
//not allowed in html: <p> <annotation> <p> this is an annotation </p> </annotation> </p>
String xml = getXML("testODTStyles3.odt").xml;
assertContains(
- "<p><b>WOUTERS Rolf</b><span class=\"annotation\"> Beschermde persoon is " +
- "overleden </annotation>",
+ "<p><b>WOUTERS Rolf</b><p class=\"annotation\"> Beschermde persoon is " +
+ "overleden </p>",
xml);
}
@@ -455,4 +460,34 @@ public class ODFParserTest extends TikaTest {
executorService.shutdownNow();
}
}
+
+ @Test
+ public void testODTXHTMLIsParseable() throws Exception {
+ //for all OpenDocument files, make sure that the
+ //output from the parse is parseable xhtml
+ int filesTested = 0;
+ for (Path p : getAllTestFiles()) {
+ String fileName = p.getFileName().toString();
+ if (fileName.endsWith(".odt") || fileName.endsWith("odp") || fileName.endsWith("odf") ||
+ fileName.endsWith(".ods")) {
+
+ XMLResult xmlResult = null;
+ try (InputStream is = TikaInputStream.get(p)) {
+ xmlResult = getXML(is, AUTO_DETECT_PARSER, new Metadata());
+ } catch (Exception e) {
+ continue;
+ }
+ try {
+ //just make sure this doesn't throw any exceptions
+ XMLReaderUtils.parseSAX(new ByteArrayInputStream(xmlResult.xml.getBytes(StandardCharsets.UTF_8)),
+ new DefaultHandler(), new ParseContext());
+ filesTested++;
+ } catch (Exception e) {
+ fail(p.getFileName().toString(), e);
+ }
+ }
+ }
+ assertTrue(filesTested > 10);
+ }
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/OpenDocumentContentParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/OpenDocumentContentParserTest.java
new file mode 100644
index 0000000..586b00a
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/OpenDocumentContentParserTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.junit.jupiter.api.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.XMLReaderUtils;
+
+public class OpenDocumentContentParserTest extends TikaTest {
+
+ @Test
+ public void testEmbeddedLists() throws Exception {
+ Parser p = new OpenDocumentContentParser();
+ Metadata metadata = new Metadata();
+ XMLResult r = null;
+ try (InputStream is = new GzipCompressorInputStream(getResourceAsStream(
+ "/test-documents/testODTBodyListOpenClose.xml.gz"))) {
+ r = getXML(is, p, metadata);
+ }
+ String xml = r.xml;
+ //extract all the tags
+ Matcher m = Pattern.compile("(<[^>]+>)").matcher(xml);
+ StringBuilder sb = new StringBuilder();
+ while (m.find()) {
+ sb.append(m.group(1)).append(" ");
+ }
+ String tags = sb.toString();
+ assertContains("</p> <ol> <li> <p> </p> <ul> <li> <p> </p> </li> <li> <p> </p> </li> " +
+ "</ul> </li> </ol>",
+ tags);
+ assertContains("Exercice", xml);
+
+ m = Pattern.compile("<p class=\"annotation\">[^<]+<\\/p>").matcher(xml);
+ assertTrue(m.find());
+
+ m = Pattern.compile("<p class=\"annotation\">[^<]+<\\/annotation>").matcher(xml);
+ assertFalse(m.find());
+
+ //just make sure this doesn't throw any exceptions
+ XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new DefaultHandler(), new ParseContext());
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTBodyListOpenClose.xml.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTBodyListOpenClose.xml.gz
new file mode 100644
index 0000000..6dfcd1b
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTBodyListOpenClose.xml.gz differ