You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/08/04 14:35:20 UTC

[tika] 01/02: TIKA-2242 -- fix buggy xhtml element ordering/tags in ODT parser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6dfd3f6a3e043d020f3ca1f6880dd9a28f3c5a7d
Author: tallison <ta...@apache.org>
AuthorDate: Wed Aug 4 10:21:55 2021 -0400

    TIKA-2242 -- fix buggy xhtml element ordering/tags in ODT parser
---
 CHANGES.txt                                        |   2 +
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |  40 ++++++++----
 .../org/apache/tika/parser/odf/ODFParserTest.java  |  39 ++++++++++-
 .../parser/odf/OpenDocumentContentParserTest.java  |  72 +++++++++++++++++++++
 .../test-documents/testODTBodyListOpenClose.xml.gz | Bin 0 -> 6490 bytes
 5 files changed, 140 insertions(+), 13 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 6898dee..041e27f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.0.1 - ???
 
+   * Fix markup ordering errors in xhtml output for ODT files (TIKA-2242).
+
    * Fix serialization of embedded docs in OpenSearch emitter
      and fix embedded documents not being indexed in some use
      cases in the Solr emitter (TIKA-3490).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 88be59b..792bcaa 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -61,6 +61,8 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
     protected static final char[] TAB = new char[]{'\t'};
     private static final String BINARY_DATA = "binary-data";
     private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+    private static final NullListStyle NULL_LIST_STYLE = new NullListStyle();
+
     /**
      * Mappings between ODF tag names and XHTML tag names
      * (including attributes). All other tag names/attributes are ignored
@@ -219,18 +221,20 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
     }
 
     private void startList(String name) throws SAXException {
-        String elementName = "ul";
-        if (name != null) {
-            ListStyle style = listStyleMap.get(name);
-            elementName = style != null ? style.getTag() : "ul";
-            listStyleStack.push(style);
+        ListStyle style = null;
+        if (name == null || ! listStyleMap.containsKey(name)) {
+            style = NULL_LIST_STYLE;
+        } else {
+            style = listStyleMap.get(name);
         }
+        String elementName = style.getTag();
+        listStyleStack.push(style);
         handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
     }
 
     private void endList() throws SAXException {
         String elementName = "ul";
-        if (!listStyleStack.isEmpty()) {
+        if (! listStyleStack.isEmpty()) {
             ListStyle style = listStyleStack.pop();
             elementName = style != null ? style.getTag() : "ul";
         }
@@ -426,13 +430,13 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
                 handler.characters(SPACE, 0, 1);
             } else if ("annotation".equals(localName)) {
                 closeStyleTags();
-                handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
+                handler.startElement(XHTML, "p", "p", ANNOTATION_ATTRIBUTES);
             } else if ("note".equals(localName)) {
                 closeStyleTags();
-                handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
+                handler.startElement(XHTML, "p", "p", NOTE_ATTRIBUTES);
             } else if ("notes".equals(localName)) {
                 closeStyleTags();
-                handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
+                handler.startElement(XHTML, "p", "p", NOTES_ATTRIBUTES);
             } else {
                 super.startElement(namespaceURI, localName, qName, attrs);
             }
@@ -471,6 +475,7 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
             // to incoming handler
             if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                 final String el = headingStack.pop();
+                closeStyleTags();
                 handler.endElement(namespaceURI, el, el);
             } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
                 endList();
@@ -482,7 +487,10 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
             } else if ("annotation".equals(localName) || "note".equals(localName) ||
                     "notes".equals(localName)) {
                 closeStyleTags();
-                handler.endElement(namespaceURI, localName, localName);
+                handler.endElement(namespaceURI, "p", "p");
+            } else if ("a".equals(localName)) {
+                closeStyleTags();
+                super.endElement(namespaceURI, localName, qName);
             } else {
                 super.endElement(namespaceURI, localName, qName);
             }
@@ -554,11 +562,21 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
 
     private static class ListStyle implements Style {
         public boolean ordered;
-
         public String getTag() {
             return ordered ? "ol" : "ul";
         }
     }
 
+    private static class NullListStyle extends ListStyle {
+        NullListStyle() {
+            ordered = false;
+        }
+
+        @Override
+        public String getTag() {
+            return "ul";
+        }
+    }
+
 
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index d433465..27c3848 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -19,9 +19,12 @@ package org.apache.tika.parser.odf;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Arrays;
@@ -33,6 +36,7 @@ import java.util.concurrent.Future;
 
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.exception.EncryptedDocumentException;
@@ -47,6 +51,7 @@ import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
 
 public class ODFParserTest extends TikaTest {
     /**
@@ -348,8 +353,8 @@ public class ODFParserTest extends TikaTest {
         //not allowed in html: <p> <annotation> <p> this is an annotation </p> </annotation> </p>
         String xml = getXML("testODTStyles3.odt").xml;
         assertContains(
-                "<p><b>WOUTERS Rolf</b><span class=\"annotation\"> Beschermde persoon is " +
-                        "overleden </annotation>",
+                "<p><b>WOUTERS Rolf</b><p class=\"annotation\"> Beschermde persoon is " +
+                        "overleden </p>",
                 xml);
     }
 
@@ -455,4 +460,34 @@ public class ODFParserTest extends TikaTest {
             executorService.shutdownNow();
         }
     }
+
+    @Test
+    public void testODTXHTMLIsParseable() throws Exception {
+        //for all OpenDocument files, make sure that the
+        //output from the parse is parseable xhtml
+        int filesTested = 0;
+        for (Path p : getAllTestFiles()) {
+            String fileName = p.getFileName().toString();
+            if (fileName.endsWith(".odt") || fileName.endsWith("odp") || fileName.endsWith("odf") ||
+                    fileName.endsWith(".ods")) {
+
+                XMLResult xmlResult = null;
+                try (InputStream is = TikaInputStream.get(p)) {
+                    xmlResult = getXML(is, AUTO_DETECT_PARSER, new Metadata());
+                } catch (Exception e) {
+                    continue;
+                }
+                try {
+                    //just make sure this doesn't throw any exceptions
+                    XMLReaderUtils.parseSAX(new ByteArrayInputStream(xmlResult.xml.getBytes(StandardCharsets.UTF_8)),
+                            new DefaultHandler(), new ParseContext());
+                    filesTested++;
+                } catch (Exception e) {
+                    fail(p.getFileName().toString(), e);
+                }
+            }
+        }
+        assertTrue(filesTested > 10);
+    }
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/OpenDocumentContentParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/OpenDocumentContentParserTest.java
new file mode 100644
index 0000000..586b00a
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/OpenDocumentContentParserTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.junit.jupiter.api.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.XMLReaderUtils;
+
+public class OpenDocumentContentParserTest extends TikaTest {
+
+    @Test
+    public void testEmbeddedLists() throws Exception {
+        Parser p = new OpenDocumentContentParser();
+        Metadata metadata = new Metadata();
+        XMLResult r = null;
+        try (InputStream is = new GzipCompressorInputStream(getResourceAsStream(
+                "/test-documents/testODTBodyListOpenClose.xml.gz"))) {
+            r = getXML(is, p, metadata);
+        }
+        String xml = r.xml;
+        //extract all the tags
+        Matcher m = Pattern.compile("(<[^>]+>)").matcher(xml);
+        StringBuilder sb = new StringBuilder();
+        while (m.find()) {
+            sb.append(m.group(1)).append(" ");
+        }
+        String tags = sb.toString();
+        assertContains("</p> <ol> <li> <p> </p> <ul> <li> <p> </p> </li> <li> <p> </p> </li> " +
+                        "</ul> </li> </ol>",
+                tags);
+        assertContains("Exercice", xml);
+
+        m = Pattern.compile("<p class=\"annotation\">[^<]+<\\/p>").matcher(xml);
+        assertTrue(m.find());
+
+        m = Pattern.compile("<p class=\"annotation\">[^<]+<\\/annotation>").matcher(xml);
+        assertFalse(m.find());
+
+        //just make sure this doesn't throw any exceptions
+        XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                new DefaultHandler(), new ParseContext());
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTBodyListOpenClose.xml.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTBodyListOpenClose.xml.gz
new file mode 100644
index 0000000..6dfcd1b
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTBodyListOpenClose.xml.gz differ