You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2015/03/03 03:18:54 UTC

svn commit: r1663513 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/sax/XHTMLContentHandler.java test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java

Author: tpalsulich
Date: Tue Mar  3 02:18:53 2015
New Revision: 1663513

URL: http://svn.apache.org/r1663513
Log:
TIKA-995. Properly output XHTML body attributes, contributed by Markus Jelsma.

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=1663513&r1=1663512&r2=1663513&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Tue Mar  3 02:18:53 2015
@@ -60,7 +60,7 @@ public class XHTMLContentHandler extends
      * skip them if they get sent to startElement/endElement by mistake.
      */
     private static final Set<String> AUTO =
-        unmodifiableSet("html", "head", "body", "frameset");
+        unmodifiableSet("html", "head", "frameset");
 
     /**
      * The elements that get prepended with the {@link #TAB} character.

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java?rev=1663513&r1=1663512&r2=1663513&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java Tue Mar  3 02:18:53 2015
@@ -17,10 +17,12 @@
 package org.apache.tika.sax;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.tika.config.TikaConfigTest;
 import org.apache.tika.metadata.Metadata;
 
 import org.junit.Before;
@@ -28,6 +30,7 @@ import org.junit.Test;
 
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 /**
  * Unit tests for the {@link XHTMLContentHandler} class.
@@ -121,6 +124,24 @@ public class XHTMLContentHandlerTest {
         assertEquals("two", words[1]);
     }
 
+    @Test
+    public void testAttributesOnBody() throws Exception {
+        ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
+        XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(toHTMLContentHandler, new Metadata());
+        AttributesImpl attributes = new AttributesImpl();
+
+        attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", "");
+        attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", "http://schema.org/Event");
+
+        xhtmlContentHandler.startDocument();
+        xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "body", "body", attributes);
+        xhtmlContentHandler.endElement("body");
+        xhtmlContentHandler.endDocument();
+
+        System.err.println("Content: " + toHTMLContentHandler.toString());
+        assertTrue(toHTMLContentHandler.toString().contains("itemscope"));
+    }
+
     /**
      * Return array of non-zerolength words. Splitting on whitespace will get us
      * empty words for emptylines.