You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2015/03/03 03:18:54 UTC
svn commit: r1663513 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/sax/XHTMLContentHandler.java
test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
Author: tpalsulich
Date: Tue Mar 3 02:18:53 2015
New Revision: 1663513
URL: http://svn.apache.org/r1663513
Log:
TIKA-995. Properly output XHTML body attributes, contributed by Markus Jelsma.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=1663513&r1=1663512&r2=1663513&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Tue Mar 3 02:18:53 2015
@@ -60,7 +60,7 @@ public class XHTMLContentHandler extends
* skip them if they get sent to startElement/endElement by mistake.
*/
private static final Set<String> AUTO =
- unmodifiableSet("html", "head", "body", "frameset");
+ unmodifiableSet("html", "head", "frameset");
/**
* The elements that get prepended with the {@link #TAB} character.
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java?rev=1663513&r1=1663512&r2=1663513&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java Tue Mar 3 02:18:53 2015
@@ -17,10 +17,12 @@
package org.apache.tika.sax;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.List;
+import org.apache.tika.config.TikaConfigTest;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
@@ -28,6 +30,7 @@ import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
/**
* Unit tests for the {@link XHTMLContentHandler} class.
@@ -121,6 +124,24 @@ public class XHTMLContentHandlerTest {
assertEquals("two", words[1]);
}
+ /**
+ * Checks that attributes supplied on an explicit body start tag are
+ * present in the serialized output (TIKA-995).
+ *
+ * @throws Exception if any of the SAX events sent to the handler fails
+ */
+ @Test
+ public void testAttributesOnBody() throws Exception {
+ ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
+ XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(toHTMLContentHandler, new Metadata());
+ AttributesImpl attributes = new AttributesImpl();
+
+ // One attribute with an empty value and one with a real value, so that
+ // both the attribute names and an attribute value can be checked.
+ attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", "");
+ attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", "http://schema.org/Event");
+
+ xhtmlContentHandler.startDocument();
+ xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "body", "body", attributes);
+ xhtmlContentHandler.endElement("body");
+ xhtmlContentHandler.endDocument();
+
+ // Serialize once; the output is included in the failure messages instead
+ // of being printed to stderr on every run.
+ String output = toHTMLContentHandler.toString();
+ assertTrue("Missing itemscope attribute in: " + output, output.contains("itemscope"));
+ assertTrue("Missing itemtype attribute in: " + output, output.contains("itemtype"));
+ assertTrue("Missing itemtype value in: " + output, output.contains("http://schema.org/Event"));
+ }
+
/**
* Return array of non-zerolength words. Splitting on whitespace will get us
* empty words for emptylines.