You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2010/10/26 18:59:52 UTC
svn commit: r1027652 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/sax/XHTMLContentHandler.java
test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
Author: kkrugler
Date: Tue Oct 26 16:59:52 2010
New Revision: 1027652
URL: http://svn.apache.org/viewvc?rev=1027652&view=rev
Log:
TIKA-394: Missing spaces on html parsing
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=1027652&r1=1027651&r2=1027652&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Tue Oct 26 16:59:52 2010
@@ -73,7 +73,7 @@ public class XHTMLContentHandler extends
public static final Set<String> ENDLINE = unmodifiableSet(
"p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
"pre", "hr", "blockquote", "address", "fieldset", "table", "form",
- "noscript", "li", "dt", "dd", "noframes", "br", "tr");
+ "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option");
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java?rev=1027652&r1=1027651&r2=1027652&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java Tue Oct 26 16:59:52 2010
@@ -16,6 +16,9 @@
*/
package org.apache.tika.sax;
+import java.util.ArrayList;
+import java.util.List;
+
import org.apache.tika.metadata.Metadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -73,5 +76,60 @@ public class XHTMLContentHandlerTest ext
assertEquals("a", words[4]);
assertEquals("b", words[5]);
}
+
+ /**
+ * Test that the contents of option elements are properly separated in text
+ * output.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-394">TIKA-394</a>
+ */
+ public void testWhitespaceWithOptions() throws Exception {
+ xhtml.startDocument();
+ xhtml.startElement("form");
+ xhtml.startElement("select");
+ xhtml.element("option", "opt1");
+ xhtml.element("option", "opt2");
+ xhtml.endElement("select");
+ xhtml.endElement("form");
+ xhtml.endDocument();
+
+ String[] words = output.toString().split("\\s+");
+ assertEquals(2, words.length);
+ assertEquals("opt1", words[0]);
+ assertEquals("opt2", words[1]);
+ }
+
+ public void testWhitespaceWithMenus() throws Exception {
+ xhtml.startDocument();
+ xhtml.startElement("menu");
+ xhtml.element("li", "one");
+ xhtml.element("li", "two");
+ xhtml.endElement("menu");
+ xhtml.endDocument();
+
+ String[] words = getRealWords(output.toString());
+ assertEquals(2, words.length);
+ assertEquals("one", words[0]);
+ assertEquals("two", words[1]);
+ }
+
+ /**
+ * Return an array of non-zero-length words. Splitting on whitespace can give
+ * us empty words for empty lines.
+ *
+ * @param string some mix of newlines and real words
+ * @return array of real words.
+ */
+ private static String[] getRealWords(String string) {
+ String[] possibleWords = string.split("\\s+");
+ List<String> words = new ArrayList<String>(possibleWords.length);
+ for (String word : possibleWords) {
+ if (word.length() > 0) {
+ words.add(word);
+ }
+ }
+
+ return words.toArray(new String[words.size()]);
+ }
}