You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2010/10/26 18:59:52 UTC

svn commit: r1027652 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/sax/XHTMLContentHandler.java test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java

Author: kkrugler
Date: Tue Oct 26 16:59:52 2010
New Revision: 1027652

URL: http://svn.apache.org/viewvc?rev=1027652&view=rev
Log:
TIKA-394: Missing spaces on html parsing

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=1027652&r1=1027651&r2=1027652&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Tue Oct 26 16:59:52 2010
@@ -73,7 +73,7 @@ public class XHTMLContentHandler extends
     public static final Set<String> ENDLINE = unmodifiableSet(
             "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
             "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
-            "noscript", "li", "dt", "dd", "noframes", "br", "tr");
+            "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option");
 
     private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
 

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java?rev=1027652&r1=1027651&r2=1027652&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java Tue Oct 26 16:59:52 2010
@@ -16,6 +16,9 @@
  */
 package org.apache.tika.sax;
 
+import java.util.ArrayList;
+import java.util.List;
+
 import org.apache.tika.metadata.Metadata;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -73,5 +76,60 @@ public class XHTMLContentHandlerTest ext
         assertEquals("a", words[4]);
         assertEquals("b", words[5]);
     }
+    
+    /**
+     * Test that the content of option elements is properly separated in the
+     * text output, ignoring empty tokens caused by surrounding whitespace.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-394">TIKA-394</a>
+     */
+    public void testWhitespaceWithOptions() throws Exception {
+        xhtml.startDocument();
+        xhtml.startElement("form");
+        xhtml.startElement("select");
+        xhtml.element("option", "opt1");
+        xhtml.element("option", "opt2");
+        xhtml.endElement("select");
+        xhtml.endElement("form");
+        xhtml.endDocument();
+        // A raw split() would yield an empty first token on leading whitespace.
+        String[] words = getRealWords(output.toString());
+        assertEquals(2, words.length);
+        assertEquals("opt1", words[0]);
+        assertEquals("opt2", words[1]);
+    }
+    
+    public void testWhitespaceWithMenus() throws Exception {
+        xhtml.startDocument();
+        xhtml.startElement("menu");
+        xhtml.element("li", "one");
+        xhtml.element("li", "two");
+        xhtml.endElement("menu");
+        xhtml.endDocument();
+        // Each list item must surface as its own whitespace-delimited word.
+        final String[] tokens = getRealWords(output.toString());
+        assertEquals(2, tokens.length);
+        assertEquals("one", tokens[0]);
+        assertEquals("two", tokens[1]);
+    }
+
+    /**
+     * Splits the given text on runs of whitespace and keeps only the non-empty
+     * tokens; a plain split yields an empty first token for leading whitespace.
+     * 
+     * @param string text mixing whitespace (e.g. newlines) and real words
+     * @return the non-empty words, in their original order
+     */
+    private static String[] getRealWords(String string) {
+        String[] tokens = string.split("\\s+");
+        List<String> kept = new ArrayList<String>(tokens.length);
+        for (int i = 0; i < tokens.length; i++) {
+            if (tokens[i].length() != 0) {
+                kept.add(tokens[i]);
+            }
+        }
+        
+        return kept.toArray(new String[kept.size()]);
+    }
 
 }