You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/13 23:06:10 UTC

svn commit: r890127 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/html/HtmlHandler.java test/java/org/apache/tika/parser/html/HtmlParserTest.java

Author: jukka
Date: Sun Dec 13 22:06:10 2009
New Revision: 890127

URL: http://svn.apache.org/viewvc?rev=890127&view=rev
Log:
TIKA-343: some parsers produce glued words

Use the newline rules from XHTMLContentHandler in HtmlHandler in cases where the HtmlMapper does not map the incoming HTML element to something different.

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=890127&r1=890126&r2=890127&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Sun Dec 13 22:06:10 2009
@@ -133,10 +133,12 @@
                 xhtml.endElement(safe);
             } else if ("A".equals(name)) {
                 xhtml.endElement("a");
-            } else if ("BR".equals(name)) {
-                // TIKA-343: Map <br> tags to newlines, unless the HtmlMapper
-                // above has already determined to map them to something else
-                xhtml.characters("\n");
+            } else if (XHTMLContentHandler.ENDLINE.contains(
+                    name.toLowerCase())) {
+                // TIKA-343: Replace closing block tags (and <br/>) with a
+                // newline unless the HtmlMapper above has already mapped
+                // them to something else
+                xhtml.newline();
             }
         }
 

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=890127&r1=890126&r2=890127&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sun Dec 13 22:06:10 2009
@@ -274,20 +274,22 @@
     }
 
     /**
-     * Test case for HTML content like "foo&gt;br&lt;bar" that should result
-     * in two whitespace-separated tokens "foo" and "bar" instead of a single
-     * token "foobar".
+     * Test case for HTML content like
+     * "&lt;div&gt;foo&lt;br&gt;bar&lt;/div&gt;baz" that should result
+     * in three whitespace-separated tokens "foo", "bar" and "baz" instead
+     * of a single token "foobarbaz".
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
      */
     public void testLineBreak() throws Exception {
-        String test = "<html><body><p>foo<br>bar</p></body></html>";
+        String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
         String text = new Tika().parseToString(
                 new ByteArrayInputStream(test.getBytes("US-ASCII")));
         String[] parts = text.trim().split("\\s+");
-        assertEquals(2, parts.length);
+        assertEquals(3, parts.length);
         assertEquals("foo", parts[0]);
         assertEquals("bar", parts[1]);
+        assertEquals("baz", parts[2]);
     }
 
 }