You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2013/08/28 19:17:50 UTC

svn commit: r1518288 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java test/java/org/apache/tika/parser/html/HtmlParserTest.java test/resources/test-documents/boilerplate-whitespace.html

Author: kkrugler
Date: Wed Aug 28 17:17:50 2013
New Revision: 1518288

URL: http://svn.apache.org/r1518288
Log:
TIKA-961: No whitespace added if BoilerpipeContentHandler.setIncludeMarkup(true)

Thanks to Markus Jelsma for the patch.

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1518288&r1=1518287&r2=1518288&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Wed Aug 28 17:17:50 2013
@@ -283,6 +283,7 @@ public class BoilerpipeContentHandler ex
             // Now have bits set for all valid character runs. Replay our recorded elements,
             // but only emit character runs flagged as valid.
             int curCharsIndex = headerCharOffset;
+
             for (RecordedElement element : elements) {
                 switch (element.getElementType()) {
                     case START:
@@ -297,6 +298,14 @@ public class BoilerpipeContentHandler ex
 
                             if (validCharacterRuns.get(curCharsIndex)) {
                                 delegate.characters(chars, 0, chars.length);
+
+                                // https://issues.apache.org/jira/browse/TIKA-961
+                                if (!Character.isWhitespace(chars[chars.length - 1])) {
+                                    // Only add whitespace for certain elements
+                                    if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+                                        delegate.ignorableWhitespace(NL, 0, NL.length);
+                                    }
+                                }
                             }
                         }
                         break;

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1518288&r1=1518287&r2=1518288&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Wed Aug 28 17:17:50 2013
@@ -103,15 +103,15 @@ public class HtmlParserTest extends Test
                 HtmlParserTest.class.getResourceAsStream(path), metadata);
 
         assertTrue("Did not contain expected text:"
-                + "Title : Tilte with UTF-8 chars öäå", content
-                .contains("Title : Tilte with UTF-8 chars öäå"));
+                + "Title : Tilte with UTF-8 chars öäå", content
+                .contains("Title : Tilte with UTF-8 chars öäå"));
 
         assertTrue("Did not contain expected text:"
                 + "Content with UTF-8 chars", content
                 .contains("Content with UTF-8 chars"));
 
-        assertTrue("Did not contain expected text:" + "åäö", content
-                .contains("åäö"));
+        assertTrue("Did not contain expected text:" + "åäö", content
+                .contains("åäö"));
     }
 
     public void testXhtmlParsing() throws Exception {
@@ -783,6 +783,35 @@ public class HtmlParserTest extends Test
     }
 
     /**
+     * Test case for TIKA-961: text of adjacent list items must not be glued together.
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
+     */
+    public void testBoilerplateWhitespace() throws Exception {
+        String path = "/test-documents/boilerplate-whitespace.html";
+        
+        Metadata metadata = new Metadata();
+        BodyContentHandler handler = new BodyContentHandler();
+        
+        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
+        bpHandler.setIncludeMarkup(true);
+        
+        new HtmlParser().parse(
+                HtmlParserTest.class.getResourceAsStream(path),
+                bpHandler,  metadata, new ParseContext());
+        
+        String content = handler.toString();
+
+        // Adjacent <li> elements in the fixture must not be run together as "item_aitem_b".
+        assertFalse(content.contains("item_aitem_b"));
+
+        // Instead, the two list items must be separated by a newline.
+        assertTrue(content.contains("item_a\nitem_b"));
+
+        // CJK text split by inline <strong> (有什么需要我帮你的, "can I help you") must gain no whitespace.
+        assertTrue(content.contains("有什么需要我帮你的"));
+    }
+
+    /**
      * Test case for TIKA-983:  HTML parser should add Open Graph meta tag data to Metadata returned by parser
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html?rev=1518288&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html Wed Aug 28 17:17:50 2013
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+        <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+        <title>Title</title>
+</head>
+<body>
+
+<p>The following unordered list must contain whitespace between both items:
+<ul>
+ <li>item_a</li><li>item_b</li>
+</ul>
+Whitespace should not be added around elements like anchors, em or string etc. This
+breaks CJK: 有什<strong>么需</strong>要我帮你的?
+</p>
+
+<p>
+Here's another paragraph of text.
+This is the end of the text.
+</p>
+
+<p><a href="Footer.html">footer</a></p>
+
+</body>
+</html>

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html
------------------------------------------------------------------------------
    svn:mime-type = text/html