You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2013/08/28 19:17:50 UTC
svn commit: r1518288 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
test/java/org/apache/tika/parser/html/HtmlParserTest.java
test/resources/test-documents/boilerplate-whitespace.html
Author: kkrugler
Date: Wed Aug 28 17:17:50 2013
New Revision: 1518288
URL: http://svn.apache.org/r1518288
Log:
TIKA-961: No whitespace added if BoilerpipeContentHandler.setIncludeMarkup(true)
Thanks to Markus Jelsma for the patch.
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1518288&r1=1518287&r2=1518288&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Wed Aug 28 17:17:50 2013
@@ -283,6 +283,7 @@ public class BoilerpipeContentHandler ex
// Now have bits set for all valid character runs. Replay our recorded elements,
// but only emit character runs flagged as valid.
int curCharsIndex = headerCharOffset;
+
for (RecordedElement element : elements) {
switch (element.getElementType()) {
case START:
@@ -297,6 +298,14 @@ public class BoilerpipeContentHandler ex
if (validCharacterRuns.get(curCharsIndex)) {
delegate.characters(chars, 0, chars.length);
+
+ // https://issues.apache.org/jira/browse/TIKA-961
+ if (!Character.isWhitespace(chars[chars.length - 1])) {
+ // Only add whitespace for certain elements
+ if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+ delegate.ignorableWhitespace(NL, 0, NL.length);
+ }
+ }
}
}
break;
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1518288&r1=1518287&r2=1518288&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Wed Aug 28 17:17:50 2013
@@ -103,15 +103,15 @@ public class HtmlParserTest extends Test
HtmlParserTest.class.getResourceAsStream(path), metadata);
assertTrue("Did not contain expected text:"
- + "Title : Tilte with UTF-8 chars öäå", content
- .contains("Title : Tilte with UTF-8 chars öäå"));
+ + "Title : Tilte with UTF-8 chars âââ§ââ¢", content
+ .contains("Title : Tilte with UTF-8 chars âââ§ââ¢"));
assertTrue("Did not contain expected text:"
+ "Content with UTF-8 chars", content
.contains("Content with UTF-8 chars"));
- assertTrue("Did not contain expected text:" + "åäö", content
- .contains("åäö"));
+ assertTrue("Did not contain expected text:" + "ââ¢â§ââ", content
+ .contains("ââ¢â§ââ"));
}
public void testXhtmlParsing() throws Exception {
@@ -783,6 +783,35 @@ public class HtmlParserTest extends Test
}
/**
+ * Test case for TIKA-961
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
+ */
+ public void testBoilerplateWhitespace() throws Exception {
+ String path = "/test-documents/boilerplate-whitespace.html";
+
+ Metadata metadata = new Metadata();
+ BodyContentHandler handler = new BodyContentHandler();
+
+ BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
+ bpHandler.setIncludeMarkup(true);
+
+ new HtmlParser().parse(
+ HtmlParserTest.class.getResourceAsStream(path),
+ bpHandler, metadata, new ParseContext());
+
+ String content = handler.toString();
+
+ // Should not contain item_aitem_b
+ assertFalse(content.contains("item_aitem_b"));
+
+ // Should contain the two list items with a newline in between.
+ assertTrue(content.contains("item_a\nitem_b"));
+
+ // Should contain 有什么需要我帮你的 (can i help you) without whitespace
+ assertTrue(content.contains("有什么需要我帮你的"));
+ }
+
+ /**
* Test case for TIKA-983: HTML parser should add Open Graph meta tag data to Metadata returned by parser
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html?rev=1518288&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html Wed Aug 28 17:17:50 2013
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+ <title>Title</title>
+</head>
+<body>
+
+<p>The following unordered list must contain whitespace between both items:
+<ul>
+ <li>item_a</li><li>item_b</li>
+</ul>
+Whitespace should not be added around elements like anchors, em or string etc. This
+breaks CJK: 有什<strong>么需</strong>要我帮你的?
+</p>
+
+<p>
+Here's another paragraph of text.
+This is the end of the text.
+</p>
+
+<p><a href="Footer.html">footer</a></p>
+
+</body>
+</html>
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate-whitespace.html
------------------------------------------------------------------------------
svn:mime-type = text/html