You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/13 23:06:10 UTC
svn commit: r890127 - in /lucene/tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/html/HtmlHandler.java
test/java/org/apache/tika/parser/html/HtmlParserTest.java
Author: jukka
Date: Sun Dec 13 22:06:10 2009
New Revision: 890127
URL: http://svn.apache.org/viewvc?rev=890127&view=rev
Log:
TIKA-343: some parsers produce glued words
Use the newline rules from XHTMLContentHandler in HtmlHandler in cases where the HtmlMapper does not map the incoming HTML element to something different.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=890127&r1=890126&r2=890127&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Sun Dec 13 22:06:10 2009
@@ -133,10 +133,12 @@
xhtml.endElement(safe);
} else if ("A".equals(name)) {
xhtml.endElement("a");
- } else if ("BR".equals(name)) {
- // TIKA-343: Map <br> tags to newlines, unless the HtmlMapper
- // above has already determined to map them to something else
- xhtml.characters("\n");
+ } else if (XHTMLContentHandler.ENDLINE.contains(
+ name.toLowerCase())) {
+ // TIKA-343: Replace closing block tags (and <br/>) with a
+ // newline unless the HtmlMapper above has already mapped
+ // them to something else
+ xhtml.newline();
}
}
Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=890127&r1=890126&r2=890127&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sun Dec 13 22:06:10 2009
@@ -274,20 +274,22 @@
}
/**
- * Test case for HTML content like "foo&lt;br&gt;bar" that should result
- * in two whitespace-separated tokens "foo" and "bar" instead of a single
- * token "foobar".
+ * Test case for HTML content like
+ * "&lt;div&gt;foo&lt;br&gt;bar&lt;/div&gt;baz" that should result
+ * in three whitespace-separated tokens "foo", "bar" and "baz" instead
+ * of a single token "foobarbaz".
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
*/
public void testLineBreak() throws Exception {
- String test = "<html><body><p>foo<br>bar</p></body></html>";
+ String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
String text = new Tika().parseToString(
new ByteArrayInputStream(test.getBytes("US-ASCII")));
String[] parts = text.trim().split("\\s+");
- assertEquals(2, parts.length);
+ assertEquals(3, parts.length);
assertEquals("foo", parts[0]);
assertEquals("bar", parts[1]);
+ assertEquals("baz", parts[2]);
}
}