You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2010/11/03 20:06:43 UTC
svn commit: r1030617 - in /tika/trunk/tika-parsers: pom.xml
src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Author: kkrugler
Date: Wed Nov 3 19:06:43 2010
New Revision: 1030617
URL: http://svn.apache.org/viewvc?rev=1030617&view=rev
Log:
TIKA-462: Get Boilerpipe into Maven.
This switches to Boilerpipe 1.1.0, which also provides the source and javadoc jars that Sonatype requires.
Updated the code and test now that the Boilerpipe bug of not flushing the title has been fixed.
Modified:
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1030617&r1=1030616&r2=1030617&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Wed Nov 3 19:06:43 2010
@@ -41,9 +41,9 @@
<repositories>
<!-- for boilerpipe, to be removed as per TIKA-462 when Boilerpipe is sync'd to Maven central from Sonatype -->
<repository>
- <id>maven2-repository.dev.java.net</id>
- <name>Java.net Repository for Maven</name>
- <url>http://download.java.net/maven/2/</url>
+ <id>maven2-repository.googlecode.com</id>
+ <name>GoogleCode Repository for Maven</name>
+ <url>http://boilerpipe.googlecode.com/svn/repo/</url>
<layout>default</layout>
</repository>
</repositories>
@@ -135,7 +135,7 @@
<dependency>
<groupId>de.l3s.boilerpipe</groupId>
<artifactId>boilerpipe</artifactId>
- <version>1.0.4</version>
+ <version>1.1.0</version>
</dependency>
<dependency>
<groupId>rome</groupId>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1030617&r1=1030616&r2=1030617&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Wed Nov 3 19:06:43 2010
@@ -110,6 +110,7 @@ public class BoilerpipeContentHandler ex
if (td.getTitle() != null) {
char[] titleChars = td.getTitle().toCharArray();
delegate.characters(titleChars, 0, titleChars.length);
+ delegate.ignorableWhitespace(NL, 0, NL.length);
}
delegate.endElement(XHTMLContentHandler.XHTML, "title", "title");
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1030617&r1=1030616&r2=1030617&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Wed Nov 3 19:06:43 2010
@@ -400,7 +400,7 @@ public class HtmlParserTest extends Test
new BoilerpipeContentHandler(handler), metadata, new ParseContext());
String content = handler.toString();
- assertTrue(content.startsWith("This is the real meat"));
+ assertTrue(content.startsWith("Title\nThis is the real meat"));
assertTrue(content.endsWith("This is the end of the text.\n"));
assertFalse(content.contains("boilerplate"));
assertFalse(content.contains("footer"));