You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2010/11/03 20:06:43 UTC

svn commit: r1030617 - in /tika/trunk/tika-parsers: pom.xml src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Author: kkrugler
Date: Wed Nov  3 19:06:43 2010
New Revision: 1030617

URL: http://svn.apache.org/viewvc?rev=1030617&view=rev
Log:
TIKA-462: Get Boilerpipe into Maven.

Here we switch to Boilerpipe 1.1.0, which also provides the source and javadoc jars that Sonatype requires.

Updated the code and test now that the Boilerpipe bug of not flushing the title has been fixed.

Modified:
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1030617&r1=1030616&r2=1030617&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Wed Nov  3 19:06:43 2010
@@ -41,9 +41,9 @@
   <repositories>
     <!-- for boilerpipe, to be removed as per TIKA-462 when Boilerpipe is sync'd to Maven central from Sonatype -->
     <repository>
-      <id>maven2-repository.dev.java.net</id>
-      <name>Java.net Repository for Maven</name>
-      <url>http://download.java.net/maven/2/</url>
+      <id>maven2-repository.googlecode.com</id>
+      <name>GoogleCode Repository for Maven</name>
+      <url>http://boilerpipe.googlecode.com/svn/repo/</url>
       <layout>default</layout>
     </repository>
   </repositories>
@@ -135,7 +135,7 @@
     <dependency>
       <groupId>de.l3s.boilerpipe</groupId>
       <artifactId>boilerpipe</artifactId>
-      <version>1.0.4</version>
+      <version>1.1.0</version>
     </dependency>
     <dependency>
       <groupId>rome</groupId>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1030617&r1=1030616&r2=1030617&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Wed Nov  3 19:06:43 2010
@@ -110,6 +110,7 @@ public class BoilerpipeContentHandler ex
         if (td.getTitle() != null) {
             char[] titleChars = td.getTitle().toCharArray();
             delegate.characters(titleChars, 0, titleChars.length);
+            delegate.ignorableWhitespace(NL, 0, NL.length);
         }
         
         delegate.endElement(XHTMLContentHandler.XHTML, "title", "title");

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1030617&r1=1030616&r2=1030617&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Wed Nov  3 19:06:43 2010
@@ -400,7 +400,7 @@ public class HtmlParserTest extends Test
                 new BoilerpipeContentHandler(handler),  metadata, new ParseContext());
         
         String content = handler.toString();
-        assertTrue(content.startsWith("This is the real meat"));
+        assertTrue(content.startsWith("Title\nThis is the real meat"));
         assertTrue(content.endsWith("This is the end of the text.\n"));
         assertFalse(content.contains("boilerplate"));
         assertFalse(content.contains("footer"));