You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/02/16 15:42:36 UTC

svn commit: r1730694 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Author: markus
Date: Tue Feb 16 14:42:36 2016
New Revision: 1730694

URL: http://svn.apache.org/viewvc?rev=1730694&view=rev
Log:
NUTCH-961 Expose Tika's Boilerpipe support

Added:
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730694&r1=1730693&r2=1730694&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Feb 16 14:42:36 2016
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-961 Expose Tika's Boilerpipe support (Gabriele Kahlout, Vincent Slot, markus)
+
 * NUTCH-1233 Rely on Tika for outlink extraction (markus)
 
 * NUTCH-2210 Upgrade to Tika 1.12 (markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1730694&r1=1730693&r2=1730694&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Feb 16 14:42:36 2016
@@ -1300,6 +1300,22 @@ CAUTION: Set the parser.timeout to -1 or
   </description>
 </property>
 
+<property>
+  <name>tika.extractor</name>
+  <value>none</value>
+  <description>
+  Which text extraction algorithm to use. Valid values are: boilerpipe or none.
+  </description>
+</property>
+
+<property> 
+  <name>tika.extractor.boilerpipe.algorithm</name>
+  <value>ArticleExtractor</value>
+  <description> 
+  Which Boilerpipe algorithm to use. Valid values are: DefaultExtractor, ArticleExtractor
+  or CanolaExtractor.
+  </description>
+</property>
 
 <!-- urlfilter plugin properties -->
 

Added: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java?rev=1730694&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java (added)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java Tue Feb 16 14:42:36 2016
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.lang.ClassLoader;
+import java.lang.InstantiationException;
+import java.util.HashMap;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.extractors.*;
+
+class BoilerpipeExtractorRepository {
+
+    public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class);
+    public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<String, BoilerpipeExtractor>();
+ 
+    /**
+     * Returns an instance of the specified extractor
+     */
+    public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
+      // Check if there's no instance of this extractor
+      if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
+        // FQCN
+        boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
+
+        // Attempt to load the class
+        try {
+          ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+          Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+
+          // Add an instance to the repository
+          extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.newInstance());
+
+        } catch (ClassNotFoundException e) {
+          LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
+        } catch (InstantiationException e) {
+          LOG.error("Could not instantiate " + boilerpipeExtractorName);
+        } catch (Exception e) {
+          LOG.error(e);
+        }
+      }
+
+      return extractorRepository.get(boilerpipeExtractorName);
+    }
+
+}

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1730694&r1=1730693&r2=1730694&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Tue Feb 16 14:42:36 2016
@@ -40,6 +40,7 @@ import org.apache.nutch.protocol.Content
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlMapper;
@@ -72,6 +73,9 @@ public class TikaParser implements org.a
   @SuppressWarnings("deprecation")
   public ParseResult getParse(Content content) {
     String mimeType = content.getContentType();
+    
+    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
+    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
 
     URL base;
     try {
@@ -101,13 +105,25 @@ public class TikaParser implements org.a
     doc.setErrorChecking(false);
     DocumentFragment root = doc.createDocumentFragment();
 
-    DOMBuilder domhandler = new DOMBuilder(doc, root);
+    ContentHandler domHandler;
+    
+    // Check whether to use Tika's BoilerpipeContentHandler
+    if (useBoilerpipe) {
+      BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root),
+      BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
+      bpHandler.setIncludeMarkup(true);
+      domHandler = (ContentHandler)bpHandler;
+    } else {
+      DOMBuilder domBuilder = new DOMBuilder(doc, root);
+      domBuilder.setUpperCaseElementNames(upperCaseElementNames);
+      domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
+      domHandler = (ContentHandler)domBuilder;
+    }
+
     LinkContentHandler linkContentHandler = new LinkContentHandler();
-    domhandler.setUpperCaseElementNames(upperCaseElementNames);
-    domhandler.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
 
     ParseContext context = new ParseContext();
-    TeeContentHandler teeContentHandler = new TeeContentHandler(domhandler, linkContentHandler);
+    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
     
     if (HTMLMapper != null)
       context.set(HtmlMapper.class, HTMLMapper);