You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/02/16 15:42:36 UTC
svn commit: r1730694 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Author: markus
Date: Tue Feb 16 14:42:36 2016
New Revision: 1730694
URL: http://svn.apache.org/viewvc?rev=1730694&view=rev
Log:
NUTCH-961 Expose Tika's Boilerpipe support
Added:
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730694&r1=1730693&r2=1730694&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Feb 16 14:42:36 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-961 Expose Tika's Boilerpipe support (Gabriele Kahlout, Vincent Slot, markus)
+
* NUTCH-1233 Rely on Tika for outlink extraction (markus)
* NUTCH-2210 Upgrade to Tika 1.12 (markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1730694&r1=1730693&r2=1730694&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Feb 16 14:42:36 2016
@@ -1300,6 +1300,22 @@ CAUTION: Set the parser.timeout to -1 or
</description>
</property>
+<property>
+ <name>tika.extractor</name>
+ <value>none</value>
+ <description>
+ Which text extraction algorithm to use. Valid values are: boilerpipe or none.
+ </description>
+</property>
+
+<property>
+ <name>tika.extractor.boilerpipe.algorithm</name>
+ <value>ArticleExtractor</value>
+ <description>
+ Which Boilerpipe algorithm to use. Valid values are: DefaultExtractor, ArticleExtractor
+ or CanolaExtractor.
+ </description>
+</property>
<!-- urlfilter plugin properties -->
Added: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java?rev=1730694&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java (added)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java Tue Feb 16 14:42:36 2016
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.lang.ClassLoader;
+import java.lang.InstantiationException;
+import java.util.HashMap;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.extractors.*;
+
+class BoilerpipeExtractorRepository {
+
+ public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class);
+ public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<String, BoilerpipeExtractor>();
+
+ /**
+ * Returns an instance of the specified extractor
+ */
+ public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
+ // Check if there's no instance of this extractor
+ if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
+ // FQCN
+ boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
+
+ // Attempt to load the class
+ try {
+ ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+ Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+
+ // Add an instance to the repository
+ extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.newInstance());
+
+ } catch (ClassNotFoundException e) {
+ LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
+ } catch (InstantiationException e) {
+ LOG.error("Could not instantiate " + boilerpipeExtractorName);
+ } catch (Exception e) {
+ LOG.error(e);
+ }
+ }
+
+ return extractorRepository.get(boilerpipeExtractorName);
+ }
+
+}
Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1730694&r1=1730693&r2=1730694&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Tue Feb 16 14:42:36 2016
@@ -40,6 +40,7 @@ import org.apache.nutch.protocol.Content
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
@@ -72,6 +73,9 @@ public class TikaParser implements org.a
@SuppressWarnings("deprecation")
public ParseResult getParse(Content content) {
String mimeType = content.getContentType();
+
+ boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
+ String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
URL base;
try {
@@ -101,13 +105,25 @@ public class TikaParser implements org.a
doc.setErrorChecking(false);
DocumentFragment root = doc.createDocumentFragment();
- DOMBuilder domhandler = new DOMBuilder(doc, root);
+ ContentHandler domHandler;
+
+ // Check whether to use Tika's BoilerpipeContentHandler
+ if (useBoilerpipe) {
+ BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root),
+ BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
+ bpHandler.setIncludeMarkup(true);
+ domHandler = (ContentHandler)bpHandler;
+ } else {
+ DOMBuilder domBuilder = new DOMBuilder(doc, root);
+ domBuilder.setUpperCaseElementNames(upperCaseElementNames);
+ domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
+ domHandler = (ContentHandler)domBuilder;
+ }
+
LinkContentHandler linkContentHandler = new LinkContentHandler();
- domhandler.setUpperCaseElementNames(upperCaseElementNames);
- domhandler.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
ParseContext context = new ParseContext();
- TeeContentHandler teeContentHandler = new TeeContentHandler(domhandler, linkContentHandler);
+ TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
if (HTMLMapper != null)
context.set(HtmlMapper.class, HTMLMapper);