You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/02/22 20:54:21 UTC

svn commit: r1661539 - in /nutch/branches/2.x: ./ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/

Author: lewismc
Date: Sun Feb 22 19:54:21 2015
New Revision: 1661539

URL: http://svn.apache.org/r1661539
Log:
NUTCH-1925 Upgrade Tika to version 1.7

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
    nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1661539&r1=1661538&r2=1661539&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sun Feb 22 19:54:21 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.4-SNAPSHOT
 
+* NUTCH-1925 Upgrade to Apache Tika 1.7 palsulich.p2.v2.patch (Tyler Palsulich via lewismc)
+
 * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus)
 
 * NUTCH-1924 Nutch + HBase Docker (Radosław Stankiewicz via lewismc)

Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java?rev=1661539&r1=1661538&r2=1661539&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java Sun Feb 22 19:54:21 2015
@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.tika;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-
-import javax.imageio.spi.ServiceRegistry;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.mime.MimeTypesFactory;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-
-/**
- * Parse xml config file.
- */
-public class TikaConfig {
-
-  private final Map<String, Parser> parsers = new HashMap<String, Parser>();
-
-  private final MimeTypes mimeTypes;
-
-  public TikaConfig(String file) throws TikaException, IOException,
-      SAXException {
-    this(new File(file));
-  }
-
-  public TikaConfig(File file) throws TikaException, IOException, SAXException {
-    this(getBuilder().parse(file));
-  }
-
-  public TikaConfig(URL url) throws TikaException, IOException, SAXException {
-    this(getBuilder().parse(url.toString()));
-  }
-
-  public TikaConfig(InputStream stream) throws TikaException, IOException,
-      SAXException {
-    this(getBuilder().parse(stream));
-  }
-
-  /**
-   * @deprecated This method will be removed in Apache Tika 1.0
-   * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
-   */
-  public TikaConfig(InputStream stream, Parser delegate) throws TikaException,
-      IOException, SAXException {
-    this(stream);
-  }
-
-  public TikaConfig(Document document) throws TikaException, IOException {
-    this(document.getDocumentElement());
-  }
-
-  /**
-   * @deprecated This method will be removed in Apache Tika 1.0
-   * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
-   */
-  public TikaConfig(Document document, Parser delegate) throws TikaException,
-      IOException {
-    this(document);
-  }
-
-  public TikaConfig(Element element) throws TikaException, IOException {
-    Element mtr = getChild(element, "mimeTypeRepository");
-    if (mtr != null && mtr.hasAttribute("resource")) {
-      mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
-    } else {
-      mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
-    }
-
-    NodeList nodes = element.getElementsByTagName("parser");
-    for (int i = 0; i < nodes.getLength(); i++) {
-      Element node = (Element) nodes.item(i);
-      String name = node.getAttribute("class");
-
-      try {
-        Class<?> parserClass = Class.forName(name);
-        Object instance = parserClass.newInstance();
-        if (!(instance instanceof Parser)) {
-          throw new TikaException("Configured class is not a Tika Parser: "
-              + name);
-        }
-        Parser parser = (Parser) instance;
-
-        NodeList mimes = node.getElementsByTagName("mime");
-        if (mimes.getLength() > 0) {
-          for (int j = 0; j < mimes.getLength(); j++) {
-            parsers.put(getText(mimes.item(j)).trim(), parser);
-          }
-        } else {
-          ParseContext context = new ParseContext();
-          for (MediaType type : parser.getSupportedTypes(context)) {
-            parsers.put(type.toString(), parser);
-          }
-        }
-      } catch (ClassNotFoundException e) {
-        throw new TikaException("Configured parser class not found: " + name, e);
-      } catch (IllegalAccessException e) {
-        throw new TikaException("Unable to access a parser class: " + name, e);
-      } catch (InstantiationException e) {
-        throw new TikaException(
-            "Unable to instantiate a parser class: " + name, e);
-      }
-    }
-  }
-
-  public TikaConfig() throws MimeTypeException, IOException {
-    ParseContext context = new ParseContext();
-    Iterator<Parser> iterator = ServiceRegistry.lookupProviders(Parser.class,
-        this.getClass().getClassLoader());
-    while (iterator.hasNext()) {
-      Parser parser = iterator.next();
-      for (MediaType type : parser.getSupportedTypes(context)) {
-        parsers.put(type.toString(), parser);
-      }
-    }
-    mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
-  }
-
-  /**
-   * @deprecated This method will be removed in Apache Tika 1.0
-   * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
-   */
-  public TikaConfig(Element element, Parser delegate) throws TikaException,
-      IOException {
-    this(element);
-  }
-
-  private String getText(Node node) {
-    if (node.getNodeType() == Node.TEXT_NODE) {
-      return node.getNodeValue();
-    } else if (node.getNodeType() == Node.ELEMENT_NODE) {
-      StringBuilder builder = new StringBuilder();
-      NodeList list = node.getChildNodes();
-      for (int i = 0; i < list.getLength(); i++) {
-        builder.append(getText(list.item(i)));
-      }
-      return builder.toString();
-    } else {
-      return "";
-    }
-  }
-
-  /**
-   * Returns the parser instance configured for the given MIME type. Returns
-   * <code>null</code> if the given MIME type is unknown.
-   * 
-   * @param mimeType
-   *          MIME type
-   * @return configured Parser instance, or <code>null</code>
-   */
-  public Parser getParser(String mimeType) {
-    return parsers.get(mimeType);
-  }
-
-  public Map<String, Parser> getParsers() {
-    return parsers;
-  }
-
-  public MimeTypes getMimeRepository() {
-    return mimeTypes;
-  }
-
-  /**
-   * Provides a default configuration (TikaConfig). Currently creates a new
-   * instance each time it's called; we may be able to have it return a shared
-   * instance once it is completely immutable.
-   * 
-   * @return default configuration
-   */
-  public static TikaConfig getDefaultConfig() {
-    try {
-      return new TikaConfig();
-    } catch (IOException e) {
-      throw new RuntimeException("Unable to read default configuration", e);
-    } catch (TikaException e) {
-      throw new RuntimeException("Unable to access default configuration", e);
-    }
-  }
-
-  /**
-   * @deprecated This method will be removed in Apache Tika 1.0
-   * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
-   */
-  public static TikaConfig getDefaultConfig(Parser delegate)
-      throws TikaException {
-    return getDefaultConfig();
-  }
-
-  private static DocumentBuilder getBuilder() throws TikaException {
-    try {
-      return DocumentBuilderFactory.newInstance().newDocumentBuilder();
-    } catch (ParserConfigurationException e) {
-      throw new TikaException("XML parser not available", e);
-    }
-  }
-
-  private static Element getChild(Element element, String name) {
-    Node child = element.getFirstChild();
-    while (child != null) {
-      if (child.getNodeType() == Node.ELEMENT_NODE
-          && name.equals(child.getNodeName())) {
-        return (Element) child;
-      }
-      child = child.getNextSibling();
-    }
-    return null;
-  }
-
-}
\ No newline at end of file

Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1661539&r1=1661538&r2=1661539&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Sun Feb 22 19:54:21 2015
@@ -29,8 +29,11 @@ import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.TableUtil;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlMapper;
@@ -87,7 +90,8 @@ public class TikaParser implements org.a
 
     // get the right parser using the mime type as a clue
     String mimeType = page.getContentType().toString();
-    Parser parser = tikaConfig.getParser(mimeType);
+    CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser();
+    Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType));
     ByteBuffer raw = page.getContent();
 
     if (parser == null) {

Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java?rev=1661539&r1=1661538&r2=1661539&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java Sun Feb 22 19:54:21 2015
@@ -17,6 +17,8 @@
 
 package org.apache.nutch.parse.tika;
 
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
 import org.junit.Test;
 import static org.junit.Assert.*;
 
@@ -195,7 +197,8 @@ public class DOMContentUtilsTest {
     utils = new DOMContentUtils(conf);
     TikaParser tikaParser = new TikaParser();
     tikaParser.setConf(conf);
-    Parser parser = tikaParser.getTikaConfig().getParser("text/html");
+    CompositeParser compositeParser = (CompositeParser) tikaParser.getTikaConfig().getParser();
+    Parser parser = compositeParser.getParsers().get(MediaType.parse("text/html"));
     for (int i = 0; i < testPages.length; i++) {
       Metadata tikamd = new Metadata();