You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/04/06 13:38:27 UTC
svn commit: r931098 - in /lucene/nutch/trunk: ./ conf/ lib/ src/plugin/
src/plugin/parse-tika/ src/plugin/parse-tika/lib/
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/
Author: jnioche
Date: Tue Apr 6 11:38:26 2010
New Revision: 931098
URL: http://svn.apache.org/viewvc?rev=931098&view=rev
Log:
NUTCH-810 Upgraded to Tika 0.7
Added:
lucene/nutch/trunk/lib/tika-core-0.7.jar (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar (with props)
Removed:
lucene/nutch/trunk/lib/tika-core-0.6.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-0.8.0-incubator.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-0.8.0-incubator.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-0.8.0-incubating.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.6.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/tika-mimetypes.xml
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml
lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml
lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Apr 6 11:38:26 2010
@@ -2,6 +2,8 @@ Nutch Change Log
Unreleased Changes
+* NUTCH-810 Upgrade to Tika 0.7 (jnioche)
+
* NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche)
* NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche)
Modified: lucene/nutch/trunk/conf/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/tika-mimetypes.xml?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/tika-mimetypes.xml (original)
+++ lucene/nutch/trunk/conf/tika-mimetypes.xml Tue Apr 6 11:38:26 2010
@@ -2198,7 +2198,11 @@
<mime-type type="application/x-cpio">
<magic priority="50">
- <match value="070707" type="host16" offset="0"/>
+ <match value="070707" type="little16" offset="0"/>
+ <match value="070707" type="big16" offset="0"/>
+ <match value="070707" type="string" offset="0"/>
+ <match value="070701" type="string" offset="0"/>
+ <match value="070702" type="string" offset="0"/>
</magic>
<glob pattern="*.cpio"/>
</mime-type>
@@ -3551,7 +3555,13 @@
bad HTML, unfortunately.
-->
<root-XML localName="html"/>
+ <root-XML localName="HTML"/>
<root-XML localName="link"/>
+ <root-XML localName="LINK"/>
+ <root-XML localName="body"/>
+ <root-XML localName="BODY"/>
+ <root-XML localName="p"/>
+ <root-XML localName="P"/>
<magic priority="50">
<match value="<!DOCTYPE HTML" type="string" offset="0:64"/>
<match value="<!doctype html" type="string" offset="0:64"/>
Added: lucene/nutch/trunk/lib/tika-core-0.7.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.7.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/lib/tika-core-0.7.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Apr 6 11:38:26 2010
@@ -32,8 +32,8 @@
<ant dir="index-basic" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-more" target="deploy"/>
- <ant dir="field-basic" target="deploy"/>
- <ant dir="field-boost" target="deploy"/>
+ <ant dir="field-basic" target="deploy"/>
+ <ant dir="field-boost" target="deploy"/>
<ant dir="languageidentifier" target="deploy"/>
<ant dir="lib-http" target="deploy"/>
<ant dir="lib-jakarta-poi" target="deploy"/>
@@ -65,12 +65,12 @@
<ant dir="query-basic" target="deploy"/>
<ant dir="query-more" target="deploy"/>
<ant dir="query-site" target="deploy"/>
- <ant dir="query-custom" target="deploy"/>
+ <ant dir="query-custom" target="deploy"/>
<ant dir="query-url" target="deploy"/>
<ant dir="response-json" target="deploy"/>
<ant dir="response-xml" target="deploy"/>
<ant dir="scoring-opic" target="deploy"/>
- <ant dir="scoring-link" target="deploy"/>
+ <ant dir="scoring-link" target="deploy"/>
<ant dir="summary-basic" target="deploy"/>
<ant dir="subcollection" target="deploy"/>
<ant dir="summary-lucene" target="deploy"/>
@@ -99,7 +99,6 @@
<ant dir="protocol-httpclient" target="test"/>
<!--ant dir="parse-ext" target="test"/-->
<ant dir="parse-html" target="test"/>
- <!-- <ant dir="parse-mp3" target="test"/> -->
<ant dir="parse-msexcel" target="test"/>
<ant dir="parse-mspowerpoint" target="test"/>
<ant dir="parse-msword" target="test"/>
@@ -107,7 +106,6 @@
<ant dir="parse-pdf" target="test"/>
<ant dir="parse-rss" target="test"/>
<ant dir="feed" target="test"/>
- <!-- <ant dir="parse-rtf" target="test"/> -->
<ant dir="parse-swf" target="test"/>
<ant dir="parse-tika" target="test"/>
<ant dir="parse-zip" target="test"/>
@@ -172,11 +170,11 @@
<ant dir="query-more" target="clean"/>
<ant dir="query-site" target="clean"/>
<ant dir="query-url" target="clean"/>
- <ant dir="query-custom" target="clean"/>
+ <ant dir="query-custom" target="clean"/>
<ant dir="response-json" target="clean"/>
<ant dir="response-xml" target="clean"/>
<ant dir="scoring-opic" target="clean"/>
- <ant dir="scoring-link" target="clean"/>
+ <ant dir="scoring-link" target="clean"/>
<ant dir="subcollection" target="clean"/>
<ant dir="summary-basic" target="clean"/>
<ant dir="summary-lucene" target="clean"/>
Modified: lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Apr 6 11:38:26 2010
@@ -1,7 +1,7 @@
<ivy-module version="2.0">
<info organisation="apache" module="parse-tika"/>
<dependencies>
- <dependency org="org.apache.tika" name="tika-parsers" rev="0.6">
+ <dependency org="org.apache.tika" name="tika-parsers" rev="0.7">
<exclude module="lucene-*"/>
<exclude module="tika-core"/>
<exclude module="log4j"/>
Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml Tue Apr 6 11:38:26 2010
@@ -25,23 +25,26 @@
<library name="parse-tika.jar">
<export name="*"/>
</library>
+
<library name="asm-3.1.jar"/>
- <library name="bcprov-jdk14-136.jar"/>
<library name="bcmail-jdk14-136.jar"/>
+ <library name="bcmail-jdk15-1.45.jar"/>
+ <library name="bcprov-jdk14-136.jar"/>
+ <library name="bcprov-jdk15-1.45.jar"/>
<library name="commons-compress-1.0.jar"/>
<library name="commons-logging-1.1.1.jar"/>
<library name="dom4j-1.6.1.jar"/>
- <library name="fontbox-0.8.0-incubator.jar"/>
+ <library name="fontbox-1.1.0.jar"/>
<library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/>
- <library name="jempbox-0.8.0-incubator.jar"/>
+ <library name="jempbox-1.1.0.jar"/>
<library name="metadata-extractor-2.4.0-beta-1.jar"/>
- <library name="pdfbox-0.8.0-incubating.jar"/>
+ <library name="pdfbox-1.1.0.jar"/>
<library name="poi-3.6.jar"/>
<library name="poi-ooxml-3.6.jar"/>
<library name="poi-ooxml-schemas-3.6.jar"/>
<library name="poi-scratchpad-3.6.jar"/>
<library name="tagsoup-1.2.jar"/>
- <library name="tika-parsers-0.6.jar"/>
+ <library name="tika-parsers-0.7.jar"/>
<library name="xml-apis-1.0.b2.jar"/>
<library name="xmlbeans-2.3.0.jar"/>
</runtime>
Modified: lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java Tue Apr 6 11:38:26 2010
@@ -21,16 +21,22 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.Map;
+import javax.imageio.spi.ServiceRegistry;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.mortbay.log.Log;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@@ -38,143 +44,208 @@ import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
- * Parse xml config file. Duplicates the Tika equivalent but allows the classes of the parser to be found
- * by classloader
+ * Parses the XML configuration file.
*/
-class TikaConfig {
-
- static final String DEFAULT_CONFIG_LOCATION =
- "/org/apache/tika/tika-config.xml";
+public class TikaConfig {
private final Map<String, Parser> parsers = new HashMap<String, Parser>();
- private static MimeTypes mimeTypes;
+ private final MimeTypes mimeTypes;
- TikaConfig(String file)
- throws TikaException, IOException, SAXException {
- this(new File(file));
+ public TikaConfig(String file) throws TikaException, IOException,
+ SAXException {
+ this(new File(file));
}
- TikaConfig(File file)
- throws TikaException, IOException, SAXException {
- this(getBuilder().parse(file));
+ public TikaConfig(File file) throws TikaException, IOException,
+ SAXException {
+ this(getBuilder().parse(file));
}
- TikaConfig(URL url)
- throws TikaException, IOException, SAXException {
- this(getBuilder().parse(url.toString()));
+ public TikaConfig(URL url) throws TikaException, IOException, SAXException {
+ this(getBuilder().parse(url.toString()));
}
- TikaConfig(InputStream stream)
- throws TikaException, IOException, SAXException {
- this(getBuilder().parse(stream));
+ public TikaConfig(InputStream stream) throws TikaException, IOException,
+ SAXException {
+ this(getBuilder().parse(stream));
}
- TikaConfig(Document document) throws TikaException, IOException {
- this(document.getDocumentElement());
+ /**
+ * @deprecated This constructor will be removed in Apache Tika 1.0
+ * @see <a
+ * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+ */
+ public TikaConfig(InputStream stream, Parser delegate)
+ throws TikaException, IOException, SAXException {
+ this(stream);
}
- TikaConfig(Element element) throws TikaException, IOException {
- Element mtr = getChild(element, "mimeTypeRepository");
- if (mtr != null) {
- mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
- }
+ public TikaConfig(Document document) throws TikaException, IOException {
+ this(document.getDocumentElement());
+ }
- NodeList nodes = element.getElementsByTagName("parser");
- for (int i = 0; i < nodes.getLength(); i++) {
- Element node = (Element) nodes.item(i);
- String name = node.getAttribute("class");
- try {
- Class<?> parserClass = Class.forName(name);
- Parser parser = (Parser) parserClass.newInstance();
+ /**
+ * @deprecated This constructor will be removed in Apache Tika 1.0
+ * @see <a
+ * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+ */
+ public TikaConfig(Document document, Parser delegate) throws TikaException,
+ IOException {
+ this(document);
+ }
+
+ public TikaConfig(Element element) throws TikaException, IOException {
+ Element mtr = getChild(element, "mimeTypeRepository");
+ if (mtr != null && mtr.hasAttribute("resource")) {
+ mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
+ } else {
+ mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
+ }
+
+ NodeList nodes = element.getElementsByTagName("parser");
+ for (int i = 0; i < nodes.getLength(); i++) {
+ Element node = (Element) nodes.item(i);
+ String name = node.getAttribute("class");
+
+ try {
+ Class<?> parserClass = Class.forName(name);
+ Object instance = parserClass.newInstance();
+ if (!(instance instanceof Parser)) {
+ throw new TikaException(
+ "Configured class is not a Tika Parser: " + name);
+ }
+ Parser parser = (Parser) instance;
+
+ NodeList mimes = node.getElementsByTagName("mime");
+ if (mimes.getLength() > 0) {
+ for (int j = 0; j < mimes.getLength(); j++) {
+ parsers.put(getText(mimes.item(j)).trim(), parser);
+ }
+ } else {
+ ParseContext context = new ParseContext();
+ for (MediaType type : parser.getSupportedTypes(context)) {
+ parsers.put(type.toString(), parser);
+ }
+ }
+ } catch (ClassNotFoundException e) {
+ throw new TikaException("Configured parser class not found: "
+ + name, e);
+ } catch (IllegalAccessException e) {
+ throw new TikaException("Unable to access a parser class: "
+ + name, e);
+ } catch (InstantiationException e) {
+ throw new TikaException(
+ "Unable to instantiate a parser class: " + name, e);
+ }
+ }
+ }
+
+ public TikaConfig() throws MimeTypeException, IOException {
+ ParseContext context = new ParseContext();
+ Iterator<Parser> iterator = ServiceRegistry.lookupProviders(
+ Parser.class, this.getClass().getClassLoader());
+ while (iterator.hasNext()) {
+ Parser parser = iterator.next();
+ for (MediaType type : parser.getSupportedTypes(context)) {
+ parsers.put(type.toString(), parser);
+ }
+ }
+ mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
+ }
- NodeList mimes = node.getElementsByTagName("mime");
- for (int j = 0; j < mimes.getLength(); j++) {
- parsers.put(getText(mimes.item(j)).trim(), parser);
- }
- } catch (Throwable t) {
- // TODO: Log warning about an invalid parser configuration
- // For now we just ignore this parser class
- }
- }
+ /**
+ * @deprecated This constructor will be removed in Apache Tika 1.0
+ * @see <a
+ * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+ */
+ public TikaConfig(Element element, Parser delegate) throws TikaException,
+ IOException {
+ this(element);
}
private String getText(Node node) {
- if (node.getNodeType() == Node.TEXT_NODE) {
- return node.getNodeValue();
- } else if (node.getNodeType() == Node.ELEMENT_NODE) {
- StringBuilder builder = new StringBuilder();
- NodeList list = node.getChildNodes();
- for (int i = 0; i < list.getLength(); i++) {
- builder.append(getText(list.item(i)));
- }
- return builder.toString();
- } else {
- return "";
- }
+ if (node.getNodeType() == Node.TEXT_NODE) {
+ return node.getNodeValue();
+ } else if (node.getNodeType() == Node.ELEMENT_NODE) {
+ StringBuilder builder = new StringBuilder();
+ NodeList list = node.getChildNodes();
+ for (int i = 0; i < list.getLength(); i++) {
+ builder.append(getText(list.item(i)));
+ }
+ return builder.toString();
+ } else {
+ return "";
+ }
}
/**
- * Returns the parser instance configured for the given MIME type.
- * Returns <code>null</code> if the given MIME type is unknown.
- *
- * @param mimeType MIME type
+ * Returns the parser instance configured for the given MIME type. Returns
+ * <code>null</code> if the given MIME type is unknown.
+ *
+ * @param mimeType
+ * MIME type
* @return configured Parser instance, or <code>null</code>
*/
- Parser getParser(String mimeType) {
- return parsers.get(mimeType);
+ public Parser getParser(String mimeType) {
+ return parsers.get(mimeType);
}
- Map<String, Parser> getParsers() {
- return parsers;
+ public Map<String, Parser> getParsers() {
+ return parsers;
}
- MimeTypes getMimeRepository(){
- return mimeTypes;
+ public MimeTypes getMimeRepository() {
+ return mimeTypes;
}
/**
- * Provides a default configuration (TikaConfig). Currently creates a
- * new instance each time it's called; we may be able to have it
- * return a shared instance once it is completely immutable.
- *
+ * Provides a default configuration (TikaConfig). Currently creates a new
+ * instance each time it's called; we may be able to have it return a shared
+ * instance once it is completely immutable.
+ *
* @return default configuration
*/
- static TikaConfig getDefaultConfig() {
- try {
- InputStream stream =
- TikaConfig.class.getResourceAsStream(DEFAULT_CONFIG_LOCATION);
- return new TikaConfig(stream);
- } catch (IOException e) {
- throw new RuntimeException(
- "Unable to read default configuration", e);
- } catch (SAXException e) {
- throw new RuntimeException(
- "Unable to parse default configuration", e);
- } catch (TikaException e) {
- throw new RuntimeException(
- "Unable to access default configuration", e);
- }
+ public static TikaConfig getDefaultConfig() {
+ try {
+ return new TikaConfig();
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to read default configuration",
+ e);
+ } catch (TikaException e) {
+ throw new RuntimeException(
+ "Unable to access default configuration", e);
+ }
+ }
+
+ /**
+ * @deprecated This method will be removed in Apache Tika 1.0
+ * @see <a
+ * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+ */
+ public static TikaConfig getDefaultConfig(Parser delegate)
+ throws TikaException {
+ return getDefaultConfig();
}
private static DocumentBuilder getBuilder() throws TikaException {
- try {
- return DocumentBuilderFactory.newInstance().newDocumentBuilder();
- } catch (ParserConfigurationException e) {
- throw new TikaException("XML parser not available", e);
- }
+ try {
+ return DocumentBuilderFactory.newInstance().newDocumentBuilder();
+ } catch (ParserConfigurationException e) {
+ throw new TikaException("XML parser not available", e);
+ }
}
private static Element getChild(Element element, String name) {
- Node child = element.getFirstChild();
- while (child != null) {
- if (child.getNodeType() == Node.ELEMENT_NODE
- && name.equals(child.getNodeName())) {
- return (Element) child;
- }
- child = child.getNextSibling();
- }
- return null;
+ Node child = element.getFirstChild();
+ while (child != null) {
+ if (child.getNodeType() == Node.ELEMENT_NODE
+ && name.equals(child.getNodeName())) {
+ return (Element) child;
+ }
+ child = child.getNextSibling();
+ }
+ return null;
}
-}
+}
\ No newline at end of file
Modified: lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Tue Apr 6 11:38:26 2010
@@ -170,6 +170,8 @@ public class TikaParser implements org.a
this.tikaConfig = null;
// do we want a custom Tika configuration file
+ // deprecated since Tika 0.7, which uses a
+ // service-provider-based configuration
String customConfFile = conf.get("tika.config.file");
if (customConfFile != null) {
try {