You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2007/09/24 16:44:18 UTC
svn commit: r578843 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/parser/
src/main/java/org/apache/tika/parser/html/
src/main/java/org/apache/tika/utils/ src/main/resources/mime/
src/test/java/org/apache/tika/ src/test/java/org/apache/ti...
Author: mattmann
Date: Mon Sep 24 07:44:18 2007
New Revision: 578843
URL: http://svn.apache.org/viewvc?rev=578843&view=rev
Log:
- fix for TIKA-17
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MimeTypesUtils.java
incubator/tika/trunk/src/test/java/org/apache/tika/utils/
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=578843&r1=578842&r2=578843&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Sep 24 07:44:18 2007
@@ -32,3 +32,6 @@
14. TIKA-6 - Port Nutch (or better) MimeType detection system into Tika (J. Charron & mattmann)
15. TIKA-25 - Removed hardcoded reference to C:\oo.xml in OpenOfficeParser (K. Bennett & jukka)
+
+16. TIKA-17 - Need to support URL's for input resources. (K. Bennett & mattmann)
+
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java?rev=578843&r1=578842&r2=578843&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java Mon Sep 24 07:44:18 2007
@@ -16,17 +16,13 @@
*/
package org.apache.tika.parser;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
+import java.io.InputStream;
+import org.apache.commons.lang.StringUtils;
+import org.apache.log4j.Logger;
import org.apache.tika.config.LiusConfig;
import org.apache.tika.config.ParserConfig;
import org.apache.tika.exception.LiusException;
-import org.apache.tika.utils.MimeTypesUtils;
-
-import org.apache.log4j.Logger;
-import org.jdom.JDOMException;
/**
* Factory class. Build parser from xml config file.
@@ -37,77 +33,80 @@
static Logger logger = Logger.getRootLogger();
- /**
- * Build parser from file and Lius config object
- */
- public static Parser getParser(File file, LiusConfig tc)
- throws IOException, LiusException {
- if(!file.canRead()) {
- throw new IOException("Cannot read input file " + file.getAbsoluteFile());
+
+
+ public static Parser getParser(
+ InputStream inputStream, String mimeType, LiusConfig tc)
+ throws LiusException {
+
+ // Verify that all passed parameters are (probably) valid.
+
+ if (StringUtils.isBlank(mimeType)) {
+ throw new LiusException("Mime type not specified.");
}
- String mimeType = MimeTypesUtils.getMimeType(file);
- ParserConfig pc = tc.getParserConfig(mimeType);
- if(pc==null) {
- throw new LiusException(
- "No ParserConfig available for mime-type '" + mimeType + "'"
- + " for file " + file.getName()
- );
+
+ if (inputStream == null) {
+ throw new LiusException("Input stream is null.");
+ }
+
+ if (tc == null) {
+ throw new LiusException("Configuration object is null.");
}
+
+ ParserConfig pc = getParserConfig(mimeType, tc);
+ if (pc == null) {
+ throw new LiusException(
+ "Could not find parser config for mime type "
+ + mimeType + ".");
+ }
+
String className = pc.getParserClass();
Parser parser = null;
- Class<?> parserClass = null;
- if (className != null) {
- try {
- logger.debug(
- "Loading parser class = " + className
- + " MimeType = " + mimeType
- + " for file " + file.getName()
- );
-
- parserClass = Class.forName(className);
- parser = (Parser) parserClass.newInstance();
-
- } catch (ClassNotFoundException e) {
- logger.error(e.getMessage());
-
- } catch (InstantiationException e) {
- logger.error(e.getMessage());
- } catch (IllegalAccessException e) {
- logger.error(e.getMessage());
- }
+
+ if (StringUtils.isBlank(className)) {
+ throw new LiusException(
+ "Parser class name missing from ParserConfig.");
+ }
+
+ try {
+ logger.info("Loading parser class = " + className
+ + " MimeType = " + mimeType);
+
+ Class<?> parserClass = Class.forName(className);
+ parser = (Parser) parserClass.newInstance();
parser.setMimeType(mimeType);
- parser.setNamespace(pc.getNameSpace());
parser.setContents(pc.getContents());
- parser.setInputStream(new FileInputStream(file));
+ parser.setInputStream(inputStream);
+
+ } catch (ClassNotFoundException e) {
+ logger.error(e.getMessage());
+ throw new LiusException(e.getMessage());
+ } catch (InstantiationException e) {
+ logger.error(e.getMessage());
+ throw new LiusException(e.getMessage());
+ } catch (IllegalAccessException e) {
+ logger.error(e.getMessage());
+ throw new LiusException(e.getMessage());
}
return parser;
}
- /**
- * Build parser from string file path and Lius config object
- */
- public static Parser getParser(String str, LiusConfig tc)
- throws IOException, LiusException {
- return getParser(new File(str), tc);
- }
- /**
- * Build parser from string file path and Lius config file path
- */
- public static Parser getParser(String str, String tcPath)
- throws IOException, LiusException, JDOMException {
- LiusConfig tc = LiusConfig.getInstance(tcPath);
- return getParser(new File(str), tc);
- }
+ private static ParserConfig getParserConfig(String mimeType, LiusConfig tc)
+ throws LiusException {
- /**
- * Build parser from file and Lius config file path
- */
- public static Parser getParser(File file, String tcPath)
- throws IOException, LiusException, JDOMException {
- LiusConfig tc = LiusConfig.getInstance(tcPath);
- return getParser(file, tc);
- }
+ ParserConfig pc = tc.getParserConfig(mimeType);
+
+ if (pc == null) {
+ String message =
+ "Could not find parser configuration for mime type "
+ + mimeType + ".";
+ logger.error(message);
+ throw new LiusException(message);
+ }
+
+ return pc;
+ }
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=578843&r1=578842&r2=578843&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Mon Sep 24 07:44:18 2007
@@ -17,17 +17,17 @@
package org.apache.tika.parser.html;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import org.apache.log4j.Logger;
+import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.tika.config.Content;
import org.apache.tika.parser.Parser;
import org.apache.tika.utils.RegexUtils;
-
-import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -61,6 +61,12 @@
}
List<Content> ctt = super.getContents();
contentsMap = new HashMap<String, Content>();
+
+
+ if (ctt == null) {
+ return new ArrayList<Content>(0);
+ }
+
Iterator i = ctt.iterator();
while (i.hasNext()) {
Content ct = (Content) i.next();
Added: incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=578843&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java Mon Sep 24 07:44:18 2007
@@ -0,0 +1,277 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+// JDK imports
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+// TIKA imports
+import org.apache.tika.config.LiusConfig;
+import org.apache.tika.exception.LiusException;
+import org.apache.tika.metadata.TikaMimeKeys;
+import org.apache.tika.mime.MimeUtils;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserFactory;
+
+/**
+ * Contains utility methods for parsing documents. Intended to provide simple
+ * entry points into the Tika framework.
+ */
+public class ParseUtils implements TikaMimeKeys {
+
+ private static final Configuration conf = new Configuration();
+
+ static {
+ conf.set(TIKA_MIME_FILE, "org/apache/tika/mime/tika-mimetypes.xml");
+ }
+
+ private static final MimeUtils mimeUtils = new MimeUtils(conf);
+
+ /**
+ * Returns a parser that can handle the specified MIME type, and is set to
+ * receive input from the specified input stream. NB: Close the
+ * input stream when it is no longer needed!
+ *
+ * @param inputStream
+ * stream containing document data to parse
+ * @param config
+ * @param mimeType
+ * the document's MIME type
+ * @return a parser appropriate to this MIME type and ready to read input
+ * from the specified document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static Parser getParser(InputStream inputStream, LiusConfig config,
+ String mimeType) throws LiusException, IOException {
+
+ if (inputStream == null) {
+ throw new LiusException("Document input stream not provided.");
+ }
+
+ return ParserFactory.getParser(inputStream, mimeType, config);
+ }
+
+ // Note that we cannot provide a method that takes an InputStream
+ // but not a MIME type, since we will not have a resource
+ // name from which to derive it.
+
+ /**
+ * Returns a parser that can handle the specified MIME type, and is set to
+ * receive input from a stream opened from the specified URL. NB: Close the
+ * input stream when it is no longer needed!
+ *
+ * @param documentUrl
+ * URL pointing to the document to parse
+ * @param config
+ * @param mimeType
+ * the document's MIME type
+ * @return a parser appropriate to this MIME type and ready to read input
+ * from the specified document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static Parser getParser(URL documentUrl, LiusConfig config,
+ String mimeType) throws LiusException, IOException {
+
+ if (documentUrl == null) {
+ throw new LiusException("Document URL not provided.");
+ }
+
+ return ParserFactory.getParser(documentUrl.openStream(), mimeType,
+ config);
+ }
+
+ /**
+ * Returns a parser that can handle the specified MIME type, and is set to
+ * receive input from a stream opened from the specified URL. The MIME type
+ * is determined automatically. NB: Close the input stream when it is no
+ * longer needed!
+ *
+ * @param documentUrl
+ * URL pointing to the document to parse
+ * @param config
+ * @return a parser appropriate to this MIME type and ready to read input
+ * from the specified document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static Parser getParser(URL documentUrl, LiusConfig config)
+ throws LiusException, IOException {
+
+ String mimetype = mimeUtils.getRepository().getMimeType(documentUrl)
+ .getName();
+ return getParser(documentUrl, config, mimetype);
+ }
+
+ /**
+ * Returns a parser that can handle the specified MIME type, and is set to
+ * receive input from a stream opened from the specified file. NB: Close the
+ * input stream when it is no longer needed!
+ *
+ * @param documentFile
+ * File object pointing to the document to parse
+ * @param config
+ * @param mimeType
+ * the document's MIME type
+ * @return a parser appropriate to this MIME type and ready to read input
+ * from the specified document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static Parser getParser(File documentFile, LiusConfig config,
+ String mimeType) throws LiusException, IOException {
+
+ if (documentFile == null) {
+ throw new LiusException("Document file not provided.");
+ }
+
+ if (!documentFile.canRead()) {
+ throw new LiusException(
+ "Document file does not exist or is not readable.");
+ }
+
+ FileInputStream inputStream = new FileInputStream(documentFile);
+ // TODO: Do we want to wrap a BufferedInputStream, or does the
+ // file's buffering suffice?
+
+ return ParserFactory.getParser(inputStream, mimeType, config);
+ }
+
+ /**
+ * Returns a parser for the specified file, set to receive input from a stream
+ * opened on that file. The MIME type is determined automatically. NB: Close
+ * the input stream when it is no longer needed!
+ *
+ * @param documentFile
+ * File object pointing to the document to parse
+ * @param config
+ * @return a parser appropriate to this MIME type and ready to read input
+ * from the specified document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static Parser getParser(File documentFile, LiusConfig config)
+ throws LiusException, IOException {
+
+ String mimetype = mimeUtils.getRepository().getMimeType(documentFile)
+ .getName();
+ return getParser(documentFile, config, mimetype);
+ }
+
+ /**
+ * Gets the string content of a document read from an input stream.
+ *
+ * @param inputStream
+ * the stream from which to read document data
+ * @param config
+ * @param mimeType
+ * MIME type of the data
+ * @return the string content parsed from the document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static String getStringContent(InputStream inputStream,
+ LiusConfig config, String mimeType) throws LiusException,
+ IOException {
+
+ Parser parser = getParser(inputStream, config, mimeType);
+ return getStringContent(parser);
+ }
+
+ /**
+ * Gets the string content of a document read from a URL; the MIME type is determined automatically.
+ *
+ * @param documentUrl
+ * URL pointing to the document to parse
+ * @param config
+ * @return the string content parsed from the document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static String getStringContent(URL documentUrl, LiusConfig config)
+ throws LiusException, IOException {
+
+ Parser parser = getParser(documentUrl, config);
+ return getStringContent(parser);
+ }
+
+ /**
+ * Gets the string content of a document read from a URL.
+ *
+ * @param documentUrl
+ * URL pointing to the document to parse
+ * @param config
+ * @param mimeType
+ * MIME type of the data
+ * @return the string content parsed from the document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static String getStringContent(URL documentUrl, LiusConfig config,
+ String mimeType) throws LiusException, IOException {
+
+ Parser parser = getParser(documentUrl, config, mimeType);
+ return getStringContent(parser);
+ }
+
+ /**
+ * Gets the string content of a document read from a file.
+ *
+ * @param documentFile
+ * File object pointing to the document to parse
+ * @param config
+ * @param mimeType
+ * MIME type of the data
+ * @return the string content parsed from the document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static String getStringContent(File documentFile, LiusConfig config,
+ String mimeType) throws LiusException, IOException {
+
+ Parser parser = getParser(documentFile, config, mimeType);
+ return getStringContent(parser);
+ }
+
+ /**
+ * Gets the string content of a document read from a file; the MIME type is determined automatically.
+ *
+ * @param documentFile
+ * File object pointing to the document to parse
+ * @param config
+ * @return the string content parsed from the document
+ * @throws LiusException
+ * @throws IOException
+ */
+ public static String getStringContent(File documentFile, LiusConfig config)
+ throws LiusException, IOException {
+
+ Parser parser = getParser(documentFile, config);
+ return getStringContent(parser);
+ }
+
+ private static String getStringContent(Parser parser) throws IOException {
+ String content = parser.getStrContent();
+ parser.getInputStream().close();
+ return content;
+ }
+}
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=578843&r1=578842&r2=578843&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Mon Sep 24 07:44:18 2007
@@ -156,6 +156,7 @@
</mime-type>
<mime-type type="application/rtf">
+ <glob pattern="*.rtf"/>
<alias type="text/rtf" />
</mime-type>
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=578843&r1=578842&r2=578843&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Mon Sep 24 07:44:18 2007
@@ -18,19 +18,17 @@
import java.io.File;
import java.io.IOException;
-import java.util.Collection;
-import java.util.StringTokenizer;
-
-import junit.framework.TestCase;
import org.apache.tika.config.Content;
import org.apache.tika.config.LiusConfig;
import org.apache.tika.log.LiusLogger;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParserFactory;
+import org.apache.tika.utils.ParseUtils;
import org.apache.tika.utils.Utils;
import org.jdom.JDOMException;
+import junit.framework.TestCase;
+
/**
* Junit test class
* @author Rida Benjelloun (ridabenjelloun@apache.org)
@@ -67,58 +65,97 @@
LiusLogger.setLoggerConfigFile(log4jPropertiesFilename);
}
-
- /*
- * public void testConfig(){ TikaConfig tc =
- * TikaConfig.getInstance("C:\\tika\\config\\tikaConfig2.xml"); ParserConfig
- * pc = tc.getParserConfig("text/html"); assertEquals("parse-html",
- * pc.getName()); }
- */
public void testPDFExtraction() throws Exception {
- ParserFactory.getParser(getTestFile("testPDF.pdf"), tc);
+ File file = getTestFile("testPDF.pdf");
+ String s1 = ParseUtils.getStringContent(file, tc);
+ String s2 = ParseUtils.getStringContent(file, tc, "application/pdf");
+
+ Parser parser = ParseUtils.getParser(file, tc);
+ String s3 = parser.getStrContent();
+
+ assertEquals(s1, s2);
+ assertEquals(s1, s3);
}
public void testTXTExtraction() throws Exception {
- ParserFactory.getParser(getTestFile("testTXT.txt"), tc);
+ File file = getTestFile("testTXT.txt");
+ String s1 = ParseUtils.getStringContent(file, tc);
+ String s2 = ParseUtils.getStringContent(file, tc, "text/plain");
+ assertEquals(s1, s2);
}
public void testRTFExtraction() throws Exception {
- ParserFactory.getParser(getTestFile("testRTF.rtf"), tc);
+ File file = getTestFile("testRTF.rtf");
+ String s1 = ParseUtils.getStringContent(file, tc);
+ String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
+ assertEquals(s1, s2);
}
public void testXMLExtraction() throws Exception {
- ParserFactory.getParser(getTestFile("testXML.xml"), tc);
+ File file = getTestFile("testXML.xml");
+ String s1 = ParseUtils.getStringContent(file, tc);
+ String s2 = ParseUtils.getStringContent(file, tc, "application/xml");
+ assertEquals(s1, s2);
}
public void testPPTExtraction() throws Exception {
- ParserFactory.getParser(getTestFile("testPPT.ppt"), tc);
+ File file = getTestFile("testPPT.ppt");
+ String s1 = ParseUtils.getStringContent(file, tc);
+ String s2 = ParseUtils.getStringContent(file, tc,
+ "application/vnd.ms-powerpoint");
+ assertEquals(s1, s2);
}
public void testWORDxtraction() throws Exception {
- ParserFactory.getParser(getTestFile("testWORD.doc"), tc);
+ File file = getTestFile("testWORD.doc");
+ String s1 = ParseUtils.getStringContent(file, tc);
+ String s2 = ParseUtils.getStringContent(file, tc, "application/msword");
+ assertEquals(s1, s2);
}
public void testEXCELExtraction() throws Exception {
- ParserFactory.getParser(getTestFile("testEXCEL.xls"), tc);
+ File file = getTestFile("testEXCEL.xls");
+ String s1 = ParseUtils.getStringContent(file, tc);
+ String s2 = ParseUtils.getStringContent(file, tc,
+ "application/vnd.ms-excel");
+ assertEquals(s1, s2);
}
public void testOOExtraction() throws Exception {
- ParserFactory.getParser(getTestFile("testOpenOffice2.odt"), tc);
+ File file = getTestFile("testOpenOffice2.odt");
+ String s1 = ParseUtils.getStringContent(file, tc);
+ String s2 = ParseUtils.getStringContent(file, tc,
+ "application/vnd.oasis.opendocument.text");
+ assertEquals(s1, s2);
}
public void testHTMLExtraction() throws Exception {
- Parser parser = ParserFactory.getParser(getTestFile("testHTML.html"), tc);
- assertEquals("Title : Test Indexation Html", (parser.getContent("title")).getValue());
- assertEquals("text/html",parser.getMimeType());
- final String text = Utils.toString(parser.getContents());
+ File file = getTestFile("testHTML.html");
+ String s1 = ParseUtils.getStringContent(file, tc);
+ String s2 = ParseUtils.getStringContent(file, tc, "text/html");
+ assertEquals(s1, s2);
+
+ Parser parser = ParseUtils.getParser(file, tc);
+ assertNotNull(parser);
+ assertEquals("org.apache.tika.parser.html.HtmlParser", parser.getClass().getName());
+
+ Content content = parser.getContent("title");
+ assertNotNull(content);
+ assertEquals("Title : Test Indexation Html", content.getValue());
+
+ assertEquals("text/html", parser.getMimeType());
+
+ final String text = Utils.toString(parser.getContents());
final String expected = "Test Indexation Html";
- assertTrue("text contains '" + expected + "'",text.indexOf(expected) >= 0);
+ assertTrue("text contains '" + expected + "'",
+ text.contains(expected));
+ parser.getInputStream().close();
}
private File getTestFile(String filename) {
- return new File(testFilesBaseDir,filename);
+ return new File(testFilesBaseDir, filename);
}
}