You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/09 16:28:27 UTC
svn commit: r635259 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/parser/
src/main/java/org/apache/tika/parser/html/
src/main/java/org/apache/tika/parser/microsoft/
src/main/java/org/apache/tika/parser/opendocument/ src/main/java/org/apa...
Author: jukka
Date: Sun Mar 9 08:28:24 2008
New Revision: 635259
URL: http://svn.apache.org/viewvc?rev=635259&view=rev
Log:
TIKA-126: Add Parser.parse(InputStream, Metadata) for metadata extraction
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Mar 9 08:28:24 2008
@@ -19,6 +19,9 @@
8. TIKA-112 - Use Commons IO 1.4 (Jukka Zitting)
+9. TIKA-126 - Add Parser.parse(InputStream, Metadata) for metadata extraction
+ (Jukka Zitting)
+
Release 0.1-incubating - 12/27/2007
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java?rev=635259&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java Sun Mar 9 08:28:24 2008
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Abstract parser base class. Contains a default implementation of the
+ * {@link #parse(InputStream, Metadata)} method.
+ */
+public abstract class AbstractParser implements Parser {
+
+ /**
+ * Calls the full
+ * {@link Parser#parse(InputStream, org.xml.sax.ContentHandler, Metadata)}
+ * method and keeps only the extracted metadata.
+ */
+ public void parse(InputStream stream, Metadata metadata)
+ throws IOException, TikaException {
+ try {
+ parse(stream, new DefaultHandler(), metadata);
+ } catch (SAXException e) {
+ throw new TikaException("Unexpected SAX error", e);
+ }
+ }
+
+}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Sun Mar 9 08:28:24 2008
@@ -30,7 +30,7 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-public class AutoDetectParser implements Parser {
+public class AutoDetectParser extends AbstractParser {
private TikaConfig config;
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java Sun Mar 9 08:28:24 2008
@@ -16,10 +16,8 @@
*/
package org.apache.tika.parser;
-import java.io.IOException;
import java.io.InputStream;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
@@ -32,9 +30,12 @@
*/
public class EmptyParser implements Parser {
+ public void parse(InputStream stream, Metadata metadata) {
+ }
+
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
+ throws SAXException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.endDocument();
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java Sun Mar 9 08:28:24 2008
@@ -30,6 +30,12 @@
public class ErrorParser implements Parser {
public void parse(
+ InputStream stream, Metadata metadata)
+ throws TikaException {
+ throw new TikaException("Parse error");
+ }
+
+ public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws TikaException {
throw new TikaException("Parse error");
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Sun Mar 9 08:28:24 2008
@@ -30,6 +30,20 @@
public interface Parser {
/**
+ * Parses document metadata from the given document stream.
+ * <p>
+ * The given document stream is consumed but not closed by this method.
+ * The responsibility to close the stream remains on the caller.
+ *
+ * @param stream the document stream (input)
+ * @param metadata document metadata (input and output)
+ * @throws IOException if the document stream could not be read
+ * @throws TikaException if the document could not be parsed
+ */
+ void parse(InputStream stream, Metadata metadata)
+ throws IOException, TikaException;
+
+ /**
* Parses a document stream into a sequence of XHTML SAX events.
* Fills in related document metadata in the given metadata object.
* <p>
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java Sun Mar 9 08:28:24 2008
@@ -30,7 +30,7 @@
* instance. Subclasses can provide extra decoration by overriding the
* parse method.
*/
-public class ParserDecorator implements Parser {
+public class ParserDecorator extends AbstractParser {
/**
* The decorated parser instance.
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Mar 9 08:28:24 2008
@@ -18,12 +18,13 @@
import java.io.IOException;
import java.io.InputStream;
-import java.io.Reader;
+import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AbstractParser;
import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.utils.Utils;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
@@ -33,34 +34,16 @@
/**
* Simple HTML parser that extracts title.
*/
-public class HtmlParser implements Parser {
+public class HtmlParser extends AbstractParser {
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata) throws IOException, SAXException, TikaException {
-
- final SAXParser parser = new SAXParser();
-
- final InputSource source;
-
- Reader utf8Reader;
-
- try {
- utf8Reader = org.apache.tika.utils.Utils.getUTF8Reader(
- stream, metadata);
- } catch (TikaException ex) {
- utf8Reader = null;
- }
-
- if (utf8Reader == null) {
- source = new InputSource(stream);
- } else {
- source = new InputSource(utf8Reader);
- }
-
-
- parser.setContentHandler(new TitleExtractingContentHandler(handler,
- metadata));
- parser.parse(source);
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ SAXParser parser = new SAXParser();
+ parser.setContentHandler(
+ new TitleExtractingContentHandler(handler, metadata));
+ parser.parse(new InputSource(Utils.getUTF8Reader(
+ new CloseShieldInputStream(stream), metadata)));
}
private static class TitleExtractingContentHandler extends
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar 9 08:28:24 2008
@@ -52,6 +52,31 @@
DocumentSummaryInformation.DEFAULT_STREAM_NAME;
/**
+ * Extracts properties from an MS Document input stream
+ */
+ public void parse(InputStream stream, Metadata metadata)
+ throws IOException, TikaException {
+ POIFSFileSystem filesystem = new POIFSFileSystem(stream);
+ Iterator<?> entries = filesystem.getRoot().getEntries();
+ while (entries.hasNext()) {
+ Entry entry = (Entry) entries.next();
+ String name = entry.getName();
+ if (!(entry instanceof DocumentEntry)) {
+ // Skip directory entries
+ } else if (SUMMARY_INFORMATION.equals(name)
+ || DOCUMENT_SUMMARY_INFORMATION.equals(name)) {
+ parse((DocumentEntry) entry, metadata);
+ } else if ("WordDocument".equals(name)) {
+ setType(metadata, "application/msword");
+ } else if ("PowerPoint Document".equals(name)) {
+ setType(metadata, "application/vnd.ms-powerpoint");
+ } else if ("Workbook".equals(name)) {
+ setType(metadata, "application/vnd.ms-excel");
+ }
+ }
+ }
+
+ /**
* Extracts properties and text from an MS Document input stream
*/
public void parse(
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Sun Mar 9 08:28:24 2008
@@ -28,7 +28,6 @@
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
/**
* OpenOffice parser
@@ -55,6 +54,21 @@
this.content = content;
}
+ public void parse(InputStream stream, Metadata metadata)
+ throws IOException, TikaException {
+ ZipInputStream zip = new ZipInputStream(stream);
+ ZipEntry entry = zip.getNextEntry();
+ while (entry != null) {
+ if (entry.getName().equals("mimetype")) {
+ String type = IOUtils.toString(zip, "UTF-8");
+ metadata.set(Metadata.CONTENT_TYPE, type);
+ } else if (entry.getName().equals("meta.xml")) {
+ meta.parse(zip, metadata);
+ }
+ entry = zip.getNextEntry();
+ }
+ }
+
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
@@ -65,7 +79,7 @@
String type = IOUtils.toString(zip, "UTF-8");
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals("meta.xml")) {
- meta.parse(zip, new DefaultHandler(), metadata);
+ meta.parse(zip, metadata);
} else if (entry.getName().equals("content.xml")) {
content.parse(zip, handler, metadata);
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Sun Mar 9 08:28:24 2008
@@ -34,6 +34,24 @@
*/
public class PDFParser implements Parser {
+ public void parse(InputStream stream, Metadata metadata)
+ throws IOException, TikaException {
+ PDDocument pdfDocument = PDDocument.load(stream);
+ try {
+ if (pdfDocument.isEncrypted()) {
+ try {
+ pdfDocument.decrypt("");
+ } catch (Exception e) {
+ // Ignore
+ }
+ }
+ metadata.add(Metadata.CONTENT_TYPE, "application/pdf");
+ extractMetadata(pdfDocument, metadata);
+ } finally {
+ pdfDocument.close();
+ }
+ }
+
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Sun Mar 9 08:28:24 2008
@@ -25,7 +25,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AbstractParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -33,7 +33,7 @@
/**
* RTF parser
*/
-public class RTFParser implements Parser {
+public class RTFParser extends AbstractParser {
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sun Mar 9 08:28:24 2008
@@ -22,7 +22,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AbstractParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.Utils;
import org.xml.sax.ContentHandler;
@@ -31,12 +31,11 @@
/**
* Text parser
*/
-public class TXTParser implements Parser {
+public class TXTParser extends AbstractParser {
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
-
Reader reader = Utils.getUTF8Reader(stream, metadata);
metadata.set(Metadata.CONTENT_TYPE, "text/plain");
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Sun Mar 9 08:28:24 2008
@@ -26,7 +26,7 @@
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AbstractParser;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
@@ -36,7 +36,7 @@
/**
* XML parser
*/
-public class XMLParser implements Parser {
+public class XMLParser extends AbstractParser {
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)