You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/09 16:28:27 UTC

svn commit: r635259 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/ src/main/java/org/apache/tika/parser/html/ src/main/java/org/apache/tika/parser/microsoft/ src/main/java/org/apache/tika/parser/opendocument/ src/main/java/org/apa...

Author: jukka
Date: Sun Mar  9 08:28:24 2008
New Revision: 635259

URL: http://svn.apache.org/viewvc?rev=635259&view=rev
Log:
TIKA-126: Add Parser.parse(InputStream, Metadata) for metadata extraction

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Mar  9 08:28:24 2008
@@ -19,6 +19,9 @@
 
 8. TIKA-112 - Use Commons IO 1.4 (Jukka Zitting)
 
+9. TIKA-126 - Add Parser.parse(InputStream, Metadata) for metadata extraction
+              (Jukka Zitting)
+
 
 Release 0.1-incubating - 12/27/2007
 

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java?rev=635259&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java Sun Mar  9 08:28:24 2008
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Abstract parser base class. Contains a default implementation of the
+ * {@link #parse(InputStream, Metadata)} method.
+ */
+public abstract class AbstractParser implements Parser {
+
+    /**
+     * Calls the full
+     * {@link Parser#parse(InputStream, org.xml.sax.ContentHandler, Metadata)}
+     * method and keeps only the extracted metadata.
+     */
+    public void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        try {
+            parse(stream, new DefaultHandler(), metadata);
+        } catch (SAXException e) {
+            throw new TikaException("Unexpected SAX error", e);
+        }
+    }
+
+}

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Sun Mar  9 08:28:24 2008
@@ -30,7 +30,7 @@
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-public class AutoDetectParser implements Parser {
+public class AutoDetectParser extends AbstractParser {
 
     private TikaConfig config;
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java Sun Mar  9 08:28:24 2008
@@ -16,10 +16,8 @@
  */
 package org.apache.tika.parser;
 
-import java.io.IOException;
 import java.io.InputStream;
 
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
@@ -32,9 +30,12 @@
  */
 public class EmptyParser implements Parser {
 
+    public void parse(InputStream stream, Metadata metadata) {
+    }
+
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
-            throws IOException, SAXException, TikaException {
+            throws SAXException {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
         xhtml.endDocument();

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java Sun Mar  9 08:28:24 2008
@@ -30,6 +30,12 @@
 public class ErrorParser implements Parser {
 
     public void parse(
+            InputStream stream, Metadata metadata)
+            throws TikaException {
+        throw new TikaException("Parse error");
+    }
+
+    public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws TikaException {
         throw new TikaException("Parse error");

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Sun Mar  9 08:28:24 2008
@@ -30,6 +30,20 @@
 public interface Parser {
 
     /**
+     * Parses document metadata from the given document stream.
+     * <p>
+     * The given document stream is consumed but not closed by this method.
+     * The responsibility to close the stream remains on the caller.
+     *
+     * @param stream the document stream (input)
+     * @param metadata document metadata (input and output)
+     * @throws IOException if the document stream could not be read
+     * @throws TikaException if the document could not be parsed
+     */
+    void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException;
+
+    /**
      * Parses a document stream into a sequence of XHTML SAX events.
      * Fills in related document metadata in the given metadata object.
      * <p>

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java Sun Mar  9 08:28:24 2008
@@ -30,7 +30,7 @@
  * instance. Subclasses can provide extra decoration by overriding the
  * parse method.
  */
-public class ParserDecorator implements Parser {
+public class ParserDecorator extends AbstractParser {
 
     /**
      * The decorated parser instance.

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Mar  9 08:28:24 2008
@@ -18,12 +18,13 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.Reader;
 
+import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.utils.Utils;
 import org.cyberneko.html.parsers.SAXParser;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
@@ -33,34 +34,16 @@
 /**
  * Simple HTML parser that extracts title.
  */
-public class HtmlParser implements Parser {
+public class HtmlParser extends AbstractParser {
 
-    public void parse(InputStream stream, ContentHandler handler,
-            Metadata metadata) throws IOException, SAXException, TikaException {
-
-        final SAXParser parser = new SAXParser();
-
-        final InputSource source;
-
-        Reader utf8Reader;
-        
-        try {
-            utf8Reader = org.apache.tika.utils.Utils.getUTF8Reader(
-                    stream, metadata);
-        } catch (TikaException ex) {
-            utf8Reader = null;
-        }
-
-        if (utf8Reader == null) {
-            source = new InputSource(stream);
-        } else {
-            source = new InputSource(utf8Reader);
-        }
-
-        
-        parser.setContentHandler(new TitleExtractingContentHandler(handler,
-                metadata));
-        parser.parse(source);
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        SAXParser parser = new SAXParser();
+        parser.setContentHandler(
+                new TitleExtractingContentHandler(handler, metadata));
+        parser.parse(new InputSource(Utils.getUTF8Reader(
+                new CloseShieldInputStream(stream), metadata)));
     }
 
     private static class TitleExtractingContentHandler extends

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar  9 08:28:24 2008
@@ -52,6 +52,31 @@
         DocumentSummaryInformation.DEFAULT_STREAM_NAME;
 
     /**
+     * Extracts properties from an MS Document input stream
+     */
+    public void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        POIFSFileSystem filesystem = new POIFSFileSystem(stream);
+        Iterator<?> entries = filesystem.getRoot().getEntries();
+        while (entries.hasNext()) {
+            Entry entry = (Entry) entries.next();
+            String name = entry.getName();
+            if (!(entry instanceof DocumentEntry)) {
+                // Skip directory entries
+            } else if (SUMMARY_INFORMATION.equals(name)
+                    || DOCUMENT_SUMMARY_INFORMATION.equals(name)) {
+                parse((DocumentEntry) entry, metadata);
+            } else if ("WordDocument".equals(name)) {
+                setType(metadata, "application/msword");
+            } else if ("PowerPoint Document".equals(name)) {
+                setType(metadata, "application/vnd.ms-powerpoint");
+            } else if ("Workbook".equals(name)) {
+                setType(metadata, "application/vnd.ms-excel");
+            }
+        }
+    }
+
+    /**
      * Extracts properties and text from an MS Document input stream
      */
     public void parse(

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Sun Mar  9 08:28:24 2008
@@ -28,7 +28,6 @@
 import org.apache.tika.parser.Parser;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * OpenOffice parser
@@ -55,6 +54,21 @@
         this.content = content;
     }
 
+    public void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        ZipInputStream zip = new ZipInputStream(stream);
+        ZipEntry entry = zip.getNextEntry();
+        while (entry != null) {
+            if (entry.getName().equals("mimetype")) {
+                String type = IOUtils.toString(zip, "UTF-8");
+                metadata.set(Metadata.CONTENT_TYPE, type);
+            } else if (entry.getName().equals("meta.xml")) {
+                meta.parse(zip, metadata);
+            }
+            entry = zip.getNextEntry();
+        }
+    }
+
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
@@ -65,7 +79,7 @@
                 String type = IOUtils.toString(zip, "UTF-8");
                 metadata.set(Metadata.CONTENT_TYPE, type);
             } else if (entry.getName().equals("meta.xml")) {
-                meta.parse(zip, new DefaultHandler(), metadata);
+                meta.parse(zip, metadata);
             } else if (entry.getName().equals("content.xml")) {
                 content.parse(zip, handler, metadata);
             }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Sun Mar  9 08:28:24 2008
@@ -34,6 +34,24 @@
  */
 public class PDFParser implements Parser {
 
+    public void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        PDDocument pdfDocument = PDDocument.load(stream);
+        try {
+            if (pdfDocument.isEncrypted()) {
+                try {
+                    pdfDocument.decrypt("");
+                } catch (Exception e) {
+                    // Ignore
+                }
+            }
+            metadata.add(Metadata.CONTENT_TYPE, "application/pdf");
+            extractMetadata(pdfDocument, metadata);
+        } finally {
+            pdfDocument.close();
+        }
+    }
+
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Sun Mar  9 08:28:24 2008
@@ -25,7 +25,7 @@
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -33,7 +33,7 @@
 /**
  * RTF parser
  */
-public class RTFParser implements Parser {
+public class RTFParser extends AbstractParser {
 
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sun Mar  9 08:28:24 2008
@@ -22,7 +22,7 @@
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.Utils;
 import org.xml.sax.ContentHandler;
@@ -31,12 +31,11 @@
 /**
  * Text parser
  */
-public class TXTParser implements Parser {
+public class TXTParser extends AbstractParser {
 
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
-        
         Reader reader = Utils.getUTF8Reader(stream, metadata);
         metadata.set(Metadata.CONTENT_TYPE, "text/plain");
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=635259&r1=635258&r2=635259&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Sun Mar  9 08:28:24 2008
@@ -26,7 +26,7 @@
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.sax.TextContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
@@ -36,7 +36,7 @@
 /**
  * XML parser
  */
-public class XMLParser implements Parser {
+public class XMLParser extends AbstractParser {
 
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)