You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text version.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/09/11 21:55:11 UTC

svn commit: r814005 - in /lucene/tika/trunk: tika-core/src/main/java/org/apache/tika/config/ tika-core/src/main/java/org/apache/tika/parser/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/

Author: jukka
Date: Fri Sep 11 19:55:08 2009
New Revision: 814005

URL: http://svn.apache.org/viewvc?rev=814005&view=rev
Log:
TIKA-275: Parse context

Update DelegatingParser and the package parsers to use the new context mechanism.

Modify AutoDetectParser and the configuration reader to use the parsing context instead of DelegatingParser.setDelegate() to pass around the delegate parser.

Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Fri Sep 11 19:55:08 2009
@@ -30,7 +30,6 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.mime.MimeTypesFactory;
-import org.apache.tika.parser.DelegatingParser;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.Parser;
 import org.w3c.dom.Document;
@@ -104,10 +103,6 @@
                 Class<?> parserClass = Class.forName(name);
                 Parser parser = (Parser) parserClass.newInstance();
 
-                if (delegate != null && parser instanceof DelegatingParser) {
-                    ((DelegatingParser) parser).setDelegate(delegate);
-                }
-
                 NodeList mimes = node.getElementsByTagName("mime");
                 for (int j = 0; j < mimes.getLength(); j++) {
                     parsers.put(getText(mimes.item(j)).trim(), parser);

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java Fri Sep 11 19:55:08 2009
@@ -19,6 +19,8 @@
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
@@ -85,7 +87,8 @@
     }
 
     public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Map<String, Object> context)
             throws IOException, SAXException, TikaException {
         // We need buffering to enable MIME magic detection before parsing
         if (!stream.markSupported()) {
@@ -102,7 +105,7 @@
 
         // Parse the document
         try {
-            super.parse(count, secure, metadata);
+            super.parse(count, secure, metadata, context);
         } catch (SAXException e) {
             // Convert zip bomb exceptions to TikaExceptions
             secure.throwIfCauseOf(e);
@@ -110,4 +113,12 @@
         }
     }
 
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        Map<String, Object> context = new HashMap<String, Object>();
+        context.put(Parser.class.getName(), this);
+        parse(stream, handler, metadata, context);
+    }
+
 }

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java Fri Sep 11 19:55:08 2009
@@ -28,62 +28,41 @@
 
 /**
  * Base class for parser implementations that want to delegate parts of the
- * task of parsing an input document to another parser. The default base
- * class implementation simply delegates the entire parsing task to a dummy
- * {@link EmptyParser} instance, but subclasses can implement more complex
- * processing rules and a more complete delegate parser can be specified
- * through the {@link #setDelegate(Parser)} method.
+ * task of parsing an input document to another parser. The delegate parser
+ * is looked up from the parsing context.
  * <p>
- * The Tika configuration mechanism also contains a way to automatically
- * set the delegate parser of all configured delegating parsers
- * implementations. This feature is most notably used by the
- * {@link AutoDetectParser} class to make it the recursive target of all
- * delegated parsing tasks.
+ * This class uses the following parsing context:
+ * <dl>
+ *   <dt>org.apache.tika.parser.Parser</dt>
+ *   <dd>
+ *     The delegate parser ({@link Parser} instance).
+ *   </dd>
+ * </dl>
  *
- * @since Apache Tika 0.4
+ * @since Apache Tika 0.4, major changes in Tika 0.5
  */
 public class DelegatingParser implements Parser {
 
     /**
-     * The parser to which parts of the parsing tasks are delegated.
-     */
-    private transient Parser delegate = new EmptyParser();
-
-    /**
-     * Returns delegate parser instance.
-     *
-     * @return delegate parser
-     */
-    public Parser getDelegate() {
-        return delegate;
-    }
-
-    /**
-     * Sets the delegate parser instance.
-     *
-     * @param delegate delegate parser
-     */
-    public void setDelegate(Parser delegate) {
-        if (delegate == null) {
-            throw new NullPointerException(
-                    "Delegate parser of " + this + " can not be null");
-        } else {
-            this.delegate = delegate;
-        }
-    }
-
-    /**
-     * Parses the given document using the specified delegate parser.
-     * Subclasses should override this method with more complex delegation
-     * rules based on the structure of the input document. The default
-     * implementation simply delegates the entire parsing task to the
-     * specified delegate parser.
+     * Looks up the delegate parser from the parsing context and
+     * delegates the parse operation to it. If a delegate parser is not
+     * found, then an empty XHTML document is returned.
+     * <p>
+     * Subclasses should override this method to parse the top level
+     * structure of the given document stream. Parsed sub-streams can
+     * be passed to this base class method to be parsed by the configured
+     * delegate parser.
      */
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, Map<String, Object> context)
             throws SAXException, IOException, TikaException {
-        delegate.parse(stream, handler, metadata, context);
+        Object parser = context.get(Parser.class.getName());
+        if (parser instanceof Parser) {
+            ((Parser) parser).parse(stream, handler, metadata, context);
+        } else {
+            new EmptyParser().parse(stream, handler, metadata, context);
+        }
     }
 
     /**

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java Fri Sep 11 19:55:08 2009
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Map;
 
 import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
 import org.apache.tika.exception.TikaException;
@@ -35,14 +36,15 @@
      * Parses the given stream as an ar archive.
      */
     public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Map<String, Object> context)
             throws IOException, TikaException, SAXException {
         // At the end we want to close the ar stream to release any associated
         // resources, but the underlying document stream should not be closed
         ArArchiveInputStream ar =
             new ArArchiveInputStream(new CloseShieldInputStream(stream));
         try {
-            parseArchive(ar, handler, metadata);
+            parseArchive(ar, handler, metadata, context);
         } finally {
             ar.close();
         }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java Fri Sep 11 19:55:08 2009
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Map;
 
 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 import org.apache.tika.exception.TikaException;
@@ -39,7 +40,8 @@
      * Parses the given stream as a bzip2 file.
      */
     public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Map<String, Object> context)
             throws IOException, SAXException, TikaException {
         metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
 
@@ -70,7 +72,7 @@
                     new CloseShieldInputStream(bzip2),
                     new EmbeddedContentHandler(
                             new BodyContentHandler(xhtml)),
-                    entrydata);
+                    entrydata, context);
         } finally {
             bzip2.close();
         }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java Fri Sep 11 19:55:08 2009
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Map;
 
 import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
 import org.apache.tika.exception.TikaException;
@@ -35,14 +36,15 @@
      * Parses the given stream as a cpio file.
      */
     public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Map<String, Object> context)
             throws IOException, TikaException, SAXException {
         // At the end we want to close the cpio stream to release any associated
         // resources, but the underlying document stream should not be closed
         CpioArchiveInputStream cpio =
             new CpioArchiveInputStream(new CloseShieldInputStream(stream));
         try {
-            parseArchive(cpio, handler, metadata);
+            parseArchive(cpio, handler, metadata, context);
         } finally {
             cpio.close();
         }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java Fri Sep 11 19:55:08 2009
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Map;
 import java.util.zip.GZIPInputStream;
 
 import org.apache.commons.compress.compressors.gzip.GzipUtils;
@@ -40,7 +41,8 @@
      * Parses the given stream as a gzip file.
      */
     public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Map<String, Object> context)
             throws IOException, SAXException, TikaException {
         metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
 
@@ -64,7 +66,7 @@
                     new CloseShieldInputStream(gzip),
                     new EmbeddedContentHandler(
                             new BodyContentHandler(xhtml)),
-                    entrydata);
+                    entrydata, context);
         } finally {
             gzip.close();
         }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Fri Sep 11 19:55:08 2009
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Map;
 
 import org.apache.commons.compress.archivers.ArchiveEntry;
 import org.apache.commons.compress.archivers.ArchiveInputStream;
@@ -56,7 +57,8 @@
      * @throws SAXException if a SAX error occurs
      */
     protected void parseArchive(
-            ArchiveInputStream archive, ContentHandler handler, Metadata metadata)
+            ArchiveInputStream archive, ContentHandler handler,
+            Metadata metadata, Map<String, Object> context)
             throws IOException, SAXException {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
@@ -77,7 +79,7 @@
                             new CloseShieldInputStream(archive),
                             new EmbeddedContentHandler(
                                     new BodyContentHandler(xhtml)),
-                            entrydata);
+                            entrydata, context);
                 } catch (TikaException e) {
                     // Could not parse the entry, just skip the content
                 }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java Fri Sep 11 19:55:08 2009
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Map;
 
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 import org.apache.tika.exception.TikaException;
@@ -35,7 +36,8 @@
      * Parses the given stream as a tar file.
      */
     public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Map<String, Object> context)
             throws IOException, TikaException, SAXException {
         metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
 
@@ -44,7 +46,7 @@
         TarArchiveInputStream tar =
             new TarArchiveInputStream(new CloseShieldInputStream(stream));
         try {
-            parseArchive(tar, handler, metadata);
+            parseArchive(tar, handler, metadata, context);
         } finally {
             tar.close();
         }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java?rev=814005&r1=814004&r2=814005&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java Fri Sep 11 19:55:08 2009
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Map;
 
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.tika.exception.TikaException;
@@ -35,7 +36,8 @@
      * Parses the given stream as a Zip file.
      */
     public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Map<String, Object> context)
             throws IOException, TikaException, SAXException {
         metadata.set(Metadata.CONTENT_TYPE, "application/zip");
 
@@ -44,7 +46,7 @@
         ZipArchiveInputStream zip =
             new ZipArchiveInputStream(new CloseShieldInputStream(stream));
         try {
-            parseArchive(zip, handler, metadata);
+            parseArchive(zip, handler, metadata, context);
         } finally {
             zip.close();
         }