You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/11 14:31:20 UTC

svn commit: r1033937 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/extractor/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/

Author: maxcom
Date: Thu Nov 11 13:31:20 2010
New Revision: 1033937

URL: http://svn.apache.org/viewvc?rev=1033937&view=rev
Log:
Extract interface for EmbeddedDocumentExtractor

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java?rev=1033937&r1=1033936&r2=1033937&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java Thu Nov 11 13:31:20 2010
@@ -14,62 +14,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.extractor;
-
-import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
 
-import java.io.File;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.io.InputStream;
+package org.apache.tika.extractor;
 
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.DocumentSelector;
-import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.DelegatingParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.EmbeddedContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Helper class for parsers of package archives or other compound document
- * formats that support embedded or attached component documents.
- *
- * @since Apache Tika 0.8
- */
-public class EmbeddedDocumentExtractor {
-
-    private static final File ABSTRACT_PATH = new File("");
-
-    private static final Parser DELEGATING_PARSER = new DelegatingParser();
 
-    private final ParseContext context;
-
-    public EmbeddedDocumentExtractor(ParseContext context) {
-        this.context = context;
-    }
-
-    public boolean shouldParseEmbedded(Metadata metadata) {
-        DocumentSelector selector = context.get(DocumentSelector.class);
-        if (selector != null) {
-            return selector.select(metadata);
-        }
-
-        FilenameFilter filter = context.get(FilenameFilter.class);
-        if (filter != null) {
-            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-            if (name != null) {
-                return filter.accept(ABSTRACT_PATH, name);
-            }
-        }
+import java.io.IOException;
+import java.io.InputStream;
 
-        return true;
-    }
+public interface EmbeddedDocumentExtractor {
+    boolean shouldParseEmbedded(Metadata metadata);
 
     /**
      * Processes the supplied embedded resource, calling the delegating
@@ -78,39 +34,10 @@ public class EmbeddedDocumentExtractor {
      * @param handler The handler to use
      * @param metadata The metadata for the embedded resource
      * @param outputHtml Should we output HTML for this resource, or has the parser already done so?
-     * @throws SAXException
-     * @throws IOException
+     * @throws org.xml.sax.SAXException
+     * @throws java.io.IOException
      */
-    public void parseEmbedded(
+    void parseEmbedded(
             InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
-            throws SAXException, IOException {
-        if(outputHtml) {
-           AttributesImpl attributes = new AttributesImpl();
-           attributes.addAttribute("", "class", "class", "CDATA", "package-entry");
-           handler.startElement(XHTML, "div", "div", attributes);
-        }
-
-        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-        if (name != null && name.length() > 0 && outputHtml) {
-            handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
-            char[] chars = name.toCharArray();
-            handler.characters(chars, 0, chars.length);
-            handler.endElement(XHTML, "h1", "h1");
-        }
-
-        // Use the delegate parser to parse this entry
-        try {
-            DELEGATING_PARSER.parse(
-                    new CloseShieldInputStream(stream),
-                    new EmbeddedContentHandler(new BodyContentHandler(handler)),
-                    metadata, context);
-        } catch (TikaException e) {
-            // Could not parse the entry, just skip the content
-        }
-
-        if(outputHtml) {
-           handler.endElement(XHTML, "div", "div");
-        }
-    }
-
+            throws SAXException, IOException;
 }

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java?rev=1033937&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java Thu Nov 11 13:31:20 2010
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.DelegatingParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Helper class for parsers of package archives or other compound document
+ * formats that support embedded or attached component documents.
+ *
+ * @since Apache Tika 0.8
+ */
+public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
+
+    private static final File ABSTRACT_PATH = new File("");
+
+    private static final Parser DELEGATING_PARSER = new DelegatingParser();
+
+    private final ParseContext context;
+
+    public ParsingEmbeddedDocumentExtractor(ParseContext context) {
+        this.context = context;
+    }
+
+    public boolean shouldParseEmbedded(Metadata metadata) {
+        DocumentSelector selector = context.get(DocumentSelector.class);
+        if (selector != null) {
+            return selector.select(metadata);
+        }
+
+        FilenameFilter filter = context.get(FilenameFilter.class);
+        if (filter != null) {
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                return filter.accept(ABSTRACT_PATH, name);
+            }
+        }
+
+        return true;
+    }
+
+    public void parseEmbedded(
+            InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
+            throws SAXException, IOException {
+        if(outputHtml) {
+           AttributesImpl attributes = new AttributesImpl();
+           attributes.addAttribute("", "class", "class", "CDATA", "package-entry");
+           handler.startElement(XHTML, "div", "div", attributes);
+        }
+
+        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name != null && name.length() > 0 && outputHtml) {
+            handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
+            char[] chars = name.toCharArray();
+            handler.characters(chars, 0, chars.length);
+            handler.endElement(XHTML, "h1", "h1");
+        }
+
+        // Use the delegate parser to parse this entry
+        try {
+            DELEGATING_PARSER.parse(
+                    new CloseShieldInputStream(stream),
+                    new EmbeddedContentHandler(new BodyContentHandler(handler)),
+                    metadata, context);
+        } catch (TikaException e) {
+            // Could not parse the entry, just skip the content
+        }
+
+        if(outputHtml) {
+           handler.endElement(XHTML, "div", "div");
+        }
+    }
+
+}

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1033937&r1=1033936&r2=1033937&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Thu Nov 11 13:31:20 2010
@@ -30,6 +30,7 @@ import org.apache.poi.poifs.filesystem.P
 import org.apache.tika.detect.ZipContainerDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -43,7 +44,13 @@ abstract class AbstractPOIFSExtractor {
     private final EmbeddedDocumentExtractor extractor;
 
     protected AbstractPOIFSExtractor(ParseContext context) {
-        this.extractor = new EmbeddedDocumentExtractor(context);
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+        if (ex==null) {
+            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
+        } else {
+            this.extractor = ex;
+        }
     }
     
     protected void handleEmbeddedResource(TikaInputStream resource, String filename,

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java?rev=1033937&r1=1033936&r2=1033937&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java Thu Nov 11 13:31:20 2010
@@ -31,6 +31,7 @@ import org.apache.commons.compress.compr
 import org.apache.commons.compress.compressors.gzip.GzipUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
@@ -53,7 +54,15 @@ class PackageExtractor {
             ContentHandler handler, Metadata metadata, ParseContext context) {
         this.handler = handler;
         this.metadata = metadata;
-        this.extractor = new EmbeddedDocumentExtractor(context);
+
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+        if (ex==null) {
+            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
+        } else {
+            this.extractor = ex;
+        }
+
     }
 
     public void parse(InputStream stream)



Re: svn commit: r1033937 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/extractor/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/

Posted by Nick Burch <ni...@alfresco.com>.
On Thu, 11 Nov 2010, Maxim Valyanskiy wrote:
> So I need to create JIRA issue before commit?

Yup. If it's a major change, or you're not sure about the route to take,
post the patch for review on the jira first. If it's a smaller change (e.g.
the scope of this one), create the jira before you start coding, then go
ahead and commit when it's ready. We can then use the jira entry for
post-commit review and discussions.

Or at least that's what I'm currently doing :)

Nick

Re: svn commit: r1033937 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/extractor/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/

Posted by "Mattmann, Chris A (388J)" <ch...@jpl.nasa.gov>.
BTW: that said, thanks for taking the time to implement this functionality – it looks great and of course I’m +1 for making it easier for you guys to use Tika in your company!

Cheers,
Chris


On 11/11/10 6:38 AM, "Maxim Valyanskiy" <ma...@jet.msk.su> wrote:

Hello!

11.11.2010 17:05, Jukka Zitting пишет:
> Log:
>> Extract interface for EmbeddedDocumentExtractor

We have a POI-based utility that extracts all embedded files (attachments, pictures,
etc.) from different file formats. This utility takes an arbitrary file and
returns a ZIP archive with all attachments.

This utility duplicates the embedded-file processing functionality in Tika. I'm
trying to convert my tool from POI to Tika. I think this will improve both my
unpacker utility and Tika.

I needed a way to replace the concrete implementation of EmbeddedDocumentExtractor
with my own implementation that copies attachments outside of the ContentHandler, so I
split that class into an interface and an implementation.

> It would be good if all non-trivial commit messages contained a
> reference to a relevant issue in Jira for better context of why
> particular changes are being made.
>
> Nick correctly noted earlier that we should write such conventions up
> somewhere. I'll try to find time to draft something for review.
>
So I need to create a JIRA issue before committing?

best wishes, Max



++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Chris Mattmann, Ph.D.
Senior Computer Scientist
NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA
Office: 171-266B, Mailstop: 171-246
Email: Chris.Mattmann@jpl.nasa.gov
WWW:   http://sunset.usc.edu/~mattmann/
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adjunct Assistant Professor, Computer Science Department
University of Southern California, Los Angeles, CA 90089 USA
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Re: svn commit: r1033937 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/extractor/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/

Posted by "Mattmann, Chris A (388J)" <ch...@jpl.nasa.gov>.
Hi Max,

>
> We have POI-based utility that extracts all embedded files (attachments,
> pictures
> and etc) from different file formats. This utility takes arbitrary file and
> returns ZIP-archive with all attachments.
> 
> This utility duplicates functionality of embedded file processing in Tika. I'm
> trying to convert my tool from POI to Tika. I think this will make better both
> my
> unpacker utility and Tika.
> 
> I needed a way to replace concrete implementation of EmbeddedDocumentExtractor
> with my own implementation that copies attachments outside of ContentHandler,
> so I
> splitted interface and implementation of that class.

Perfect! I think that the point is that the above should have gone into a
JIRA issue *before* committing to SVN. That way you can include the relevant
JIRA issue in your log message and there is something to tie it back to,
besides a commit message, or this type of email to an M/L.

> 
>> It would be good if all non-trivial commit messages contained a
>> reference to a relevant issue in Jira for better context of why
>> particular changes are being made.
>> 
>> Nick correctly noted earlier that we should write such conventions up
>> somewhere. I'll try to find time to draft something for review.
>> 
> So I need to create JIRA issue before commit?

Well not all commits are created equal of course. I'd say anything
non-trivial, that is, more than a typo fix, doc update, etc., should require
a JIRA issue before committing it.

Cheers,
Chris

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Chris Mattmann, Ph.D.
Senior Computer Scientist
NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA
Office: 171-266B, Mailstop: 171-246
Email: Chris.Mattmann@jpl.nasa.gov
WWW:   http://sunset.usc.edu/~mattmann/
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adjunct Assistant Professor, Computer Science Department
University of Southern California, Los Angeles, CA 90089 USA
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++



Re: svn commit: r1033937 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/extractor/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/

Posted by Maxim Valyanskiy <ma...@jet.msk.su>.
Hello!

11.11.2010 17:05, Jukka Zitting пишет:
> Log:
>> Extract interface for EmbeddedDocumentExtractor

We have a POI-based utility that extracts all embedded files (attachments, pictures,
etc.) from different file formats. This utility takes an arbitrary file and
returns a ZIP archive with all attachments.

This utility duplicates the embedded-file processing functionality in Tika. I'm
trying to convert my tool from POI to Tika. I think this will improve both my
unpacker utility and Tika.

I needed a way to replace the concrete implementation of EmbeddedDocumentExtractor
with my own implementation that copies attachments outside of the ContentHandler, so I
split that class into an interface and an implementation.

> It would be good if all non-trivial commit messages contained a
> reference to a relevant issue in Jira for better context of why
> particular changes are being made.
>
> Nick correctly noted earlier that we should write such conventions up
> somewhere. I'll try to find time to draft something for review.
>
So I need to create a JIRA issue before committing?

best wishes, Max

Re: svn commit: r1033937 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/extractor/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/

Posted by Jukka Zitting <ju...@gmail.com>.
Hi,

On Thu, Nov 11, 2010 at 3:31 PM,  <ma...@apache.org> wrote:
> Log:
> Extract interface for EmbeddedDocumentExtractor

It would be good if all non-trivial commit messages contained a
reference to a relevant issue in Jira for better context of why
particular changes are being made.

Nick correctly noted earlier that we should write such conventions up
somewhere. I'll try to find time to draft something for review.

BR,

Jukka Zitting