You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/30 12:51:47 UTC

svn commit: r1003003 - /tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java

Author: nick
Date: Thu Sep 30 10:51:47 2010
New Revision: 1003003

URL: http://svn.apache.org/viewvc?rev=1003003&view=rev
Log:
TIKA-519 - Display embedded images in the GUI Formatted Text pane where they occur in the document.
Applies updated patch from TIKA-519 as discussed

Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1003003&r1=1003002&r2=1003003&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Thu Sep 30 10:51:47 2010
@@ -17,12 +17,17 @@
 package org.apache.tika.gui;
 
 import java.awt.Dimension;
+import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PrintWriter;
 import java.io.StringWriter;
 import java.io.Writer;
 import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
 
 import javax.swing.JEditorPane;
 import javax.swing.JFrame;
@@ -38,7 +43,11 @@ import javax.xml.transform.sax.SAXTransf
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -50,6 +59,7 @@ import org.apache.tika.sax.XHTMLContentH
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 /**
  * Simple Swing GUI for Apache Tika. You can drag and drop files on top
@@ -88,6 +98,11 @@ public class TikaGUI extends JFrame {
      * Configured parser instance.
      */
     private final Parser parser;
+    
+    /**
+     * Captures requested embedded images
+     */
+    private final ImageSavingParser imageParser;
 
     /**
      * Tabs in the Tika GUI window.
@@ -143,7 +158,10 @@ public class TikaGUI extends JFrame {
 
         this.context = new ParseContext();
         this.parser = parser;
-        this.context.set(Parser.class, parser);
+        
+        this.imageParser = new ImageSavingParser(parser);
+        this.context.set(DocumentSelector.class, new ImageDocumentSelector());
+        this.context.set(Parser.class, imageParser);
     }
 
    public void importStream(InputStream input, Metadata md)
@@ -160,6 +178,8 @@ public class TikaGUI extends JFrame {
                     getTextContentHandler(textBuffer),
                     getTextMainContentHandler(textMainBuffer),
                     getXmlContentHandler(xmlBuffer));
+            
+            context.set(DocumentSelector.class, new ImageDocumentSelector());
 
             input = new ProgressMonitorInputStream(
                     this, "Parsing stream", input);
@@ -229,6 +249,9 @@ public class TikaGUI extends JFrame {
      * generating a <META> content type tag that makes
      * {@link JEditorPane} fail thinking that the document character set
      * is inconsistent.
+     * <p>
+     * Additionally, it will use ImageSavingParser to re-write embedded:(image) 
+     * image links to be file:///(temporary file) so that they can be loaded.
      *
      * @param writer output writer
      * @return HTML content handler
@@ -250,7 +273,34 @@ public class TikaGUI extends JFrame {
                     uri = null;
                 }
                 if (!"head".equals(localName)) {
-                    super.startElement(uri, localName, name, atts);
+                    if("img".equals(localName)) {
+                       AttributesImpl newAttrs;
+                       if(atts instanceof AttributesImpl) {
+                          newAttrs = (AttributesImpl)atts;
+                       } else {
+                          newAttrs = new AttributesImpl(atts);
+                       }
+                       
+                       for(int i=0; i<newAttrs.getLength(); i++) {
+                          if("src".equals(newAttrs.getLocalName(i))) {
+                             String src = newAttrs.getValue(i);
+                             if(src.startsWith("embedded:")) {
+                                String filename = src.substring(src.indexOf(':')+1);
+                                try {
+                                   File img = imageParser.requestSave(filename);
+                                   String newSrc = img.toURI().toString();
+                                   newAttrs.setValue(i, newSrc);
+                                } catch(IOException e) {
+                                   System.err.println("Error creating temp image file " + filename);
+                                   // The html viewer will show a broken image too to alert them
+                                }
+                             }
+                          }
+                       }
+                       super.startElement(uri, localName, name, newAttrs);
+                    } else {
+                       super.startElement(uri, localName, name, atts);
+                    }
                 }
             }
             @Override
@@ -289,4 +339,65 @@ public class TikaGUI extends JFrame {
         return handler;
     }
 
+    /** A {@link DocumentSelector} that accepts a document only when its
+     *  declared content type starts with "image/". */
+    private static class ImageDocumentSelector implements DocumentSelector {
+      public boolean select(Metadata metadata) {
+         String mimeType = metadata.get(Metadata.CONTENT_TYPE);
+         boolean hasType = mimeType != null;
+         return hasType && mimeType.startsWith("image/");
+      }
+    }
+    
+    /**
+     * A recursive parser that saves requested embedded images to temporary
+     *  files, and delegates everything else to another downstream parser.
+     */
+    private static class ImageSavingParser implements Parser {
+      private final Map<String,File> wanted = new HashMap<String,File>();
+      private final Parser downstreamParser;
+
+      private ImageSavingParser(Parser downstreamParser) {
+         this.downstreamParser = downstreamParser;
+      }
+
+      /**
+       * Reserves a temporary file, deleted at JVM exit, which parse() fills
+       *  in once it meets the embedded resource called embeddedName.
+       */
+      public File requestSave(String embeddedName) throws IOException {
+         int dot = embeddedName.lastIndexOf('.'); // -1: name has no extension
+         String suffix = (dot < 0) ? null : embeddedName.substring(dot);
+         File tmp = File.createTempFile("tika-embedded-", suffix);
+         tmp.deleteOnExit();
+         wanted.put(embeddedName, tmp);
+         return tmp;
+      }
+
+      public Set<MediaType> getSupportedTypes(ParseContext context) {
+         // Never used in an auto setup
+         return null;
+      }
+
+      public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+         String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+         if(name != null && wanted.containsKey(name)) { // a requested image
+            FileOutputStream out = new FileOutputStream(wanted.get(name));
+            try {
+               IOUtils.copy(stream, out);
+            } finally {
+               out.close(); // release the handle even when the copy fails
+            }
+         } else if(downstreamParser != null) { // not requested: delegate
+            downstreamParser.parse(stream, handler, metadata, context);
+         }
+      }
+
+      public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+         parse(stream, handler, metadata, new ParseContext());
+      }
+    }
 }