You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2010/01/08 17:44:57 UTC

svn commit: r897258 - in /poi/trunk/src: documentation/content/xdocs/ ooxml/java/org/apache/poi/extractor/ ooxml/testcases/org/apache/poi/extractor/ scratchpad/src/org/apache/poi/hsmf/extractor/

Author: nick
Date: Fri Jan  8 16:44:08 2010
New Revision: 897258

URL: http://svn.apache.org/viewvc?rev=897258&view=rev
Log:
Add embedded (attachment) support to the Outlook text extractor

Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Fri Jan  8 16:44:08 2010
@@ -34,7 +34,8 @@
 
     <changes>
         <release version="3.7-SNAPSHOT" date="2010-??-??">
-           <action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
+           <action dev="POI-DEVELOPERS" type="add">Support attachments as embeded documents within the new OutlookTextExtractor</action>
+           <action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
            <action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
            <action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
            <action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted()  </action>

Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Fri Jan  8 16:44:08 2010
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.extractor;
 
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
@@ -31,6 +32,8 @@
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
 import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
@@ -139,9 +142,14 @@
 			if(entry.getName().equals("VisioDocument")) {
 				return new VisioTextExtractor(poifsDir, fs);
 			}
-			if(entry.getName().equals("__substg1.0_1000001E") ||
+			if(
+			      entry.getName().equals("__substg1.0_1000001E") ||
+               entry.getName().equals("__substg1.0_1000001F") ||
 			      entry.getName().equals("__substg1.0_0047001E") ||
-			      entry.getName().equals("__substg1.0_0037001E")) {
+               entry.getName().equals("__substg1.0_0047001F") ||
+			      entry.getName().equals("__substg1.0_0037001E") ||
+               entry.getName().equals("__substg1.0_0037001F")
+			) {
 			   return new OutlookTextExtactor(poifsDir, fs);
 			}
 		}
@@ -157,8 +165,12 @@
 	 *  {@link POITextExtractor} for each embeded file.
 	 */
 	public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
-		// Find all the embeded directories
+	   // All the embded directories we spotted
 		ArrayList<Entry> dirs = new ArrayList<Entry>();
+		// For anything else not directly held in as a POIFS directory
+		ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+		
+      // Find all the embeded directories
 		POIFSFileSystem fs = ext.getFileSystem();
 		if(fs == null) {
 			throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
@@ -189,20 +201,44 @@
 		} else if(ext instanceof PowerPointExtractor) {
 			// Tricky, not stored directly in poifs
 			// TODO
+		} else if(ext instanceof OutlookTextExtactor) {
+		   // Stored in the Attachment blocks
+		   MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
+		   for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
+		      if(attachment.attachData != null) {
+   		      byte[] data = attachment.attachData.getValue();
+   		      nonPOIFS.add( new ByteArrayInputStream(data) );
+		      }
+		   }
 		}
 		
 		// Create the extractors
-		if(dirs == null || dirs.size() == 0) {
+		if(
+		      (dirs == null || dirs.size() == 0) &&
+		      (nonPOIFS == null || nonPOIFS.size() == 0)
+		){
 			return new POITextExtractor[0];
 		}
 		
-		POITextExtractor[] te = new POITextExtractor[dirs.size()];
-		for(int i=0; i<te.length; i++) {
-			te[i] = createExtractor(
+		ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
+		for(int i=0; i<dirs.size(); i++) {
+			e.add( createExtractor(
 					(DirectoryNode)dirs.get(i), ext.getFileSystem()
-			);
+			) );
+		}
+		for(int i=0; i<nonPOIFS.size(); i++) {
+		   try {
+		      e.add( createExtractor(nonPOIFS.get(i)) );
+         } catch(IllegalArgumentException ie) {
+            // Ignore, just means it didn't contain
+            //  a format we support as yet
+		   } catch(XmlException xe) {
+		      throw new IOException(xe.getMessage());
+		   } catch(OpenXML4JException oe) {
+		      throw new IOException(oe.getMessage());
+		   }
 		}
-		return te;
+		return e.toArray(new POITextExtractor[e.size()]);
 	}
 
 	/**

Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java Fri Jan  8 16:44:08 2010
@@ -59,6 +59,8 @@
    private File pptx;
 
    private File msg;
+   private File msgEmb;
+   
    private File vsd;
 
    protected void setUp() throws Exception {
@@ -86,6 +88,7 @@
       
       POIDataSamples olTests = POIDataSamples.getHSMFInstance();
       msg = olTests.getFile("quick.msg");
+      msgEmb = olTests.getFile("attachment_test_msg.msg");
    }
 
    public void testFile() throws Exception {
@@ -404,9 +407,25 @@
       assertEquals(1, numPpt);
       assertEquals(2, numXls);
       assertEquals(1, numWord);
+      
+      // Outlook
+      ext = (OutlookTextExtactor)
+      ExtractorFactory.createExtractor(msgEmb);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+      numWord = 0; numXls = 0; numPpt = 0;
+      assertEquals(1, embeds.length);
+      for(int i=0; i<embeds.length; i++) {
+         assertTrue(embeds[i].getText().length() > 20);
+         if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+         else if(embeds[i] instanceof ExcelExtractor) numXls++;
+         else if(embeds[i] instanceof WordExtractor) numWord++;
+      }
+      assertEquals(0, numPpt);
+      assertEquals(0, numXls);
+      assertEquals(1, numWord);
 
       // TODO - PowerPoint
       // TODO - Visio
-      // TODO - Outlook
    }
 }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java Fri Jan  8 16:44:08 2010
@@ -45,6 +45,13 @@
    }
 
    /**
+    * Returns the underlying MAPI message
+    */
+   public MAPIMessage getMAPIMessage() {
+      return (MAPIMessage)document;
+   }
+   
+   /**
     * Outputs something a little like a RFC822 email
     */
    public String getText() {



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org