You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2010/01/08 17:44:57 UTC
svn commit: r897258 - in /poi/trunk/src: documentation/content/xdocs/
ooxml/java/org/apache/poi/extractor/
ooxml/testcases/org/apache/poi/extractor/
scratchpad/src/org/apache/poi/hsmf/extractor/
Author: nick
Date: Fri Jan 8 16:44:08 2010
New Revision: 897258
URL: http://svn.apache.org/viewvc?rev=897258&view=rev
Log:
Add embedded (attachment) support to the Outlook text extractor
Modified:
poi/trunk/src/documentation/content/xdocs/status.xml
poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Fri Jan 8 16:44:08 2010
@@ -34,7 +34,8 @@
<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
- <action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
+ <action dev="POI-DEVELOPERS" type="add">Support attachments as embedded documents within the new OutlookTextExtractor</action>
+ <action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
<action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action>
Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Fri Jan 8 16:44:08 2010
@@ -16,6 +16,7 @@
==================================================================== */
package org.apache.poi.extractor;
+import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
@@ -31,6 +32,8 @@
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
@@ -139,9 +142,14 @@
if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs);
}
- if(entry.getName().equals("__substg1.0_1000001E") ||
+ if(
+ entry.getName().equals("__substg1.0_1000001E") ||
+ entry.getName().equals("__substg1.0_1000001F") ||
entry.getName().equals("__substg1.0_0047001E") ||
- entry.getName().equals("__substg1.0_0037001E")) {
+ entry.getName().equals("__substg1.0_0047001F") ||
+ entry.getName().equals("__substg1.0_0037001E") ||
+ entry.getName().equals("__substg1.0_0037001F")
+ ) {
return new OutlookTextExtactor(poifsDir, fs);
}
}
@@ -157,8 +165,12 @@
* {@link POITextExtractor} for each embeded file.
*/
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
- // Find all the embeded directories
+ // All the embedded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>();
+ // For anything else not held directly as a POIFS directory
+ ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+
+ // Find all the embeded directories
POIFSFileSystem fs = ext.getFileSystem();
if(fs == null) {
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
@@ -189,20 +201,44 @@
} else if(ext instanceof PowerPointExtractor) {
// Tricky, not stored directly in poifs
// TODO
+ } else if(ext instanceof OutlookTextExtactor) {
+ // Stored in the Attachment blocks
+ MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
+ for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
+ if(attachment.attachData != null) {
+ byte[] data = attachment.attachData.getValue();
+ nonPOIFS.add( new ByteArrayInputStream(data) );
+ }
+ }
}
// Create the extractors
- if(dirs == null || dirs.size() == 0) {
+ if(
+ (dirs == null || dirs.size() == 0) &&
+ (nonPOIFS == null || nonPOIFS.size() == 0)
+ ){
return new POITextExtractor[0];
}
- POITextExtractor[] te = new POITextExtractor[dirs.size()];
- for(int i=0; i<te.length; i++) {
- te[i] = createExtractor(
+ ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
+ for(int i=0; i<dirs.size(); i++) {
+ e.add( createExtractor(
(DirectoryNode)dirs.get(i), ext.getFileSystem()
- );
+ ) );
+ }
+ for(int i=0; i<nonPOIFS.size(); i++) {
+ try {
+ e.add( createExtractor(nonPOIFS.get(i)) );
+ } catch(IllegalArgumentException ie) {
+ // Ignore, just means it didn't contain
+ // a format we support as yet
+ } catch(XmlException xe) {
+ throw new IOException(xe.getMessage());
+ } catch(OpenXML4JException oe) {
+ throw new IOException(oe.getMessage());
+ }
}
- return te;
+ return e.toArray(new POITextExtractor[e.size()]);
}
/**
Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java Fri Jan 8 16:44:08 2010
@@ -59,6 +59,8 @@
private File pptx;
private File msg;
+ private File msgEmb;
+
private File vsd;
protected void setUp() throws Exception {
@@ -86,6 +88,7 @@
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = olTests.getFile("quick.msg");
+ msgEmb = olTests.getFile("attachment_test_msg.msg");
}
public void testFile() throws Exception {
@@ -404,9 +407,25 @@
assertEquals(1, numPpt);
assertEquals(2, numXls);
assertEquals(1, numWord);
+
+ // Outlook
+ ext = (OutlookTextExtactor)
+ ExtractorFactory.createExtractor(msgEmb);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ numWord = 0; numXls = 0; numPpt = 0;
+ assertEquals(1, embeds.length);
+ for(int i=0; i<embeds.length; i++) {
+ assertTrue(embeds[i].getText().length() > 20);
+ if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+ else if(embeds[i] instanceof ExcelExtractor) numXls++;
+ else if(embeds[i] instanceof WordExtractor) numWord++;
+ }
+ assertEquals(0, numPpt);
+ assertEquals(0, numXls);
+ assertEquals(1, numWord);
// TODO - PowerPoint
// TODO - Visio
- // TODO - Outlook
}
}
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java Fri Jan 8 16:44:08 2010
@@ -45,6 +45,13 @@
}
/**
+ * Returns the underlying MAPI message
+ */
+ public MAPIMessage getMAPIMessage() {
+ return (MAPIMessage)document;
+ }
+
+ /**
* Outputs something a little like a RFC822 email
*/
public String getText() {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org