You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2010/01/08 17:14:36 UTC

svn commit: r897246 - in /poi/trunk/src: ooxml/java/org/apache/poi/extractor/ ooxml/testcases/org/apache/poi/extractor/ scratchpad/src/org/apache/poi/hsmf/ scratchpad/src/org/apache/poi/hsmf/extractor/

Author: nick
Date: Fri Jan  8 16:14:27 2010
New Revision: 897246

URL: http://svn.apache.org/viewvc?rev=897246&view=rev
Log:
Wire up the new HSMFTextExtactor to the ExtractorFactory

Modified:
    poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java

Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=897246&r1=897245&r2=897246&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Fri Jan  8 16:14:27 2010
@@ -31,6 +31,7 @@
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -138,6 +139,11 @@
 			if(entry.getName().equals("VisioDocument")) {
 				return new VisioTextExtractor(poifsDir, fs);
 			}
+			if(entry.getName().equals("__substg1.0_1000001E") ||
+			      entry.getName().equals("__substg1.0_0047001E") ||
+			      entry.getName().equals("__substg1.0_0037001E")) {
+			   return new HSMFTextExtactor(poifsDir, fs);
+			}
 		}
 		throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
 	}

Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=897246&r1=897245&r2=897246&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java Fri Jan  8 16:14:27 2010
@@ -25,6 +25,7 @@
 import org.apache.poi.POIDataSamples;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -42,132 +43,145 @@
  */
 public class TestExtractorFactory extends TestCase {
 
-	private File txt;
-	
-	private File xls;
-	private File xlsx;
-    private File xltx;
-    private File xlsEmb;
-
-	private File doc;
-	private File docx;
-    private File dotx;
-    private File docEmb;
-
-	private File ppt;
-	private File pptx;
-	
-	private File vsd;
-
-	protected void setUp() throws Exception {
-		super.setUp();
-		
-        POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
-        xls = ssTests.getFile("SampleSS.xls");
-		xlsx = ssTests.getFile("SampleSS.xlsx");
-        xltx = ssTests.getFile("test.xltx");
-        xlsEmb = ssTests.getFile("excel_with_embeded.xls");
-
-        POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
-		doc = wpTests.getFile("SampleDoc.doc");
-		docx = wpTests.getFile("SampleDoc.docx");
-        dotx = wpTests.getFile("test.dotx");
-        docEmb = wpTests.getFile("word_with_embeded.doc");
-
-        POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
-		ppt = slTests.getFile("SampleShow.ppt");
-		pptx = slTests.getFile("SampleShow.pptx");
-        txt = slTests.getFile("SampleShow.txt");
-
-        POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
-		vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
-	}
-
-	public void testFile() throws Exception {
-		// Excel
-		assertTrue(
-				ExtractorFactory.createExtractor(xls)
-				instanceof ExcelExtractor
-		);
-		assertTrue(
-				ExtractorFactory.createExtractor(xls).getText().length() > 200
-		);
-		
-		assertTrue(
-				ExtractorFactory.createExtractor(xlsx)
-				instanceof XSSFExcelExtractor
-		);
-		assertTrue(
-				ExtractorFactory.createExtractor(xlsx).getText().length() > 200
-		);
-
-                assertTrue(
-                                ExtractorFactory.createExtractor(xltx)
-                                instanceof XSSFExcelExtractor
-                );
-                assertTrue(
-                                ExtractorFactory.createExtractor(xltx).getText().contains("test")
-                );
-
-		
-		// Word
-		assertTrue(
-				ExtractorFactory.createExtractor(doc)
-				instanceof WordExtractor
-		);
-		assertTrue(
-				ExtractorFactory.createExtractor(doc).getText().length() > 120
-		);
-		
-		assertTrue(
-				ExtractorFactory.createExtractor(docx)
-				instanceof XWPFWordExtractor
-		);
-		assertTrue(
-				ExtractorFactory.createExtractor(docx).getText().length() > 120
-		);
-
-                assertTrue(
-                                ExtractorFactory.createExtractor(dotx)
-                                instanceof XWPFWordExtractor
-                );
-                assertTrue(
-                                ExtractorFactory.createExtractor(dotx).getText().contains("Test")
-                );
+   private File txt;
 
-		// PowerPoint
-		assertTrue(
-				ExtractorFactory.createExtractor(ppt)
-				instanceof PowerPointExtractor
-		);
-		assertTrue(
-				ExtractorFactory.createExtractor(ppt).getText().length() > 120
-		);
-		
-		assertTrue(
-				ExtractorFactory.createExtractor(pptx)
-				instanceof XSLFPowerPointExtractor
-		);
-		assertTrue(
-				ExtractorFactory.createExtractor(pptx).getText().length() > 120
-		);
-		
-		// Visio
-		assertTrue(
-				ExtractorFactory.createExtractor(vsd)
-				instanceof VisioTextExtractor
-		);
-		assertTrue(
-				ExtractorFactory.createExtractor(vsd).getText().length() > 50
-		);
-		
-		// Text
-		try {
-			ExtractorFactory.createExtractor(txt);
-			fail();
-		} catch(IllegalArgumentException e) {
-			// Good
-		}
+   private File xls;
+   private File xlsx;
+   private File xltx;
+   private File xlsEmb;
+
+   private File doc;
+   private File docx;
+   private File dotx;
+   private File docEmb;
+
+   private File ppt;
+   private File pptx;
+
+   private File msg;
+   private File vsd;
+
+   protected void setUp() throws Exception {
+      super.setUp();
+
+      POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
+      xls = ssTests.getFile("SampleSS.xls");
+      xlsx = ssTests.getFile("SampleSS.xlsx");
+      xltx = ssTests.getFile("test.xltx");
+      xlsEmb = ssTests.getFile("excel_with_embeded.xls");
+
+      POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
+      doc = wpTests.getFile("SampleDoc.doc");
+      docx = wpTests.getFile("SampleDoc.docx");
+      dotx = wpTests.getFile("test.dotx");
+      docEmb = wpTests.getFile("word_with_embeded.doc");
+
+      POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
+      ppt = slTests.getFile("SampleShow.ppt");
+      pptx = slTests.getFile("SampleShow.pptx");
+      txt = slTests.getFile("SampleShow.txt");
+
+      POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
+      vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
+      
+      POIDataSamples olTests = POIDataSamples.getHSMFInstance();
+      msg = olTests.getFile("quick.msg");
+   }
+
+   public void testFile() throws Exception {
+      // Excel
+      assertTrue(
+            ExtractorFactory.createExtractor(xls)
+            instanceof ExcelExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(xls).getText().length() > 200
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(xlsx)
+            instanceof XSSFExcelExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(xlsx).getText().length() > 200
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(xltx)
+            instanceof XSSFExcelExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(xltx).getText().contains("test")
+      );
+
+
+      // Word
+      assertTrue(
+            ExtractorFactory.createExtractor(doc)
+            instanceof WordExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(doc).getText().length() > 120
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(docx)
+            instanceof XWPFWordExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(docx).getText().length() > 120
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(dotx)
+            instanceof XWPFWordExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(dotx).getText().contains("Test")
+      );
+
+      // PowerPoint
+      assertTrue(
+            ExtractorFactory.createExtractor(ppt)
+            instanceof PowerPointExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(ppt).getText().length() > 120
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(pptx)
+            instanceof XSLFPowerPointExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(pptx).getText().length() > 120
+      );
+
+      // Visio
+      assertTrue(
+            ExtractorFactory.createExtractor(vsd)
+            instanceof VisioTextExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(vsd).getText().length() > 50
+      );
+      
+      // Outlook msg
+      assertTrue(
+            ExtractorFactory.createExtractor(msg)
+            instanceof HSMFTextExtactor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(msg).getText().length() > 50
+      );
+
+      // Text
+      try {
+         ExtractorFactory.createExtractor(txt);
+         fail();
+      } catch(IllegalArgumentException e) {
+         // Good
+      }
 	}
 	
 	public void testInputStream() throws Exception {
@@ -231,6 +245,15 @@
 				ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
 		);
 		
+		// Outlook msg
+      assertTrue(
+            ExtractorFactory.createExtractor(new FileInputStream(msg))
+            instanceof HSMFTextExtactor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
+      );
+		
 		// Text
 		try {
 			ExtractorFactory.createExtractor(new FileInputStream(txt));
@@ -277,6 +300,15 @@
 				ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
 		);
 		
+      // Outlook msg
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
+            instanceof HSMFTextExtactor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
+      );
+      
 		// Text
 		try {
 			ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
@@ -323,57 +355,58 @@
 		}
 	}
 
-	/**
-	 * Test embeded docs text extraction. For now, only
-	 *  does poifs embeded, but will do ooxml ones 
-	 *  at some point.
-	 */
-	public void testEmbeded() throws Exception {
-		POIOLE2TextExtractor ext;
-		POITextExtractor[] embeds;
-
-		// No embedings
-		ext = (POIOLE2TextExtractor)
-				ExtractorFactory.createExtractor(xls);
-		embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-		assertEquals(0, embeds.length);
-		
-		// Excel
-		ext = (POIOLE2TextExtractor)
-				ExtractorFactory.createExtractor(xlsEmb);
-		embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
-		assertEquals(6, embeds.length);
-		int numWord = 0, numXls = 0, numPpt = 0;
-        for(int i=0; i<embeds.length; i++) {
-			assertTrue(embeds[i].getText().length() > 20);
-
-            if(embeds[i] instanceof PowerPointExtractor) numPpt++;
-            else if(embeds[i] instanceof ExcelExtractor) numXls++;
-            else if(embeds[i] instanceof WordExtractor) numWord++;
-        }
-		assertEquals(2, numPpt);
-        assertEquals(2, numXls);
-        assertEquals(2, numWord);
-
-        // Word
-		ext = (POIOLE2TextExtractor)
-				ExtractorFactory.createExtractor(docEmb);
-		embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-		
-        numWord = 0; numXls = 0; numPpt = 0;
-		assertEquals(4, embeds.length);
-		for(int i=0; i<embeds.length; i++) {
-			assertTrue(embeds[i].getText().length() > 20);
-            if(embeds[i] instanceof PowerPointExtractor) numPpt++;
-            else if(embeds[i] instanceof ExcelExtractor) numXls++;
-            else if(embeds[i] instanceof WordExtractor) numWord++;
-		}
-        assertEquals(1, numPpt);
-        assertEquals(2, numXls);
-        assertEquals(1, numWord);
-
-		// TODO - PowerPoint
-		// TODO - Visio
-	}
+   /**
+    * Test embedded docs text extraction. For now, this only
+    *  covers POIFS embedded docs, but will cover OOXML ones
+    *  at some point.
+    */
+   public void testEmbeded() throws Exception {
+      POIOLE2TextExtractor ext;
+      POITextExtractor[] embeds;
+
+      // No embeddings
+      ext = (POIOLE2TextExtractor)
+      ExtractorFactory.createExtractor(xls);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+      assertEquals(0, embeds.length);
+
+      // Excel
+      ext = (POIOLE2TextExtractor)
+      ExtractorFactory.createExtractor(xlsEmb);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+      assertEquals(6, embeds.length);
+      int numWord = 0, numXls = 0, numPpt = 0;
+      for(int i=0; i<embeds.length; i++) {
+         assertTrue(embeds[i].getText().length() > 20);
+
+         if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+         else if(embeds[i] instanceof ExcelExtractor) numXls++;
+         else if(embeds[i] instanceof WordExtractor) numWord++;
+      }
+      assertEquals(2, numPpt);
+      assertEquals(2, numXls);
+      assertEquals(2, numWord);
+
+      // Word
+      ext = (POIOLE2TextExtractor)
+      ExtractorFactory.createExtractor(docEmb);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+      numWord = 0; numXls = 0; numPpt = 0;
+      assertEquals(4, embeds.length);
+      for(int i=0; i<embeds.length; i++) {
+         assertTrue(embeds[i].getText().length() > 20);
+         if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+         else if(embeds[i] instanceof ExcelExtractor) numXls++;
+         else if(embeds[i] instanceof WordExtractor) numWord++;
+      }
+      assertEquals(1, numPpt);
+      assertEquals(2, numXls);
+      assertEquals(1, numWord);
+
+      // TODO - PowerPoint
+      // TODO - Visio
+      // TODO - Outlook
+   }
 }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java?rev=897246&r1=897245&r2=897246&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java Fri Jan  8 16:14:27 2010
@@ -34,6 +34,7 @@
 import org.apache.poi.hsmf.datatypes.StringChunk;
 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
 import org.apache.poi.hsmf.parsers.POIFSChunkParser;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -78,15 +79,24 @@
 	   this(new POIFSFileSystem(in));
 	}
    /**
-    * Constructor for reading MSG Files from an input stream.
+    * Constructor for reading MSG Files from a POIFS filesystem
     * @param in
     * @throws IOException
     */
    public MAPIMessage(POIFSFileSystem fs) throws IOException {
-		super(fs);
-		
+		this(fs.getRoot(), fs);
+   }
+   /**
+    * Constructor for reading MSG Files from a certain
+    *  point within a POIFS filesystem
+    * @param poifsDir the directory within the filesystem to parse the message chunks from
+    * @throws IOException
+    */
+   public MAPIMessage(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
+      super(poifsDir, fs);
+      
 		// Grab all the chunks
-		ChunkGroup[] chunkGroups = POIFSChunkParser.parse(fs);
+		ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir);
 		
 		// Grab interesting bits
 		ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>();

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java?rev=897246&r1=897245&r2=897246&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java Fri Jan  8 16:14:27 2010
@@ -23,12 +23,16 @@
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hsmf.MAPIMessage;
 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 public class HSMFTextExtactor extends POIOLE2TextExtractor {
    public HSMFTextExtactor(MAPIMessage msg) {
       super(msg);
    }
+   public HSMFTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
+      this(new MAPIMessage(poifsDir, fs));
+   }
    public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
       this(new MAPIMessage(fs));
    }



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org