You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2010/12/16 08:39:21 UTC

svn commit: r1049802 - in /poi/trunk: src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java test-data/document/word_with_embeded_ooxml.doc

Author: nick
Date: Thu Dec 16 07:39:21 2010
New Revision: 1049802

URL: http://svn.apache.org/viewvc?rev=1049802&view=rev
Log:
Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them

Added:
    poi/trunk/test-data/document/word_with_embeded_ooxml.doc   (with props)
Modified:
    poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java

Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=1049802&r1=1049801&r2=1049802&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Thu Dec 16 07:39:21 2010
@@ -191,10 +191,11 @@ public class ExtractorFactory {
        throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
 	}
 	
-	public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
-		return createExtractor(fs.getRoot(), fs);
+	public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+	   // Only ever an OLE2 one from the root of the FS
+		return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
 	}
-	public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
+	public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
 		// Look for certain entries in the stream, to figure it
 		//  out from
 		for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
@@ -234,6 +235,12 @@ public class ExtractorFactory {
 			) {
 			   return new OutlookTextExtactor(poifsDir, fs);
 			}
+			if(entry.getName().equals("Package")) {
+			   OPCPackage pkg = OPCPackage.open(
+			         poifsDir.createDocumentInputStream(entry.getName())
+			   );
+			   return createExtractor(pkg);
+			}
 		}
 		throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
 	}
@@ -246,7 +253,7 @@ public class ExtractorFactory {
 	 *  empty array. Otherwise, you'll get one open 
 	 *  {@link POITextExtractor} for each embeded file.
 	 */
-	public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+	public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
 	   // All the embded directories we spotted
 		ArrayList<Entry> dirs = new ArrayList<Entry>();
 		// For anything else not directly held in as a POIFS directory

Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=1049802&r1=1049801&r2=1049802&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java Thu Dec 16 07:39:21 2010
@@ -60,6 +60,7 @@ public class TestExtractorFactory extend
    private File docx;
    private File dotx;
    private File docEmb;
+   private File docEmbOOXML;
 
    private File ppt;
    private File pptx;
@@ -88,6 +89,7 @@ public class TestExtractorFactory extend
       docx = wpTests.getFile("SampleDoc.docx");
       dotx = wpTests.getFile("test.dotx");
       docEmb = wpTests.getFile("word_with_embeded.doc");
+      docEmbOOXML = wpTests.getFile("word_with_embeded_ooxml.doc");
 
       POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
       ppt = slTests.getFile("SampleShow.ppt");
@@ -536,7 +538,7 @@ public class TestExtractorFactory extend
       embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
 
       assertEquals(6, embeds.length);
-      int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0;
+      int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
       for(int i=0; i<embeds.length; i++) {
          assertTrue(embeds[i].getText().length() > 20);
 
@@ -569,6 +571,27 @@ public class TestExtractorFactory extend
       assertEquals(1, numWord);
       assertEquals(0, numMsg);
       
+      // Word which contains an OOXML file
+      ext = (POIOLE2TextExtractor)
+      ExtractorFactory.createExtractor(docEmbOOXML);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+      numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
+      assertEquals(3, embeds.length);
+      for(int i=0; i<embeds.length; i++) {
+         assertTrue(embeds[i].getText().length() > 20);
+         if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+         else if(embeds[i] instanceof ExcelExtractor) numXls++;
+         else if(embeds[i] instanceof WordExtractor) numWord++;
+         else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
+         else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
+      }
+      assertEquals(1, numPpt);
+      assertEquals(1, numXls);
+      assertEquals(0, numWord);
+      assertEquals(1, numWordX);
+      assertEquals(0, numMsg);
+      
       // Outlook
       ext = (OutlookTextExtactor)
       ExtractorFactory.createExtractor(msgEmb);

Added: poi/trunk/test-data/document/word_with_embeded_ooxml.doc
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/document/word_with_embeded_ooxml.doc?rev=1049802&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/document/word_with_embeded_ooxml.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org