You are viewing a plain text version of this content. The canonical link is not included in this plain text version.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/12 13:04:22 UTC

svn commit: r1034359 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/OfficeParser.java test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java

Author: maxcom
Date: Fri Nov 12 12:04:22 2010
New Revision: 1034359

URL: http://svn.apache.org/viewvc?rev=1034359&view=rev
Log:
TIKA-549: support for extracting OLE-shapes from PPT

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1034359&r1=1034358&r2=1034359&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Fri Nov 12 12:04:22 2010
@@ -19,15 +19,10 @@ package org.apache.tika.parser.microsoft
 import java.io.IOException;
 import java.io.InputStream;
 import java.security.GeneralSecurityException;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
+import java.util.*;
 
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
-import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.poifs.crypt.Decryptor;
 import org.apache.poi.poifs.crypt.EncryptionInfo;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
@@ -187,9 +182,7 @@ public class OfficeParser implements Par
                     new WordExtractor(context).parse(filesystem, xhtml);
                     break;
                 case POWERPOINT:
-                    PowerPointExtractor powerPointExtractor =
-                        new PowerPointExtractor(filesystem);
-                    xhtml.element("p", powerPointExtractor.getText(true, true));
+                    new HSLFExtractor(context).parse(filesystem, xhtml);
                     break;
                 case WORKBOOK:
                     Locale locale = context.get(Locale.class, Locale.getDefault());

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1034359&r1=1034358&r2=1034359&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Nov 12 12:04:22 2010
@@ -134,24 +134,34 @@ public class POIContainerExtractionTest 
        
        // With recursion, should get the images embedded in the office files too
        handler = process("testEXCEL_embeded.xls", extractor, true);
-       assertEquals(6, handler.filenames.size());
-       assertEquals(6, handler.mediaTypes.size());
+       assertEquals(12, handler.filenames.size());
+       assertEquals(12, handler.mediaTypes.size());
        
        assertEquals(null, handler.filenames.get(0));
        assertEquals(null, handler.filenames.get(1));
        assertEquals(null, handler.filenames.get(2));
        assertEquals(null, handler.filenames.get(3));
-       assertEquals(null, handler.filenames.get(4));
-       assertEquals("image1.png", handler.filenames.get(5));
-       
+       assertEquals("1", handler.filenames.get(4));
+       assertEquals(null, handler.filenames.get(5));
+       assertEquals("2", handler.filenames.get(6));
+       assertEquals("image1.png", handler.filenames.get(7));
+       assertEquals("image2.jpg", handler.filenames.get(8));
+       assertEquals("image3.png", handler.filenames.get(9));
+       assertEquals("image1.png", handler.filenames.get(11));
+
        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); //   PNG inside .doc
-       
-       
+       assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
+       assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+       assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
+       assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+       assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); // Embedded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); // Embedded image
+
        // Word with .docx, powerpoint and excel
        handler = process("testWORD_embeded.doc", extractor, false);
        assertEquals(9, handler.filenames.size());