You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/12 13:04:22 UTC
svn commit: r1034359 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/OfficeParser.java
test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
Author: maxcom
Date: Fri Nov 12 12:04:22 2010
New Revision: 1034359
URL: http://svn.apache.org/viewvc?rev=1034359&view=rev
Log:
TIKA-549: support for extracting OLE-shapes from PPT
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1034359&r1=1034358&r2=1034359&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Fri Nov 12 12:04:22 2010
@@ -19,15 +19,10 @@ package org.apache.tika.parser.microsoft
import java.io.IOException;
import java.io.InputStream;
import java.security.GeneralSecurityException;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
+import java.util.*;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
-import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.poifs.crypt.Decryptor;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
@@ -187,9 +182,7 @@ public class OfficeParser implements Par
new WordExtractor(context).parse(filesystem, xhtml);
break;
case POWERPOINT:
- PowerPointExtractor powerPointExtractor =
- new PowerPointExtractor(filesystem);
- xhtml.element("p", powerPointExtractor.getText(true, true));
+ new HSLFExtractor(context).parse(filesystem, xhtml);
break;
case WORKBOOK:
Locale locale = context.get(Locale.class, Locale.getDefault());
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1034359&r1=1034358&r2=1034359&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Nov 12 12:04:22 2010
@@ -134,24 +134,34 @@ public class POIContainerExtractionTest
// With recursion, should get the images embedded in the office files too
handler = process("testEXCEL_embeded.xls", extractor, true);
- assertEquals(6, handler.filenames.size());
- assertEquals(6, handler.mediaTypes.size());
+ assertEquals(12, handler.filenames.size());
+ assertEquals(12, handler.mediaTypes.size());
assertEquals(null, handler.filenames.get(0));
assertEquals(null, handler.filenames.get(1));
assertEquals(null, handler.filenames.get(2));
assertEquals(null, handler.filenames.get(3));
- assertEquals(null, handler.filenames.get(4));
- assertEquals("image1.png", handler.filenames.get(5));
-
+ assertEquals("1", handler.filenames.get(4));
+ assertEquals(null, handler.filenames.get(5));
+ assertEquals("2", handler.filenames.get(6));
+ assertEquals("image1.png", handler.filenames.get(7));
+ assertEquals("image2.jpg", handler.filenames.get(8));
+ assertEquals("image3.png", handler.filenames.get(9));
+ assertEquals("image1.png", handler.filenames.get(11));
+
assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
- assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
- assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .doc
-
-
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); // Embedded image
+
// Word with .docx, powerpoint and excel
handler = process("testWORD_embeded.doc", extractor, false);
assertEquals(9, handler.filenames.size());