You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ki...@apache.org on 2020/08/13 21:08:25 UTC
svn commit: r1880839 [1/3] - in /poi/trunk/src:
integrationtest/org/apache/poi/stress/ java/org/apache/poi/extractor/
java/org/apache/poi/hpsf/extractor/ java/org/apache/poi/hssf/extractor/
java/org/apache/poi/sl/extractor/ java/org/apache/poi/ss/extra...
Author: kiwiwings
Date: Thu Aug 13 21:08:24 2020
New Revision: 1880839
URL: http://svn.apache.org/viewvc?rev=1880839&view=rev
Log:
#64411 - Provide JigSaw modules
- rework extractors - see bugzilla entry for more information
Added:
poi/trunk/src/java/org/apache/poi/extractor/ExtractorFactory.java
- copied, changed from r1880838, poi/trunk/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
poi/trunk/src/java/org/apache/poi/extractor/ExtractorProvider.java (with props)
poi/trunk/src/java/org/apache/poi/extractor/MainExtractorFactory.java (with props)
poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java
- copied, changed from r1880838, poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/ExtractorFactory.java
poi/trunk/src/ooxml/java/org/apache/poi/xslf/extractor/
poi/trunk/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFExtractor.java (with props)
poi/trunk/src/resources/main/META-INF/services/org.apache.poi.extractor.ExtractorProvider
poi/trunk/src/resources/ooxml/META-INF/services/org.apache.poi.extractor.ExtractorProvider
- copied, changed from r1880689, poi/trunk/src/resources/main/META-INF/services/org.apache.poi.ss.usermodel.WorkbookProvider
poi/trunk/src/resources/scratchpad/META-INF/
poi/trunk/src/resources/scratchpad/META-INF/services/
poi/trunk/src/resources/scratchpad/META-INF/services/org.apache.poi.extractor.ExtractorProvider
Removed:
poi/trunk/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/ExtractorFactory.java
poi/trunk/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
Modified:
poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
poi/trunk/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java
poi/trunk/src/java/org/apache/poi/extractor/POITextExtractor.java
poi/trunk/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java
poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
poi/trunk/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java
poi/trunk/src/java/org/apache/poi/ss/extractor/ExcelExtractor.java
poi/trunk/src/multimodule/ooxml/java9/module-info.class
poi/trunk/src/multimodule/ooxml/java9/module-info.java
poi/trunk/src/multimodule/ooxml/test9/module-info.class
poi/trunk/src/multimodule/ooxml/test9/module-info.java
poi/trunk/src/multimodule/poi/java9/module-info.class
poi/trunk/src/multimodule/poi/java9/module-info.java
poi/trunk/src/multimodule/poi/test9/module-info.class
poi/trunk/src/multimodule/poi/test9/module-info.java
poi/trunk/src/multimodule/scratchpad/java9/module-info.class
poi/trunk/src/multimodule/scratchpad/java9/module-info.java
poi/trunk/src/multimodule/scratchpad/test9/module-info.class
poi/trunk/src/multimodule/scratchpad/test9/module-info.java
poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/CommandLineTextExtractor.java
poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLPropertiesTextExtractor.java
poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLTextExtractor.java
poi/trunk/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java
poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFBEventBasedExcelExtractor.java
poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java
poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/ooxml/TestExtractorFactory.java
poi/trunk/src/ooxml/testcases/org/apache/poi/openxml4j/opc/TestPackage.java
poi/trunk/src/ooxml/testcases/org/apache/poi/poifs/crypt/tests/TestHxxFEncryption.java
poi/trunk/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java
poi/trunk/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java
poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractorUsingFactory.java
poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractorUsingFactory.java
poi/trunk/src/scratchpad/src/org/apache/poi/extractor/ole2/OLE2ScratchpadExtractorFactory.java
poi/trunk/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java
poi/trunk/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java
poi/trunk/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtractor.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestFixedSizedProperties.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java
poi/trunk/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java
poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java Thu Aug 13 21:08:24 2020
@@ -29,11 +29,11 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.ss.extractor.ExcelExtractor;
import org.apache.poi.util.IOUtils;
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java Thu Aug 13 21:08:24 2020
@@ -23,7 +23,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@@ -37,11 +37,11 @@ public class XSLFFileHandler extends Sli
assertNotNull(slideInner.getPresentation());
assertNotNull(slideInner.getSlideMasterReferences());
assertNotNull(slideInner.getSlideReferences());
-
+
new POIXMLDocumentHandler().handlePOIXMLDocument(slide);
handleSlideShow(slide);
-
+
slideInner.close();
slide.close();
}
@@ -49,11 +49,12 @@ public class XSLFFileHandler extends Sli
@Override
public void handleExtracting(File file) throws Exception {
super.handleExtracting(file);
-
-
+
+
// additionally try the other getText() methods
- try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
+ //noinspection rawtypes
+ try (SlideShowExtractor extractor = (SlideShowExtractor) ExtractorFactory.createExtractor(file)) {
assertNotNull(extractor);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
Copied: poi/trunk/src/java/org/apache/poi/extractor/ExtractorFactory.java (from r1880838, poi/trunk/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java)
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/extractor/ExtractorFactory.java?p2=poi/trunk/src/java/org/apache/poi/extractor/ExtractorFactory.java&p1=poi/trunk/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java&r1=1880838&r2=1880839&rev=1880839&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java (original)
+++ poi/trunk/src/java/org/apache/poi/extractor/ExtractorFactory.java Thu Aug 13 21:08:24 2020
@@ -16,30 +16,33 @@
==================================================================== */
package org.apache.poi.extractor;
-import static org.apache.poi.hssf.model.InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME;
-import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
+import static org.apache.poi.hssf.record.crypto.Biff8EncryptionKey.getCurrentUserPassword;
+import static org.apache.poi.poifs.crypt.EncryptionInfo.ENCRYPTION_INFO_ENTRY;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
-import java.lang.reflect.Method;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
+import java.util.ServiceLoader;
+import java.util.stream.StreamSupport;
-import org.apache.poi.hssf.OldExcelFormatException;
-import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
+import org.apache.poi.EmptyFileException;
import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.poifs.crypt.Decryptor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* Figures out the correct POIOLE2TextExtractor for your supplied
* document, and returns it.
- *
+ *
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
* not present on the runtime classpath</p>
* <p>Note 2 - for text extractor creation across all formats, use
@@ -49,16 +52,29 @@ import org.apache.poi.util.POILogger;
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
*/
@SuppressWarnings({"WeakerAccess", "JavadocReference"})
-public final class OLE2ExtractorFactory {
- private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
-
+public final class ExtractorFactory {
+ private static final POILogger LOGGER = POILogFactory.getLogger(ExtractorFactory.class);
+
/** Should this thread prefer event based over usermodel based extractors? */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);
/** Should all threads prefer event based over usermodel based extractors? */
private static Boolean allPreferEventExtractors;
- private OLE2ExtractorFactory() {
+
+ private static class Singleton {
+ private static final ExtractorFactory INSTANCE = new ExtractorFactory();
+ }
+
+ private interface ProviderMethod {
+ POITextExtractor create(ExtractorProvider prov) throws IOException;
+ }
+
+ private final List<ExtractorProvider> provider = new ArrayList<>();
+
+
+ private ExtractorFactory() {
+ ServiceLoader.load(ExtractorProvider.class).forEach(provider::add);
}
/**
@@ -110,63 +126,80 @@ public final class OLE2ExtractorFactory
* @return If the current thread should use event based extractors.
*/
public static boolean getPreferEventExtractor() {
- if(allPreferEventExtractors != null) {
- return allPreferEventExtractors;
- }
- return threadPreferEventExtractors.get();
+ return (allPreferEventExtractors != null) ? allPreferEventExtractors : threadPreferEventExtractors.get();
}
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
- return (T)createExtractor(fs.getRoot());
+ public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
+ return createExtractor(fs, getCurrentUserPassword());
}
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
- Class<?> cls = getOOXMLClass();
- if (cls != null) {
- // Use Reflection to get us the full OOXML-enabled version
- try {
- Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
- return (T)m.invoke(null, input);
- } catch (IllegalArgumentException iae) {
- throw iae;
- } catch (Exception e) {
- throw new IllegalArgumentException("Error creating Extractor for InputStream", e);
- }
- } else {
- // Best hope it's OLE2....
- return createExtractor(new POIFSFileSystem(input));
- }
+ public static POITextExtractor createExtractor(POIFSFileSystem fs, String password) throws IOException {
+ return createExtractor(fs.getRoot(), password);
}
- private static Class<?> getOOXMLClass() {
- try {
- return OLE2ExtractorFactory.class.getClassLoader().loadClass(
- "org.apache.poi.extractor.ExtractorFactory"
- );
- } catch (ClassNotFoundException e) {
- LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
- return null;
+ public static POITextExtractor createExtractor(InputStream input) throws IOException {
+ return createExtractor(input, getCurrentUserPassword());
+ }
+
+ public static POITextExtractor createExtractor(InputStream input, String password) throws IOException {
+ final InputStream is = FileMagic.prepareToCheckMagic(input);
+ byte[] emptyFileCheck = new byte[1];
+ is.mark(emptyFileCheck.length);
+ if (is.read(emptyFileCheck) < emptyFileCheck.length) {
+ throw new EmptyFileException();
+ }
+ is.reset();
+
+ final FileMagic fm = FileMagic.valueOf(is);
+ if (FileMagic.OOXML == fm) {
+ return wp(fm, w -> w.create(is, password));
+ }
+
+ if (FileMagic.OLE2 != fm) {
+ throw new IOException("Can't create extractor - unsupported file type: "+fm);
}
+
+ POIFSFileSystem poifs = new POIFSFileSystem(is);
+ boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
+
+ return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
+ }
+
+ public static POITextExtractor createExtractor(File file) throws IOException {
+ return createExtractor(file, getCurrentUserPassword());
}
- private static Class<?> getScratchpadClass() {
+
+ public static POITextExtractor createExtractor(File file, String password) throws IOException {
+ if (file.length() == 0) {
+ throw new EmptyFileException();
+ }
+
+ final FileMagic fm = FileMagic.valueOf(file);
+ if (FileMagic.OOXML == fm) {
+ return wp(fm, w -> w.create(file, password));
+ }
+
+ if (FileMagic.OLE2 != fm) {
+ throw new IOException("Can't create extractor - unsupported file type: "+fm);
+ }
+
+ POIFSFileSystem poifs = new POIFSFileSystem(file, true);
try {
- return OLE2ExtractorFactory.class.getClassLoader().loadClass(
- "org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory"
- );
- } catch (ClassNotFoundException e) {
- LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
- throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory");
+ boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
+ return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
+ } catch (IOException | RuntimeException e) {
+ IOUtils.closeQuietly(poifs);
+ throw e;
}
}
-
+
+
/**
* Create the Extractor, if possible. Generally needs the Scratchpad jar.
* Note that this won't check for embedded OOXML resources either, use
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that.
*
- * @param poifsDir The {@link DirectoryNode} pointing to a document.
+ * @param root The {@link DirectoryNode} pointing to a document.
*
* @return The resulting {@link POITextExtractor}, an exception is thrown if
* no TextExtractor can be created for some reason.
@@ -176,54 +209,40 @@ public final class OLE2ExtractorFactory
* an unsupported version of Excel.
* @throws IllegalArgumentException If creating the Extractor fails
*/
- public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
- // Look for certain entries in the stream, to figure it
- // out from
- for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
- if (poifsDir.hasEntry(workbookName)) {
- if (getPreferEventExtractor()) {
- return new EventBasedExcelExtractor(poifsDir);
- }
- return new ExcelExtractor(poifsDir);
- }
- }
- if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
- throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
- + "found. Please call OldExcelExtractor directly for basic text extraction");
- }
-
- // Ask Scratchpad, or fail trying
- Class<?> cls = getScratchpadClass();
- try {
- Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
- POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
- if (ext != null) return ext;
- } catch (IllegalArgumentException iae) {
- throw iae;
- } catch (Exception e) {
- throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
- }
+ public static POITextExtractor createExtractor(DirectoryNode root) throws IOException {
+ return createExtractor(root, getCurrentUserPassword());
+ }
- throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+ public static POITextExtractor createExtractor(final DirectoryNode root, String password) throws IOException {
+ // Encrypted OOXML files go inside OLE2 containers, is this one?
+ if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY) || root.hasEntry("Package")) {
+ return wp(FileMagic.OOXML, w -> w.create(root, password));
+ } else {
+ return wp(FileMagic.OLE2, w -> w.create(root, password));
+ }
}
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- *
- * @param ext The extractor to look at for embedded documents
- *
- * @return An array of resulting extractors. Empty if no embedded documents are found.
- *
- * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
- * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
- * an unsupported version of Excel.
- * @throws IllegalArgumentException If creating the Extractor fails
- */
- public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+ /**
+ * Returns an array of text extractors, one for each of
+ * the embedded documents in the file (if there are any).
+ * If there are no embedded documents, you'll get back an
+ * empty array. Otherwise, you'll get one open
+ * {@link POITextExtractor} for each embedded file.
+ *
+ * @param ext The extractor to look at for embedded documents
+ *
+ * @return An array of resulting extractors. Empty if no embedded documents are found.
+ *
+ * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
+ * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
+ * an unsupported version of Excel.
+ * @throws IllegalArgumentException If creating the Extractor fails
+ */
+ public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+ if (ext == null) {
+ throw new IllegalStateException("extractor must be given");
+ }
+
// All the embedded directories we spotted
List<Entry> dirs = new ArrayList<>();
// For anything else not directly held in as a POIFS directory
@@ -237,22 +256,15 @@ public final class OLE2ExtractorFactory
if(ext instanceof ExcelExtractor) {
// These are in MBD... under the root
- Iterator<Entry> it = root.getEntries();
- while(it.hasNext()) {
- Entry entry = it.next();
- if(entry.getName().startsWith("MBD")) {
- dirs.add(entry);
- }
- }
+ StreamSupport.stream(root.spliterator(), false)
+ .filter(entry -> entry.getName().startsWith("MBD"))
+ .forEach(dirs::add);
} else {
- // Ask Scratchpad, or fail trying
- Class<?> cls = getScratchpadClass();
- try {
- Method m = cls.getDeclaredMethod(
- "identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
- m.invoke(null, ext, dirs, nonPOIFS);
- } catch (Exception e) {
- throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
+ for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
+ if (prov.accepts(FileMagic.OLE2)) {
+ prov.identifyEmbeddedResources(ext, dirs, nonPOIFS);
+ break;
+ }
}
}
@@ -261,19 +273,32 @@ public final class OLE2ExtractorFactory
return new POITextExtractor[0];
}
- ArrayList<POITextExtractor> e = new ArrayList<>();
+ ArrayList<POITextExtractor> textExtractors = new ArrayList<>();
for (Entry dir : dirs) {
- e.add(createExtractor((DirectoryNode) dir
- ));
+ textExtractors.add(createExtractor((DirectoryNode) dir));
}
for (InputStream stream : nonPOIFS) {
try {
- e.add(createExtractor(stream));
- } catch (Exception xe) {
- // Ignore, invalid format
- LOGGER.log(POILogger.WARN, xe);
+ textExtractors.add(createExtractor(stream));
+ } catch (IOException e) {
+ // Ignore, just means it didn't contain a format we support as yet
+ LOGGER.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
}
}
- return e.toArray(new POITextExtractor[0]);
+ return textExtractors.toArray(new POITextExtractor[0]);
}
+
+ private static POITextExtractor wp(FileMagic fm, ProviderMethod fun) throws IOException {
+ for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
+ if (prov.accepts(fm)) {
+ POITextExtractor ext = fun.create(prov);
+ if (ext != null) {
+ return ext;
+ }
+ }
+ }
+ throw new IOException("Your InputStream was neither an OLE2 stream, nor an OOXML stream " +
+ "or you haven't provide the poi-ooxml*.jar and/or poi-scratchpad*.jar in the classpath/modulepath - FileMagic: "+fm);
+ }
+
}
Added: poi/trunk/src/java/org/apache/poi/extractor/ExtractorProvider.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/extractor/ExtractorProvider.java?rev=1880839&view=auto
==============================================================================
--- poi/trunk/src/java/org/apache/poi/extractor/ExtractorProvider.java (added)
+++ poi/trunk/src/java/org/apache/poi/extractor/ExtractorProvider.java Thu Aug 13 21:08:24 2020
@@ -0,0 +1,76 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.extractor;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
+
+public interface ExtractorProvider {
+ boolean accepts(FileMagic fm);
+
+ /**
+ * Create Extractor via file
+ * @param file the file
+ * @param password the password or {@code null} if not encrypted
+ * @return the extractor
+ * @throws IOException if file can't be read or parsed
+ */
+ POITextExtractor create(File file, String password) throws IOException;
+
+ /**
+ * Create Extractor via InputStream
+ * @param inputStream the stream
+ * @param password the password or {@code null} if not encrypted
+ * @return the extractor
+ * @throws IOException if stream can't be read or parsed
+ */
+ POITextExtractor create(InputStream inputStream, String password) throws IOException;
+
+ /**
+ * Create Extractor from POIFS node
+ * @param poifsDir the node
+ * @param password the password or {@code null} if not encrypted
+ * @return the extractor
+ * @throws IOException if node can't be parsed
+ */
+ POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException;
+
+ /**
+ * Identifies the embedded documents (if there are any), starting from the
+ * directory held by the given extractor, and adds them to the supplied lists.
+ * Embedded POIFS directories are added to {@code dirs}; embedded content
+ * that isn't based on a POIFS entry is added to {@code nonPOIFS} as a stream.
+ * This default implementation always throws an {@link IllegalArgumentException}.
+ *
+ * @param ext the extractor holding the directory to start parsing
+ * @param dirs a list to be filled with directory references holding embedded documents
+ * @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries
+ *
+ * @throws IOException when the format specific extraction fails because of invalid entries
+ */
+ default void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
+ throw new IllegalArgumentException("Error checking for Scratchpad embedded resources");
+ }
+
+}
Propchange: poi/trunk/src/java/org/apache/poi/extractor/ExtractorProvider.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: poi/trunk/src/java/org/apache/poi/extractor/MainExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/extractor/MainExtractorFactory.java?rev=1880839&view=auto
==============================================================================
--- poi/trunk/src/java/org/apache/poi/extractor/MainExtractorFactory.java (added)
+++ poi/trunk/src/java/org/apache/poi/extractor/MainExtractorFactory.java Thu Aug 13 21:08:24 2020
@@ -0,0 +1,76 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.extractor;
+
+import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.poi.hssf.model.InternalWorkbook;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * ExtractorFactory for HSSF and Old Excel format
+ */
+public class MainExtractorFactory implements ExtractorProvider {
+ @Override
+ public boolean accepts(FileMagic fm) {
+ return FileMagic.OLE2 == fm;
+ }
+
+ @Override
+ public POITextExtractor create(File file, String password) throws IOException {
+ return create(new POIFSFileSystem(file, true).getRoot(), password);
+ }
+
+ @Override
+ public POITextExtractor create(InputStream inputStream, String password) throws IOException {
+ return create(new POIFSFileSystem(inputStream).getRoot(), password);
+ }
+
+ @Override
+ public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
+ final String oldPW = Biff8EncryptionKey.getCurrentUserPassword();
+ try {
+ Biff8EncryptionKey.setCurrentUserPassword(password);
+
+ // Look for certain entries in the stream, to figure it out from
+ for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
+ if (poifsDir.hasEntry(workbookName)) {
+ return ExtractorFactory.getPreferEventExtractor() ? new EventBasedExcelExtractor(poifsDir) : new ExcelExtractor(poifsDir);
+ }
+ }
+
+ if (poifsDir.hasEntry(InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME)) {
+ return new OldExcelExtractor(poifsDir);
+ }
+ } finally {
+ Biff8EncryptionKey.setCurrentUserPassword(oldPW);
+ }
+
+ return null;
+ }
+}
Propchange: poi/trunk/src/java/org/apache/poi/extractor/MainExtractorFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: poi/trunk/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java Thu Aug 13 21:08:24 2020
@@ -30,55 +30,28 @@ import org.apache.poi.poifs.filesystem.D
* org.apache.poi.[format].extractor .
*
* @see org.apache.poi.hssf.extractor.ExcelExtractor
- * @see org.apache.poi.hslf.extractor.PowerPointExtractor
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor
* @see org.apache.poi.hwpf.extractor.WordExtractor
*/
-public abstract class POIOLE2TextExtractor extends POITextExtractor {
- /** The POIDocument that's open */
- protected POIDocument document;
-
- /**
- * Creates a new text extractor for the given document
- *
- * @param document The POIDocument to use in this extractor.
- */
- public POIOLE2TextExtractor(POIDocument document) {
- this.document = document;
-
- // Ensure any underlying resources, such as open files,
- // will get cleaned up if the user calls #close()
- setFilesystem(document);
- }
-
- /**
- * Creates a new text extractor, using the same
- * document as another text extractor. Normally
- * only used by properties extractors.
- *
- * @param otherExtractor the extractor which document to be used
- */
- protected POIOLE2TextExtractor(POIOLE2TextExtractor otherExtractor) {
- this.document = otherExtractor.document;
- }
-
+public interface POIOLE2TextExtractor extends POITextExtractor {
/**
* Returns the document information metadata for the document
*
* @return The Document Summary Information or null
* if it could not be read for this document.
*/
- public DocumentSummaryInformation getDocSummaryInformation() {
- return document.getDocumentSummaryInformation();
+ default DocumentSummaryInformation getDocSummaryInformation() {
+ return getDocument().getDocumentSummaryInformation();
}
+
/**
* Returns the summary information metadata for the document.
*
* @return The Summary information for the document or null
* if it could not be read for this document.
*/
- public SummaryInformation getSummaryInformation() {
- return document.getSummaryInformation();
+ default SummaryInformation getSummaryInformation() {
+ return getDocument().getSummaryInformation();
}
/**
@@ -88,7 +61,7 @@ public abstract class POIOLE2TextExtract
* @return an instance of POIExtractor that can extract meta-data.
*/
@Override
- public POITextExtractor getMetadataTextExtractor() {
+ default POITextExtractor getMetadataTextExtractor() {
return new HPSFPropertiesExtractor(this);
}
@@ -97,8 +70,8 @@ public abstract class POIOLE2TextExtract
*
* @return the DirectoryEntry that is associated with the POIDocument of this extractor.
*/
- public DirectoryEntry getRoot() {
- return document.getDirectory();
+ default DirectoryEntry getRoot() {
+ return getDocument().getDirectory();
}
/**
@@ -107,7 +80,5 @@ public abstract class POIOLE2TextExtract
* @return the underlying POIDocument
*/
@Override
- public POIDocument getDocument() {
- return document;
- }
+ POIDocument getDocument();
}
\ No newline at end of file
Modified: poi/trunk/src/java/org/apache/poi/extractor/POITextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/extractor/POITextExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/extractor/POITextExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/extractor/POITextExtractor.java Thu Aug 13 21:08:24 2020
@@ -21,19 +21,16 @@ import java.io.IOException;
/**
* Common Parent for Text Extractors
- * of POI Documents.
+ * of POI Documents.
* You will typically find the implementation of
* a given format's text extractor under
* org.apache.poi.[format].extractor .
- *
+ *
* @see org.apache.poi.hssf.extractor.ExcelExtractor
- * @see org.apache.poi.hslf.extractor.PowerPointExtractor
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor
* @see org.apache.poi.hwpf.extractor.WordExtractor
*/
-public abstract class POITextExtractor implements Closeable {
- private Closeable fsToClose;
-
+public interface POITextExtractor extends Closeable {
/**
* Retrieves all the text from the document.
* How cells, paragraphs etc are separated in the text
@@ -41,42 +38,50 @@ public abstract class POITextExtractor i
* a specific project for details.
* @return All the text from the document
*/
- public abstract String getText();
-
+ String getText();
+
/**
* Returns another text extractor, which is able to
* output the textual content of the document
* metadata / properties, such as author and title.
- *
+ *
* @return the metadata and text extractor
*/
- public abstract POITextExtractor getMetadataTextExtractor();
+ POITextExtractor getMetadataTextExtractor();
/**
- * Used to ensure file handle cleanup.
- *
- * @param fs filesystem to close
+ * @param doCloseFilesystem {@code true} (default), if underlying resources/filesystem should be
+ * closed on {@link #close()}
*/
- public void setFilesystem(Closeable fs) {
- fsToClose = fs;
- }
-
+ void setCloseFilesystem(boolean doCloseFilesystem);
+
+ /**
+ * @return {@code true}, if resources/filesystem should be closed on {@link #close()}
+ */
+ boolean isCloseFilesystem();
+
+ /**
+ * @return The underlying resources/filesystem
+ */
+ Closeable getFilesystem();
+
/**
* Allows to free resources of the Extractor as soon as
* it is not needed any more. This may include closing
* open file handles and freeing memory.
- *
+ *
* The Extractor cannot be used after close has been called.
*/
@Override
- public void close() throws IOException {
- if(fsToClose != null) {
- fsToClose.close();
+ default void close() throws IOException {
+ Closeable fs = getFilesystem();
+ if (isCloseFilesystem() && fs != null) {
+ fs.close();
}
}
/**
* @return the processed document
*/
- public abstract Object getDocument();
+ Object getDocument();
}
Modified: poi/trunk/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java Thu Aug 13 21:08:24 2020
@@ -17,9 +17,6 @@
package org.apache.poi.hpsf.extractor;
-import java.io.File;
-import java.io.IOException;
-
import org.apache.poi.POIDocument;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
@@ -37,15 +34,20 @@ import org.apache.poi.poifs.filesystem.P
* build in and custom, returning them in
* textual form.
*/
-public class HPSFPropertiesExtractor extends POIOLE2TextExtractor {
+public class HPSFPropertiesExtractor implements POIOLE2TextExtractor {
+ private final POIDocument document;
+ private boolean doCloseFilesystem = true;
+
public HPSFPropertiesExtractor(POIOLE2TextExtractor mainExtractor) {
- super(mainExtractor);
+ document = mainExtractor.getDocument();
}
- public HPSFPropertiesExtractor(POIDocument doc) {
- super(doc);
+
+ public HPSFPropertiesExtractor(POIDocument document) {
+ this.document = document;
}
+
public HPSFPropertiesExtractor(POIFSFileSystem fs) {
- super(new HPSFPropertiesOnlyDocument(fs));
+ document = new HPSFPropertiesOnlyDocument(fs);
}
public String getDocumentSummaryInformationText() {
@@ -122,11 +124,11 @@ public class HPSFPropertiesExtractor ext
}
private static String getPropertyValueText(Object val) {
- return (val == null)
+ return (val == null)
? "(not set)"
: PropertySet.getPropertyStringValue(val);
}
-
+
@Override
public boolean equals(Object o) {
return super.equals(o);
@@ -137,12 +139,23 @@ public class HPSFPropertiesExtractor ext
return super.hashCode();
}
- public static void main(String[] args) throws IOException {
- for (String file : args) {
- try (HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(
- new POIFSFileSystem(new File(file)))) {
- System.out.println(ext.getText());
- }
- }
+ @Override
+ public POIDocument getDocument() {
+ return document;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public POIDocument getFilesystem() {
+ return document;
}
}
Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java Thu Aug 13 21:08:24 2020
@@ -17,6 +17,7 @@
package org.apache.poi.hssf.extractor;
+import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -37,9 +38,9 @@ import org.apache.poi.hssf.record.LabelR
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord;
-import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -56,29 +57,31 @@ import org.apache.poi.poifs.filesystem.P
* To turn an excel file into a CSV or similar, then see
* the XLS2CSVmra example
* </p>
- *
+ *
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
*/
-public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor {
- private DirectoryNode _dir;
+public class EventBasedExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
+ private final POIFSFileSystem poifs;
+ private final DirectoryNode _dir;
+ private boolean doCloseFilesystem = true;
boolean _includeSheetNames = true;
boolean _formulasNotResults;
- public EventBasedExcelExtractor( DirectoryNode dir )
- {
- super( (POIDocument)null );
+ public EventBasedExcelExtractor(DirectoryNode dir) {
+ poifs = null;
_dir = dir;
}
public EventBasedExcelExtractor(POIFSFileSystem fs) {
- this(fs.getRoot());
- super.setFilesystem(fs);
+ poifs = fs;
+ _dir = fs.getRoot();
}
/**
* Would return the document information metadata for the document,
* if we supported it
*/
+ @Override
public DocumentSummaryInformation getDocSummaryInformation() {
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
}
@@ -86,6 +89,7 @@ public class EventBasedExcelExtractor ex
* Would return the summary information metadata for the document,
* if we supported it
*/
+ @Override
public SummaryInformation getSummaryInformation() {
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
}
@@ -262,4 +266,29 @@ public class EventBasedExcelExtractor ex
}
}
}
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return poifs;
+ }
+
+ @Override
+ public POIDocument getDocument() {
+ return null;
+ }
+
+ @Override
+ public DirectoryEntry getRoot() {
+ return _dir;
+ }
}
Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java Thu Aug 13 21:08:24 2020
@@ -50,12 +50,13 @@ import org.apache.poi.ss.usermodel.Row.M
* To turn an excel file into a CSV or similar, then see
* the XLS2CSVmra example
* </p>
- *
+ *
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
*/
-public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor {
+public class ExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
private final HSSFWorkbook _wb;
private final HSSFDataFormatter _formatter;
+ private boolean doCloseFilesystem = true;
private boolean _includeSheetNames = true;
private boolean _shouldEvaluateFormulas = true;
private boolean _includeCellComments;
@@ -63,13 +64,14 @@ public class ExcelExtractor extends POIO
private boolean _includeHeadersFooters = true;
public ExcelExtractor(HSSFWorkbook wb) {
- super(wb);
_wb = wb;
_formatter = new HSSFDataFormatter();
}
+
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
this(fs.getRoot());
}
+
public ExcelExtractor(DirectoryNode dir) throws IOException {
this(new HSSFWorkbook(dir, true));
}
@@ -201,9 +203,9 @@ public class ExcelExtractor extends POIO
/**
* Command line extractor.
- *
+ *
* @param args the command line parameters
- *
+ *
* @throws IOException if the file can't be read or contains errors
*/
public static void main(String[] args) throws IOException {
@@ -225,7 +227,7 @@ public class ExcelExtractor extends POIO
try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile());
HSSFWorkbook wb = new HSSFWorkbook(is);
- ExcelExtractor extractor = new ExcelExtractor(wb);
+ ExcelExtractor extractor = new ExcelExtractor(wb)
) {
extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames());
extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas());
@@ -255,7 +257,7 @@ public class ExcelExtractor extends POIO
* Should blank cells be output? Default is to only
* output cells that are present in the file and are
* non-blank.
- *
+ *
* @param includeBlankCells {@code true} if blank cells should be included
*/
public void setIncludeBlankCells(boolean includeBlankCells) {
@@ -411,4 +413,24 @@ public class ExcelExtractor extends POIO
return text.toString();
}
+
+ @Override
+ public HSSFWorkbook getDocument() {
+ return _wb;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public HSSFWorkbook getFilesystem() {
+ return _wb;
+ }
}
Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java Thu Aug 13 21:08:24 2020
@@ -29,6 +29,7 @@ import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.CodepageRecord;
@@ -58,7 +59,7 @@ import org.apache.poi.util.IOUtils;
* by Apache Tika, but not really intended for display to the user.
* </p>
*/
-public class OldExcelExtractor implements Closeable {
+public class OldExcelExtractor implements POITextExtractor {
private final static int FILE_PASS_RECORD_SID = 0x2f;
//arbitrarily selected; may need to increase
@@ -295,24 +296,39 @@ public class OldExcelExtractor implement
}
}
- close();
ris = null;
return text.toString();
}
- @Override
- public void close() {
- // some cases require this close here
- if(toClose != null) {
- IOUtils.closeQuietly(toClose);
- toClose = null;
- }
- }
-
protected void handleNumericCell(StringBuilder text, double value) {
// TODO Need to fetch / use format strings
text.append(value);
text.append('\n');
}
+
+ @Override
+ public POITextExtractor getMetadataTextExtractor() {
+ return null;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return toClose != null;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return toClose;
+ }
+
+ @Override
+ public Object getDocument() {
+ return ris;
+ }
}
Modified: poi/trunk/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java Thu Aug 13 21:08:24 2020
@@ -54,14 +54,14 @@ import org.apache.poi.util.POILogger;
public class SlideShowExtractor<
S extends Shape<S,P>,
P extends TextParagraph<S,P,? extends TextRun>
-> extends POITextExtractor {
+> implements POITextExtractor {
private static final POILogger LOG = POILogFactory.getLogger(SlideShowExtractor.class);
// placeholder text for slide numbers
private static final String SLIDE_NUMBER_PH = "‹#›";
- private SlideShow<S,P> slideshow;
+ protected final SlideShow<S,P> slideshow;
private boolean slidesByDefault = true;
private boolean notesByDefault;
@@ -69,9 +69,9 @@ public class SlideShowExtractor<
private boolean masterByDefault;
private Predicate<Object> filter = o -> true;
+ private boolean doCloseFilesystem = true;
public SlideShowExtractor(final SlideShow<S,P> slideshow) {
- setFilesystem(slideshow);
this.slideshow = slideshow;
}
@@ -81,8 +81,8 @@ public class SlideShowExtractor<
* @return the opened document
*/
@Override
- public final Object getDocument() {
- return slideshow.getPersistDocument();
+ public SlideShow<S,P> getDocument() {
+ return slideshow;
}
/**
@@ -339,17 +339,17 @@ public class SlideShowExtractor<
return raw;
}
- TextParagraph tp = tr.getParagraph();
- TextShape ps = (tp != null) ? tp.getParentShape() : null;
- Sheet sh = (ps != null) ? ps.getSheet() : null;
- String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide)sh).getSlideNumber() + 1) : "";
+ TextParagraph<?,?,?> tp = tr.getParagraph();
+ TextShape<?,?> ps = (tp != null) ? tp.getParentShape() : null;
+ Sheet<?,?> sh = (ps != null) ? ps.getSheet() : null;
+ String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide<?,?>)sh).getSlideNumber() + 1) : "";
return raw.replace(SLIDE_NUMBER_PH, slideNr);
}
private static String replaceTextCap(TextRun tr) {
- final TextParagraph tp = tr.getParagraph();
- final TextShape sh = (tp != null) ? tp.getParentShape() : null;
+ final TextParagraph<?,?,?> tp = tr.getParagraph();
+ final TextShape<?,?> sh = (tp != null) ? tp.getParentShape() : null;
final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null;
// 0xB acts like cariage return in page titles and like blank in the others
@@ -438,4 +438,19 @@ public class SlideShowExtractor<
(italic == null || tr.isItalic() == italic) &&
(bold == null || tr.isBold() == bold);
}
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public SlideShow<S,P> getFilesystem() {
+ return getDocument();
+ }
}
Modified: poi/trunk/src/java/org/apache/poi/ss/extractor/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/ss/extractor/ExcelExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/ss/extractor/ExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/ss/extractor/ExcelExtractor.java Thu Aug 13 21:08:24 2020
@@ -24,39 +24,39 @@ public interface ExcelExtractor {
/**
* Should sheet names be included?
* Default is true
- *
+ *
* @param includeSheetNames {@code true} if the sheet names should be included
*/
- public void setIncludeSheetNames(boolean includeSheetNames);
+ void setIncludeSheetNames(boolean includeSheetNames);
/**
* Should we return the formula itself, and not the result it produces?
* Default is false
- *
+ *
* @param formulasNotResults {@code true} if the formula itself is returned
*/
- public void setFormulasNotResults(boolean formulasNotResults);
+ void setFormulasNotResults(boolean formulasNotResults);
/**
* Should headers and footers be included in the output?
* Default is true
- *
+ *
* @param includeHeadersFooters {@code true} if headers and footers should be included
*/
- public void setIncludeHeadersFooters(boolean includeHeadersFooters);
+ void setIncludeHeadersFooters(boolean includeHeadersFooters);
/**
* Should cell comments be included?
* Default is false
- *
+ *
* @param includeCellComments {@code true} if cell comments should be included
*/
- public void setIncludeCellComments(boolean includeCellComments);
+ void setIncludeCellComments(boolean includeCellComments);
/**
* Retrieves the text contents of the file
- *
+ *
* @return the text contents of the file
*/
- public String getText();
+ String getText();
}
Modified: poi/trunk/src/multimodule/ooxml/java9/module-info.class
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/ooxml/java9/module-info.class?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
Binary files - no diff available.
Modified: poi/trunk/src/multimodule/ooxml/java9/module-info.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/ooxml/java9/module-info.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/multimodule/ooxml/java9/module-info.java (original)
+++ poi/trunk/src/multimodule/ooxml/java9/module-info.java Thu Aug 13 21:08:24 2020
@@ -29,6 +29,7 @@ module org.apache.poi.ooxml {
requires java.security.jgss;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
exports org.apache.poi.xwpf.extractor;
exports org.apache.poi.xwpf.usermodel;
Modified: poi/trunk/src/multimodule/ooxml/test9/module-info.class
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/ooxml/test9/module-info.class?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
Binary files - no diff available.
Modified: poi/trunk/src/multimodule/ooxml/test9/module-info.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/ooxml/test9/module-info.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/multimodule/ooxml/test9/module-info.java (original)
+++ poi/trunk/src/multimodule/ooxml/test9/module-info.java Thu Aug 13 21:08:24 2020
@@ -29,6 +29,7 @@ module org.apache.poi.ooxml {
requires java.security.jgss;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
exports org.apache.poi.xwpf.extractor;
exports org.apache.poi.xwpf.usermodel;
Modified: poi/trunk/src/multimodule/poi/java9/module-info.class
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/poi/java9/module-info.class?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
Binary files - no diff available.
Modified: poi/trunk/src/multimodule/poi/java9/module-info.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/poi/java9/module-info.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/multimodule/poi/java9/module-info.java (original)
+++ poi/trunk/src/multimodule/poi/java9/module-info.java Thu Aug 13 21:08:24 2020
@@ -28,8 +28,12 @@ module org.apache.poi.poi {
requires jdk.unsupported;
uses org.apache.poi.ss.usermodel.WorkbookProvider;
+ uses org.apache.poi.extractor.ExtractorProvider;
+
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory;
+
exports org.apache.poi;
exports org.apache.poi.common;
Modified: poi/trunk/src/multimodule/poi/test9/module-info.class
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/poi/test9/module-info.class?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
Binary files - no diff available.
Modified: poi/trunk/src/multimodule/poi/test9/module-info.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/poi/test9/module-info.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/multimodule/poi/test9/module-info.java (original)
+++ poi/trunk/src/multimodule/poi/test9/module-info.java Thu Aug 13 21:08:24 2020
@@ -28,8 +28,10 @@ module org.apache.poi.poi {
requires jdk.unsupported;
uses org.apache.poi.ss.usermodel.WorkbookProvider;
+ uses org.apache.poi.extractor.ExtractorProvider;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory;
exports org.apache.poi;
exports org.apache.poi.common;
Modified: poi/trunk/src/multimodule/scratchpad/java9/module-info.class
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/scratchpad/java9/module-info.class?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
Binary files - no diff available.
Modified: poi/trunk/src/multimodule/scratchpad/java9/module-info.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/scratchpad/java9/module-info.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/multimodule/scratchpad/java9/module-info.java (original)
+++ poi/trunk/src/multimodule/scratchpad/java9/module-info.java Thu Aug 13 21:08:24 2020
@@ -20,6 +20,8 @@ module org.apache.poi.scratchpad {
requires java.desktop;
requires commons.math3;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory;
+
exports org.apache.poi.hmef;
exports org.apache.poi.hmef.dev;
exports org.apache.poi.hmef.extractor;
Modified: poi/trunk/src/multimodule/scratchpad/test9/module-info.class
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/scratchpad/test9/module-info.class?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
Binary files - no diff available.
Modified: poi/trunk/src/multimodule/scratchpad/test9/module-info.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/multimodule/scratchpad/test9/module-info.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/multimodule/scratchpad/test9/module-info.java (original)
+++ poi/trunk/src/multimodule/scratchpad/test9/module-info.java Thu Aug 13 21:08:24 2020
@@ -20,6 +20,8 @@ module org.apache.poi.scratchpad {
requires java.desktop;
requires commons.math3;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory;
+
exports org.apache.poi.hmef;
exports org.apache.poi.hmef.dev;
exports org.apache.poi.hmef.extractor;
Modified: poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/CommandLineTextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/CommandLineTextExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/CommandLineTextExtractor.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/CommandLineTextExtractor.java Thu Aug 13 21:08:24 2020
@@ -18,15 +18,19 @@ package org.apache.poi.ooxml.extractor;
import java.io.File;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor;
/**
* A command line wrapper around {@link ExtractorFactory}, useful
* for when debugging.
*/
-public class CommandLineTextExtractor {
+public final class CommandLineTextExtractor {
public static final String DIVIDER = "=======================";
+ private CommandLineTextExtractor() {
+ }
+
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Use:");
Copied: poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java (from r1880838, poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/ExtractorFactory.java)
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java?p2=poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java&p1=poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/ExtractorFactory.java&r1=1880838&r2=1880839&rev=1880839&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java Thu Aug 13 21:08:24 2020
@@ -19,17 +19,12 @@ package org.apache.poi.ooxml.extractor;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
-import java.lang.reflect.Method;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.poi.EncryptedDocumentException;
-import org.apache.poi.extractor.OLE2ExtractorFactory;
-import org.apache.poi.extractor.POIOLE2TextExtractor;
+
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.extractor.ExtractorProvider;
import org.apache.poi.extractor.POITextExtractor;
-import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
@@ -38,19 +33,11 @@ import org.apache.poi.openxml4j.opc.Pack
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.poifs.crypt.Decryptor;
import org.apache.poi.poifs.crypt.EncryptionInfo;
-import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.FileMagic;
-import org.apache.poi.poifs.filesystem.NotOLE2FileException;
-import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.sl.extractor.SlideShowExtractor;
-import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.NotImplemented;
-import org.apache.poi.util.POILogFactory;
-import org.apache.poi.util.POILogger;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
@@ -71,20 +58,20 @@ import org.apache.xmlbeans.XmlException;
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
*/
@SuppressWarnings("WeakerAccess")
-public final class ExtractorFactory {
- private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class);
-
- public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
+public final class POIXMLExtractorFactory implements ExtractorProvider {
+ private static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{
- XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
- XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE,
- XSLFRelation.PRESENTATION_MACRO
+ XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
+ XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE,
+ XSLFRelation.PRESENTATION_MACRO
};
- private ExtractorFactory() {
+ @Override
+ public boolean accepts(FileMagic fm) {
+ return fm == FileMagic.OOXML;
}
/**
@@ -93,7 +80,7 @@ public final class ExtractorFactory {
* Default is false.
*/
public static boolean getThreadPrefersEventExtractors() {
- return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
+ return ExtractorFactory.getThreadPrefersEventExtractors();
}
/**
@@ -102,7 +89,7 @@ public final class ExtractorFactory {
* Default is to use the thread level setting, which defaults to false.
*/
public static Boolean getAllThreadsPreferEventExtractors() {
- return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
+ return ExtractorFactory.getAllThreadsPreferEventExtractors();
}
/**
@@ -110,7 +97,7 @@ public final class ExtractorFactory {
* Will only be used if the All Threads setting is null.
*/
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
- OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
+ ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
}
/**
@@ -118,7 +105,7 @@ public final class ExtractorFactory {
* If set, will take preference over the Thread level setting.
*/
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
- OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
+ ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
}
/**
@@ -126,52 +113,54 @@ public final class ExtractorFactory {
* Checks the all-threads one first, then thread specific.
*/
public static boolean getPreferEventExtractor() {
- return OLE2ExtractorFactory.getPreferEventExtractor();
+ return ExtractorFactory.getPreferEventExtractor();
}
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
- POIFSFileSystem fs = null;
+ @Override
+ public POITextExtractor create(File f, String password) throws IOException {
+ if (FileMagic.valueOf(f) != FileMagic.OOXML) {
+ return ExtractorFactory.createExtractor(f, password);
+ }
+
+
+ OPCPackage pkg = null;
try {
- fs = new POIFSFileSystem(f);
- if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
- return (T)createEncryptedOOXMLExtractor(fs);
- }
- POITextExtractor extractor = createExtractor(fs);
- extractor.setFilesystem(fs);
- return (T)extractor;
- } catch (OfficeXmlFileException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- OPCPackage pkg = OPCPackage.open(f.toString(), PackageAccess.READ);
- T t = (T)createExtractor(pkg);
- t.setFilesystem(pkg);
- return t;
- } catch (NotOLE2FileException ne) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne);
- } catch (OpenXML4JException | Error | RuntimeException | IOException | XmlException e) { // NOSONAR
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
+ pkg = OPCPackage.open(f.toString(), PackageAccess.READ);
+ POIXMLTextExtractor ex = create(pkg);
+ if (ex == null) {
+ pkg.revert();
+ }
+ return ex;
+ } catch (InvalidFormatException ife) {
+ throw new IOException(ife);
+ } catch (IOException e) {
+ pkg.revert();
throw e;
}
}
- public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException {
+ public POITextExtractor create(InputStream inp, String password) throws IOException {
InputStream is = FileMagic.prepareToCheckMagic(inp);
- FileMagic fm = FileMagic.valueOf(is);
+ if (FileMagic.valueOf(is) != FileMagic.OOXML) {
+ return ExtractorFactory.createExtractor(is, password);
+ }
- switch (fm) {
- case OLE2:
- POIFSFileSystem fs = new POIFSFileSystem(is);
- boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY);
- return isEncrypted ? createEncryptedOOXMLExtractor(fs) : createExtractor(fs);
- case OOXML:
- return createExtractor(OPCPackage.open(is));
- default:
- throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream, found type: " + fm);
+ OPCPackage pkg = null;
+ try {
+ pkg = OPCPackage.open(is);
+ POIXMLTextExtractor ex = create(pkg);
+ if (ex == null) {
+ pkg.revert();
+ }
+ return ex;
+ } catch (InvalidFormatException e) {
+ throw new IOException(e);
+ } catch (RuntimeException | IOException e) {
+ if (pkg != null) {
+ pkg.revert();
+ }
+ throw e;
}
}
@@ -181,11 +170,9 @@ public final class ExtractorFactory {
* @param pkg An {@link OPCPackage}.
* @return A {@link POIXMLTextExtractor} for the given file.
* @throws IOException If an error occurs while reading the file
- * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
- * @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found.
*/
- public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
+ public POIXMLTextExtractor create(OPCPackage pkg) throws IOException {
try {
// Check for the normal Office core document
PackageRelationshipCollection core;
@@ -199,8 +186,9 @@ public final class ExtractorFactory {
if (core.size() == 0) {
// Could it be a visio one?
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
- if (core.size() == 1)
+ if (core.size() == 1) {
return new XDGFVisioExtractor(pkg);
+ }
}
// Should just be a single core document, complain if not
@@ -214,7 +202,7 @@ public final class ExtractorFactory {
// Is it XSSF?
for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
+ if (rel.getContentType().equals(contentType)) {
if (getPreferEventExtractor()) {
return new XSSFEventBasedExcelExtractor(pkg);
}
@@ -224,21 +212,21 @@ public final class ExtractorFactory {
// Is it XWPF?
for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
+ if (rel.getContentType().equals(contentType)) {
return new XWPFWordExtractor(pkg);
}
}
// Is it XSLF?
for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- return new SlideShowExtractor<>(new XMLSlideShow(pkg));
+ if (rel.getContentType().equals(contentType)) {
+ return new XSLFExtractor(new XMLSlideShow(pkg));
}
}
// special handling for SlideShow-Theme-files,
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
- return new SlideShowExtractor<>(new XMLSlideShow(pkg));
+ return new XSLFExtractor(new XMLSlideShow(pkg));
}
// How about xlsb?
@@ -248,137 +236,46 @@ public final class ExtractorFactory {
}
}
- throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")");
-
- } catch (IOException | Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
+ return null;
+ } catch (IOException e) {
throw e;
+ } catch (Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR
+ throw new IOException(e);
}
+ // we used to close (revert()) the package here, but this is the caller's responsibility
+ // and we can't reuse the package
}
- public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return createExtractor(fs.getRoot());
+ public POITextExtractor create(POIFSFileSystem fs) throws IOException {
+ return create(fs.getRoot(), Biff8EncryptionKey.getCurrentUserPassword());
}
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
- {
- // First, check for OOXML
- for (String entryName : poifsDir.getEntryNames()) {
- if (entryName.equals("Package")) {
- OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
- return (T)createExtractor(pkg);
+ @Override
+ public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
+ // First, check for plain OOXML package
+ if (poifsDir.hasEntry("Package")) {
+ try (InputStream is = poifsDir.createDocumentInputStream("Package")) {
+ return create(is, password);
}
}
- // If not, ask the OLE2 code to check, with Scratchpad if possible
- return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
- // All the embedded directories we spotted
- ArrayList<Entry> dirs = new ArrayList<>();
- // For anything else not directly held in as a POIFS directory
- ArrayList<InputStream> nonPOIFS = new ArrayList<>();
-
- // Find all the embedded directories
- DirectoryEntry root = ext.getRoot();
- if (root == null) {
- throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
- }
-
- // provide ExcelExtractor also in OOXML module, because scratchpad is not necessary for it
- if (ext instanceof ExcelExtractor) {
- // These are in MBD... under the root
- Iterator<Entry> it = root.getEntries();
- while (it.hasNext()) {
- Entry entry = it.next();
- if (entry.getName().startsWith("MBD")) {
- dirs.add(entry);
- }
- }
- } else {
- try {
- Class<?> clazz = Class.forName("org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory");
- Method m = clazz.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
- m.invoke(null, ext, dirs, nonPOIFS);
- } catch (ReflectiveOperationException e) {
- logger.log(POILogger.WARN, "POI Scratchpad jar not included ", e.getLocalizedMessage());
- return new POITextExtractor[0];
- }
- }
-
- // Create the extractors
- if (dirs.size() == 0 && nonPOIFS.size() == 0){
- return new POITextExtractor[0];
- }
-
- ArrayList<POITextExtractor> textExtractors = new ArrayList<>();
- for (Entry dir : dirs) {
- textExtractors.add(createExtractor((DirectoryNode) dir));
- }
- for (InputStream nonPOIF : nonPOIFS) {
+ if (poifsDir.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
+ EncryptionInfo ei = new EncryptionInfo(poifsDir);
+ Decryptor dec = ei.getDecryptor();
try {
- textExtractors.add(createExtractor(nonPOIF));
- } catch (IllegalArgumentException e) {
- // Ignore, just means it didn't contain
- // a format we support as yet
- logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
- } catch (XmlException | OpenXML4JException e) {
- throw new IOException(e.getMessage(), e);
+ if (!dec.verifyPassword(password)) {
+ throw new IOException("Invalid password specified");
+ }
+ try (InputStream is = dec.getDataStream(poifsDir)) {
+ return create(is, password);
+ }
+ } catch (IOException e) {
+ throw e;
+ } catch (Exception e) {
+ throw new IOException(e);
}
}
- return textExtractors.toArray(new POITextExtractor[0]);
- }
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- @NotImplemented
- @SuppressWarnings({"UnusedParameters", "UnusedReturnValue"})
- public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIXMLTextExtractor ext) {
- throw new IllegalStateException("Not yet supported");
- }
-
- private static POITextExtractor createEncryptedOOXMLExtractor(POIFSFileSystem fs)
- throws IOException {
- String pass = Biff8EncryptionKey.getCurrentUserPassword();
- if (pass == null) {
- pass = Decryptor.DEFAULT_PASSWORD;
- }
-
- EncryptionInfo ei = new EncryptionInfo(fs);
- Decryptor dec = ei.getDecryptor();
- InputStream is = null;
- try {
- if (!dec.verifyPassword(pass)) {
- throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor");
- }
- is = dec.getDataStream(fs);
- return createExtractor(OPCPackage.open(is));
- } catch (IOException e) {
- throw e;
- } catch (Exception e) {
- throw new EncryptedDocumentException(e);
- } finally {
- IOUtils.closeQuietly(is);
-
- // also close the POIFSFileSystem here as we read all the data
- // while decrypting
- fs.close();
- }
+ throw new IOException("The OLE2 file neither contained a plain OOXML package node (\"Package\") nor an encrypted one (\"EncryptedPackage\").");
}
}
Modified: poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLPropertiesTextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLPropertiesTextExtractor.java?rev=1880839&r1=1880838&r2=1880839&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLPropertiesTextExtractor.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLPropertiesTextExtractor.java Thu Aug 13 21:08:24 2020
@@ -36,9 +36,10 @@ import org.openxmlformats.schemas.office
* content of the OOXML file properties, eg author
* and title.
*/
-public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor {
-
+public class POIXMLPropertiesTextExtractor implements POIXMLTextExtractor {
+ private final POIXMLDocument doc;
private final DateFormat dateFormat;
+ private boolean doCloseFilesystem = true;
/**
* Creates a new POIXMLPropertiesTextExtractor for the given open document.
@@ -46,7 +47,7 @@ public class POIXMLPropertiesTextExtract
* @param doc the given open document
*/
public POIXMLPropertiesTextExtractor(POIXMLDocument doc) {
- super(doc);
+ this.doc = doc;
DateFormatSymbols dfs = DateFormatSymbols.getInstance(Locale.ROOT);
dateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", dfs);
dateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC);
@@ -242,7 +243,7 @@ public class POIXMLPropertiesTextExtract
}
/*else if (property.isSetArray()) {
- // TODO Fetch the array values and output
+ // TODO Fetch the array values and output
}
else if (property.isSetVector()) {
// TODO Fetch the vector values and output
@@ -281,4 +282,24 @@ public class POIXMLPropertiesTextExtract
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
}
+
+ @Override
+ public POIXMLDocument getDocument() {
+ return doc;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public POIXMLDocument getFilesystem() {
+ return null;
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org