You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ce...@apache.org on 2015/02/27 10:59:15 UTC
svn commit: r1662652 - in /poi/trunk/src: integrationtest/org/apache/poi/
integrationtest/org/apache/poi/stress/ ooxml/java/org/apache/poi/extractor/
ooxml/java/org/apache/poi/xssf/extractor/
Author: centic
Date: Fri Feb 27 09:59:14 2015
New Revision: 1662652
URL: http://svn.apache.org/r1662652
Log:
* Add text-extraction verification to integration-tests via a new abstract base FileHandler
* Fix NullPointerException found in some documents when running against the test-data
* Add support for extracting text from Dir-Entries WORKBOOK and BOOK to support some old/strangely formatted XLS files.
Added:
poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
Modified:
poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java
poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java
poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java
poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java
poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java
poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java
poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java
poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
Modified: poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java Fri Feb 27 09:59:14 2015
@@ -253,20 +253,26 @@ public class TestAllFiles {
@Test
public void testAllFiles() throws Exception {
assertNotNull("Unknown file extension for file: " + file + ": " + getExtension(file), handler);
- InputStream stream = new BufferedInputStream(new FileInputStream(new File(ROOT_DIR, file)),100);
+ File inputFile = new File(ROOT_DIR, file);
+
try {
- handler.handleFile(stream);
-
- assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
- EXPECTED_FAILURES.contains(file));
- } catch (Exception e) {
- // check if we expect failure for this file
- if(!EXPECTED_FAILURES.contains(file)) {
- throw new Exception("While handling " + file, e);
- }
- } finally {
- stream.close();
- }
+ InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100);
+ try {
+ handler.handleFile(stream);
+
+ assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
+ EXPECTED_FAILURES.contains(file));
+ } finally {
+ stream.close();
+ }
+
+ handler.handleExtracting(inputFile);
+ } catch (Exception e) {
+ // check if we expect failure for this file
+ if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
+ throw new Exception("While handling " + file, e);
+ }
+ }
}
private static String getExtension(String file) {
@@ -282,5 +288,9 @@ public class TestAllFiles {
@Override
public void handleFile(InputStream stream) throws Exception {
}
+
+ @Override
+ public void handleExtracting(File file) throws Exception {
+ }
}
}
Added: poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java?rev=1662652&view=auto
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java (added)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java Fri Feb 27 09:59:14 2015
@@ -0,0 +1,81 @@
+package org.apache.poi.stress;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.File;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+
+/**
+ * Base class for {@link FileHandler} implementations which adds a generic
+ * check that text and metadata text can be extracted from a file.
+ */
+public abstract class AbstractFileHandler implements FileHandler {
+    /**
+     * Files, named as "directory/filename", for which text extraction
+     * is expected to fail.
+     */
+    public static final Set<String> EXPECTED_EXTRACTOR_FAILURES = new HashSet<String>();
+    static {
+        // password protected files
+        EXPECTED_EXTRACTOR_FAILURES.add("document/bug53475-password-is-pass.docx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/extenxls_pwd123.xlsx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protect.xlsx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_agile.docx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_sha512.xlsx");
+
+        // unsupported file-types, no supported OLE2 parts
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/quick-winmail.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/winmail-sample1.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-simple.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-with-attachments.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hpsf/Test0313rur.adm");
+        EXPECTED_EXTRACTOR_FAILURES.add("hsmf/attachment_msg_pdf.msg");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/Notes.ole2");
+        EXPECTED_EXTRACTOR_FAILURES.add("slideshow/testPPT.thmx");
+    }
+
+    /**
+     * Creates a text extractor for the given file and verifies that both
+     * the text and the metadata text are non-null.
+     *
+     * @param file the file to extract text from
+     * @throws Exception if extraction fails with an IllegalArgumentException
+     *         for a file that is not listed in EXPECTED_EXTRACTOR_FAILURES,
+     *         or if creating/closing the extractor fails
+     */
+    public void handleExtracting(File file) throws Exception {
+        // The set holds "directory/filename" strings, so it has to be queried with
+        // such a string; passing the File object itself would never match.
+        // NOTE(review): assumes test files live exactly one directory below the root.
+        File parent = file.getParentFile();
+        String key = (parent == null) ? file.getName() : parent.getName() + "/" + file.getName();
+        boolean expectedToFail = EXPECTED_EXTRACTOR_FAILURES.contains(key);
+
+        POITextExtractor extractor = ExtractorFactory.createExtractor(file);
+        // check before entering the try-block so that a null extractor is reported
+        // as assertion failure instead of a NullPointerException from close()
+        assertNotNull(extractor);
+        try {
+            assertNotNull(extractor.getText());
+
+            // also try metadata
+            POITextExtractor metadataExtractor = extractor.getMetadataTextExtractor();
+            assertNotNull(metadataExtractor);
+            assertNotNull(metadataExtractor.getText());
+
+            assertFalse("Expected Extraction to fail for file " + file + " and handler " + this + ", but did not fail!",
+                    expectedToFail);
+        } catch (IllegalArgumentException e) {
+            if(!expectedToFail) {
+                throw new Exception("While handling " + file, e);
+            }
+        } finally {
+            extractor.close();
+        }
+    }
+}
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java Fri Feb 27 09:59:14 2015
@@ -16,6 +16,7 @@
==================================================================== */
package org.apache.poi.stress;
+import java.io.File;
import java.io.InputStream;
/**
@@ -34,4 +35,10 @@ public interface FileHandler {
* @throws Exception
*/
void handleFile(InputStream stream) throws Exception;
+
+ /**
+ * Ensures that extracting text from the given file
+ * is returning some text.
+ */
+ void handleExtracting(File file) throws Exception;
}
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -26,7 +26,7 @@ import org.apache.poi.hmef.attribute.MAP
import org.apache.poi.hmef.attribute.MAPIStringAttribute;
import org.junit.Test;
-public class HMEFFileHandler implements FileHandler {
+public class HMEFFileHandler extends AbstractFileHandler {
@Override
public void handleFile(InputStream stream) throws Exception {
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -25,7 +25,7 @@ import org.apache.poi.hpsf.HPSFPropertie
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.Test;
-public class HPSFFileHandler implements FileHandler {
+public class HPSFFileHandler extends AbstractFileHandler {
@Override
public void handleFile(InputStream stream) throws Exception {
HPSFPropertiesOnlyDocument hpsf = new HPSFPropertiesOnlyDocument(new POIFSFileSystem(stream));
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -16,6 +16,7 @@
==================================================================== */
package org.apache.poi.stress;
+import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
@@ -49,4 +50,10 @@ public class HSSFFileHandler extends Spr
stream.close();
}
}
+
+ // a test-case to test this locally without executing the full TestAllFiles
+ @Test
+ public void testExtractor() throws Exception {
+ handleExtracting(new File("test-data/spreadsheet/BOOK_in_capitals.xls"));
+ }
}
\ No newline at end of file
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java Fri Feb 27 09:59:14 2015
@@ -25,7 +25,7 @@ import java.io.InputStream;
import org.apache.poi.POIDocument;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-public class POIFSFileHandler implements FileHandler {
+public class POIFSFileHandler extends AbstractFileHandler {
@Override
public void handleFile(InputStream stream) throws Exception {
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java Fri Feb 27 09:59:14 2015
@@ -30,7 +30,7 @@ import org.apache.poi.ss.usermodel.Sheet
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
-public abstract class SpreadsheetHandler implements FileHandler {
+public abstract class SpreadsheetHandler extends AbstractFileHandler {
public void handleWorkbook(Workbook wb, String extension) throws IOException {
// try to access some of the content
readContent(wb);
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -25,7 +25,7 @@ import org.apache.poi.openxml4j.opc.OPCP
import org.apache.poi.xslf.XSLFSlideShow;
import org.junit.Test;
-public class XSLFFileHandler implements FileHandler {
+public class XSLFFileHandler extends AbstractFileHandler {
@Override
public void handleFile(InputStream stream) throws Exception {
// ignore password protected files
Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -22,7 +22,7 @@ import java.io.InputStream;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
-public class XWPFFileHandler implements FileHandler {
+public class XWPFFileHandler extends AbstractFileHandler {
@Override
public void handleFile(InputStream stream) throws Exception {
// ignore password protected files
Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Fri Feb 27 09:59:14 2015
@@ -213,7 +213,9 @@ public class ExtractorFactory {
{
// Look for certain entries in the stream, to figure it
// out from
- if (poifsDir.hasEntry("Workbook")) {
+ if (poifsDir.hasEntry("Workbook") ||
+ // some XLS files have different entry-names
+ poifsDir.hasEntry("WORKBOOK") || poifsDir.hasEntry("BOOK")) {
if (getPreferEventExtractor()) {
return new EventBasedExcelExtractor(poifsDir);
}
Modified: poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java Fri Feb 27 09:59:14 2015
@@ -80,7 +80,11 @@ public class XSSFExcelExtractor extends
}
POIXMLTextExtractor extractor =
new XSSFExcelExtractor(args[0]);
- System.out.println(extractor.getText());
+ try {
+ System.out.println(extractor.getText());
+ } finally {
+ extractor.close();
+ }
}
/**
@@ -237,7 +241,7 @@ public class XSSFExcelExtractor extends
if (type == Cell.CELL_TYPE_NUMERIC) {
CellStyle cs = cell.getCellStyle();
- if (cs.getDataFormatString() != null) {
+ if (cs != null && cs.getDataFormatString() != null) {
text.append(formatter.formatRawCellContents(
cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString()
));
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org