You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2010/06/30 18:08:11 UTC
svn commit: r959360 - in /poi/trunk/src: documentation/content/xdocs/
ooxml/java/org/apache/poi/extractor/
ooxml/testcases/org/apache/poi/extractor/ scratchpad/src/org/apache/poi/hwpf/
Author: nick
Date: Wed Jun 30 16:08:10 2010
New Revision: 959360
URL: http://svn.apache.org/viewvc?rev=959360&view=rev
Log:
Enable Word6Extractor in ExtractorFactory
Modified:
poi/trunk/src/documentation/content/xdocs/status.xml
poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=959360&r1=959359&r2=959360&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Wed Jun 30 16:08:10 2010
@@ -34,6 +34,7 @@
<changes>
<release version="3.7-beta2" date="2010-??-??">
+ <action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
<action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
<action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
<action dev="POI-DEVELOPERS" type="fix">XSLFSlideShow shouldn't break on .thmx (theme) files. Support for them is still very limited though</action>
Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=959360&r1=959359&r2=959360&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Wed Jun 30 16:08:10 2010
@@ -38,6 +38,8 @@ import org.apache.poi.hsmf.datatypes.Att
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
@@ -218,7 +220,12 @@ public class ExtractorFactory {
}
}
if(entry.getName().equals("WordDocument")) {
- return new WordExtractor(poifsDir, fs);
+ // Old or new style word document?
+ try {
+ return new WordExtractor(poifsDir, fs);
+ } catch(OldWordFileFormatException e) {
+ return new Word6Extractor(poifsDir, fs);
+ }
}
if(entry.getName().equals("PowerPoint Document")) {
return new PowerPointExtractor(poifsDir, fs);
@@ -230,12 +237,12 @@ public class ExtractorFactory {
return new PublisherTextExtractor(poifsDir, fs);
}
if(
- entry.getName().equals("__substg1.0_1000001E") ||
- entry.getName().equals("__substg1.0_1000001F") ||
- entry.getName().equals("__substg1.0_0047001E") ||
- entry.getName().equals("__substg1.0_0047001F") ||
- entry.getName().equals("__substg1.0_0037001E") ||
- entry.getName().equals("__substg1.0_0037001F")
+ entry.getName().equals("__substg1.0_1000001E") ||
+ entry.getName().equals("__substg1.0_1000001F") ||
+ entry.getName().equals("__substg1.0_0047001E") ||
+ entry.getName().equals("__substg1.0_0047001F") ||
+ entry.getName().equals("__substg1.0_0037001E") ||
+ entry.getName().equals("__substg1.0_0037001F")
) {
return new OutlookTextExtactor(poifsDir, fs);
}
Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=959360&r1=959359&r2=959360&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java Wed Jun 30 16:08:10 2010
@@ -29,6 +29,7 @@ import org.apache.poi.hslf.extractor.Pow
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
@@ -54,6 +55,8 @@ public class TestExtractorFactory extend
private File xlsEmb;
private File doc;
+ private File doc6;
+ private File doc95;
private File docx;
private File dotx;
private File docEmb;
@@ -79,6 +82,8 @@ public class TestExtractorFactory extend
POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
doc = wpTests.getFile("SampleDoc.doc");
+ doc6 = wpTests.getFile("Word6.doc");
+ doc95 = wpTests.getFile("Word95.doc");
docx = wpTests.getFile("SampleDoc.docx");
dotx = wpTests.getFile("test.dotx");
docEmb = wpTests.getFile("word_with_embeded.doc");
@@ -136,6 +141,23 @@ public class TestExtractorFactory extend
);
assertTrue(
+ ExtractorFactory.createExtractor(doc6)
+ instanceof Word6Extractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(doc6).getText().length() > 20
+ );
+
+ assertTrue(
+ ExtractorFactory.createExtractor(doc95)
+ instanceof Word6Extractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(doc95).getText().length() > 120
+ );
+
+
+ assertTrue(
ExtractorFactory.createExtractor(docx)
instanceof XWPFWordExtractor
);
@@ -231,6 +253,22 @@ public class TestExtractorFactory extend
ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120
);
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(doc6))
+ instanceof Word6Extractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(doc6)).getText().length() > 20
+ );
+
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(doc95))
+ instanceof Word6Extractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(doc95)).getText().length() > 120
+ );
+
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(docx))
instanceof XWPFWordExtractor
@@ -311,6 +349,22 @@ public class TestExtractorFactory extend
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
);
+ assertTrue(
+ ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
+ instanceof Word6Extractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
+ );
+
+ assertTrue(
+ ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
+ instanceof Word6Extractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
+ );
+
// PowerPoint
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java?rev=959360&r1=959359&r2=959360&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java Wed Jun 30 16:08:10 2010
@@ -169,7 +169,7 @@ public final class HWPFDocument extends
// Is this document too old for us?
if(_fib.getNFib() < 106) {
- throw new OldWordFileFormatException("The document is too old (Word 95 or older) ");
+ throw new OldWordFileFormatException("The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
}
// use the fib to determine the name of the table stream.
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org