You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2010/09/17 15:46:11 UTC
svn commit: r998131 - in /poi/trunk: src/documentation/content/xdocs/
src/scratchpad/src/org/apache/poi/hwpf/extractor/
src/scratchpad/src/org/apache/poi/hwpf/model/
src/scratchpad/src/org/apache/poi/hwpf/sprm/
src/scratchpad/testcases/org/apache/poi/h...
Author: nick
Date: Fri Sep 17 13:46:11 2010
New Revision: 998131
URL: http://svn.apache.org/viewvc?rev=998131&view=rev
Log:
Fix support for sections in old Word 6 / Word 95 files
Improve unit testing for HWPFOldDocument
The Sprm fix improves the handling of some HWPFDocument files too!
Added:
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
poi/trunk/test-data/document/Word6_sections.doc (with props)
Modified:
poi/trunk/src/documentation/content/xdocs/status.xml
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Fri Sep 17 13:46:11 2010
@@ -34,6 +34,7 @@
<changes>
<release version="3.7-beta3" date="2010-??-??">
+ <action dev="poi-developers" type="fix">49933 - Support sections in Word 6 and Word 95 files (HWPFOldDocument)</action>
<action dev="poi-developers" type="fix">49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string</action>
<action dev="poi-developers" type="fix">Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles</action>
<action dev="poi-developers" type="add">Link XWPFPicture to XWPFRun, so that embedded pictures can be access from where they live in the text stream</action>
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java Fri Sep 17 13:46:11 2010
@@ -245,22 +245,22 @@ public final class WordExtractor extends
* but slightly slower than getTextFromPieces().
*/
public String getText() {
- StringBuffer ret = new StringBuffer();
+ StringBuffer ret = new StringBuffer();
- ret.append(getHeaderText());
+ ret.append(getHeaderText());
- ArrayList<String> text = new ArrayList<String>();
- text.addAll(Arrays.asList(getParagraphText()));
- text.addAll(Arrays.asList(getFootnoteText()));
- text.addAll(Arrays.asList(getEndnoteText()));
+ ArrayList<String> text = new ArrayList<String>();
+ text.addAll(Arrays.asList(getParagraphText()));
+ text.addAll(Arrays.asList(getFootnoteText()));
+ text.addAll(Arrays.asList(getEndnoteText()));
- for(String p : text) {
- ret.append(p);
- }
+ for(String p : text) {
+ ret.append(p);
+ }
- ret.append(getFooterText());
+ ret.append(getFooterText());
- return ret.toString();
+ return ret.toString();
}
/**
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java Fri Sep 17 13:46:11 2010
@@ -34,6 +34,7 @@ public final class OldSectionTable exten
TextPieceTable tpt)
{
PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
+ CharIsBytes charConv = new CharIsBytes(tpt);
int length = sedPlex.length();
@@ -49,7 +50,7 @@ public final class OldSectionTable exten
// check for the optimization
if (fileOffset == 0xffffffff)
{
- _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
+ _sections.add(new SEPX(sed, startAt, endAt, charConv, new byte[0]));
}
else
{
@@ -58,8 +59,32 @@ public final class OldSectionTable exten
byte[] buf = new byte[sepxSize];
fileOffset += LittleEndian.SHORT_SIZE;
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
- _sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
+ _sections.add(new SEPX(sed, startAt, endAt, charConv, buf));
}
}
}
+
+ private static class CharIsBytes implements CharIndexTranslator {
+ private TextPieceTable tpt;
+ private CharIsBytes(TextPieceTable tpt) {
+ this.tpt = tpt;
+ }
+
+ public int getCharIndex(int bytePos, int startCP) {
+ return bytePos;
+ }
+ public int getCharIndex(int bytePos) {
+ return bytePos;
+ }
+
+ public boolean isIndexInTable(int bytePos) {
+ return tpt.isIndexInTable(bytePos);
+ }
+ public int lookIndexBackward(int bytePos) {
+ return tpt.lookIndexBackward(bytePos);
+ }
+ public int lookIndexForward(int bytePos) {
+ return tpt.lookIndexForward(bytePos);
+ }
+ }
}
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java Fri Sep 17 13:46:11 2010
@@ -216,6 +216,8 @@ public class TextPieceTable implements C
if (bytePos< pieceStart || bytePos > pieceEnd) {
toAdd = bytesLength;
+ } else if (bytePos > pieceStart && bytePos < pieceEnd) {
+ toAdd = (bytePos - pieceStart);
} else {
toAdd = bytesLength - (pieceEnd - bytePos);
}
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java Fri Sep 17 13:46:11 2010
@@ -37,7 +37,8 @@ public final class SprmIterator
public boolean hasNext()
{
- return _offset < _grpprl.length;
+ // A Sprm is at least 2 bytes long
+ return _offset < (_grpprl.length-1);
}
public SprmOperation next()
Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java Fri Sep 17 13:46:11 2010
@@ -20,22 +20,68 @@ package org.apache.poi.hwpf;
import junit.framework.Test;
import junit.framework.TestSuite;
-import org.apache.poi.hwpf.model.*;
+import org.apache.poi.hwpf.extractor.TestWordExtractor;
+import org.apache.poi.hwpf.extractor.TestWordExtractorBugs;
+import org.apache.poi.hwpf.model.TestCHPBinTable;
+import org.apache.poi.hwpf.model.TestDocumentProperties;
+import org.apache.poi.hwpf.model.TestFileInformationBlock;
+import org.apache.poi.hwpf.model.TestFontTable;
+import org.apache.poi.hwpf.model.TestListTables;
+import org.apache.poi.hwpf.model.TestPAPBinTable;
+import org.apache.poi.hwpf.model.TestPlexOfCps;
+import org.apache.poi.hwpf.model.TestRevisionMarkAuthorTable;
+import org.apache.poi.hwpf.model.TestSavedByTable;
+import org.apache.poi.hwpf.model.TestSectionTable;
+import org.apache.poi.hwpf.model.TestStyleSheet;
+import org.apache.poi.hwpf.model.TestTextPieceTable;
+import org.apache.poi.hwpf.usermodel.TestBug46610;
+import org.apache.poi.hwpf.usermodel.TestHWPFOldDocument;
+import org.apache.poi.hwpf.usermodel.TestHeaderStories;
+import org.apache.poi.hwpf.usermodel.TestPictures;
+import org.apache.poi.hwpf.usermodel.TestProblems;
+import org.apache.poi.hwpf.usermodel.TestRange;
+import org.apache.poi.hwpf.usermodel.TestRangeDelete;
+import org.apache.poi.hwpf.usermodel.TestRangeInsertion;
+import org.apache.poi.hwpf.usermodel.TestRangeProperties;
+import org.apache.poi.hwpf.usermodel.TestRangeReplacement;
+import org.apache.poi.hwpf.usermodel.TestShapes;
public final class AllHWPFTests {
public static Test suite() {
TestSuite suite = new TestSuite(AllHWPFTests.class.getName());
+
+ suite.addTestSuite(TestHWPFPictures.class);
+ suite.addTestSuite(TestHWPFRangeParts.class);
+
+ suite.addTestSuite(TestWordExtractor.class);
+ suite.addTestSuite(TestWordExtractorBugs.class);
+
suite.addTestSuite(TestCHPBinTable.class);
suite.addTestSuite(TestDocumentProperties.class);
suite.addTestSuite(TestFileInformationBlock.class);
suite.addTestSuite(TestFontTable.class);
+ suite.addTestSuite(TestListTables.class);
suite.addTestSuite(TestPAPBinTable.class);
suite.addTestSuite(TestPlexOfCps.class);
+ suite.addTestSuite(TestRevisionMarkAuthorTable.class);
+ suite.addTestSuite(TestSavedByTable.class);
suite.addTestSuite(TestSectionTable.class);
suite.addTestSuite(TestStyleSheet.class);
suite.addTestSuite(TestTextPieceTable.class);
- suite.addTestSuite(TestListTables.class);
+
+ suite.addTestSuite(TestBug46610.class);
+ suite.addTestSuite(TestHeaderStories.class);
+ suite.addTestSuite(TestHWPFOldDocument.class);
+ suite.addTestSuite(TestPictures.class);
+ suite.addTestSuite(TestProblems.class);
+ suite.addTestSuite(TestRange.class);
+ suite.addTestSuite(TestRangeDelete.class);
+ suite.addTestSuite(TestRangeInsertion.class);
+ suite.addTestSuite(TestRangeProperties.class);
+ suite.addTestSuite(TestRangeReplacement.class);
+ suite.addTestSuite(TestShapes.class);
+
return suite;
}
}
Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java Fri Sep 17 13:46:11 2010
@@ -17,6 +17,7 @@
package org.apache.poi.hwpf;
import org.apache.poi.POIDataSamples;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import java.io.*;
@@ -30,6 +31,14 @@ public class HWPFTestDataSamples {
throw new RuntimeException(e);
}
}
+ public static HWPFOldDocument openOldSampleFile(String sampleFileName) {
+ try {
+ InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName);
+ return new HWPFOldDocument(new POIFSFileSystem(is));
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
/**
* Writes a spreadsheet to a <tt>ByteArrayOutputStream</tt> and reads it back
* from a <tt>ByteArrayInputStream</tt>.<p/>
Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java Fri Sep 17 13:46:11 2010
@@ -52,7 +52,7 @@ public final class TestWordExtractor ext
// Well behaved document
private WordExtractor extractor;
- // Corrupted document - can't do paragraph based stuff
+ // Slightly iffy document
private WordExtractor extractor2;
// A word doc embeded in an excel file
private String filename3;
@@ -93,8 +93,11 @@ public final class TestWordExtractor ext
assertEquals(p_text1[i], text[i]);
}
- // On second one, should fall back
- assertEquals(1, extractor2.getParagraphText().length);
+ // Lots of paragraphs with only a few lines in them
+ assertEquals(24, extractor2.getParagraphText().length);
+ assertEquals("as d\r\n", extractor2.getParagraphText()[16]);
+ assertEquals("as d\r\n", extractor2.getParagraphText()[17]);
+ assertEquals("as d\r\n", extractor2.getParagraphText()[18]);
}
/**
@@ -103,8 +106,11 @@ public final class TestWordExtractor ext
public void testGetText() {
assertEquals(p_text1_block, extractor.getText());
- // On second one, should fall back to text piece
- assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
+ // For the 2nd, should give similar answers for
+ // the two methods, differing only in line endings
+ assertEquals(
+ extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
+ extractor2.getText().replaceAll("[\\r\\n]", ""));
}
/**
Added: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java?rev=998131&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java (added)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java Fri Sep 17 13:46:11 2010
@@ -0,0 +1,122 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.OldFileFormatException;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.HWPFTestCase;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+
+/**
+ * Tests for Word 6 and Word 95 support
+ */
+public final class TestHWPFOldDocument extends HWPFTestCase {
+ /**
+ * Test a simple Word 6 document
+ */
+ public void testWord6() throws Exception {
+ // Can't open as HWPFDocument
+ try {
+ HWPFTestDataSamples.openSampleFile("Word6.doc");
+ fail("Shouldn't be openable");
+ } catch(OldFileFormatException e) {}
+
+ // Open
+ HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc");
+
+ // Check
+ assertEquals(1, doc.getRange().numSections());
+ assertEquals(1, doc.getRange().numParagraphs());
+ assertEquals(1, doc.getRange().numCharacterRuns());
+
+ assertEquals(
+ "The quick brown fox jumps over the lazy dog\r",
+ doc.getRange().getParagraph(0).text()
+ );
+ }
+
+ /**
+ * Test a simple Word 95 document
+ */
+ public void testWord95() throws Exception {
+ // Can't open as HWPFDocument
+ try {
+ HWPFTestDataSamples.openSampleFile("Word95.doc");
+ fail("Shouldn't be openable");
+ } catch(OldFileFormatException e) {}
+
+ // Open
+ HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word95.doc");
+
+ // Check
+ assertEquals(1, doc.getRange().numSections());
+ assertEquals(7, doc.getRange().numParagraphs());
+
+ assertEquals(
+ "The quick brown fox jumps over the lazy dog\r",
+ doc.getRange().getParagraph(0).text()
+ );
+ assertEquals("\r", doc.getRange().getParagraph(1).text());
+ assertEquals(
+ "Paragraph 2\r",
+ doc.getRange().getParagraph(2).text()
+ );
+ assertEquals("\r", doc.getRange().getParagraph(3).text());
+ assertEquals(
+ "Paragraph 3. Has some RED text and some " +
+ "BLUE BOLD text in it.\r",
+ doc.getRange().getParagraph(4).text()
+ );
+ assertEquals("\r", doc.getRange().getParagraph(5).text());
+ assertEquals(
+ "Last (4th) paragraph.\r",
+ doc.getRange().getParagraph(6).text()
+ );
+
+ assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns());
+ assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns());
+ assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns());
+ assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns());
+ // Normal, red, normal, blue+bold, normal
+ assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns());
+ assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
+ // Normal, superscript for 4th, normal
+ assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
+ }
+
+ /**
+ * Test a word document that has sections,
+ * as well as the usual paragraph stuff.
+ */
+ public void testWord6Sections() throws Exception {
+ HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc");
+
+ assertEquals(3, doc.getRange().numSections());
+ assertEquals(6, doc.getRange().numParagraphs());
+
+ assertEquals(
+ "This is a test.\r",
+ doc.getRange().getParagraph(0).text()
+ );
+ assertEquals("\r", doc.getRange().getParagraph(1).text());
+ assertEquals("\u000c", doc.getRange().getParagraph(2).text()); // Section line?
+ assertEquals("This is a new section.\r", doc.getRange().getParagraph(3).text());
+ assertEquals("\u000c", doc.getRange().getParagraph(4).text()); // Section line?
+ assertEquals("\r", doc.getRange().getParagraph(5).text());
+ }
+}
Added: poi/trunk/test-data/document/Word6_sections.doc
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/document/Word6_sections.doc?rev=998131&view=auto
==============================================================================
Binary file - no diff available.
Propchange: poi/trunk/test-data/document/Word6_sections.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org