You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2010/09/17 15:46:11 UTC

svn commit: r998131 - in /poi/trunk: src/documentation/content/xdocs/ src/scratchpad/src/org/apache/poi/hwpf/extractor/ src/scratchpad/src/org/apache/poi/hwpf/model/ src/scratchpad/src/org/apache/poi/hwpf/sprm/ src/scratchpad/testcases/org/apache/poi/h...

Author: nick
Date: Fri Sep 17 13:46:11 2010
New Revision: 998131

URL: http://svn.apache.org/viewvc?rev=998131&view=rev
Log:
Fix support for sections in old Word 6 / Word 95 files
Improve unit testing for HWPFOldDocument
The Sprm fix also improves handling of some HWPFDocument files

Added:
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
    poi/trunk/test-data/document/Word6_sections.doc   (with props)
Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Fri Sep 17 13:46:11 2010
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.7-beta3" date="2010-??-??">
+           <action dev="poi-developers" type="fix">49933 - Support sections in Word 6 and Word 95 files (HWPFOldDocument)</action>
            <action dev="poi-developers" type="fix">49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string</action>
            <action dev="poi-developers" type="fix">Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles</action>
            <action dev="poi-developers" type="add">Link XWPFPicture to XWPFRun, so that embedded pictures can be access from where they live in the text stream</action>

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java Fri Sep 17 13:46:11 2010
@@ -245,22 +245,22 @@ public final class WordExtractor extends
 	 *  but slightly slower than getTextFromPieces().
 	 */
 	public String getText() {
-		StringBuffer ret = new StringBuffer();
+	   StringBuffer ret = new StringBuffer();
 
-		ret.append(getHeaderText());
+	   ret.append(getHeaderText());
 
-                ArrayList<String> text = new ArrayList<String>();
-                text.addAll(Arrays.asList(getParagraphText()));
-                text.addAll(Arrays.asList(getFootnoteText()));
-                text.addAll(Arrays.asList(getEndnoteText()));
+	   ArrayList<String> text = new ArrayList<String>();
+	   text.addAll(Arrays.asList(getParagraphText()));
+	   text.addAll(Arrays.asList(getFootnoteText()));
+	   text.addAll(Arrays.asList(getEndnoteText()));
 
-		for(String p : text) {
-			ret.append(p);
-		}
+	   for(String p : text) {
+	      ret.append(p);
+	   }
 
-		ret.append(getFooterText());
+	   ret.append(getFooterText());
 
-		return ret.toString();
+	   return ret.toString();
 	}
 
 	/**

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java Fri Sep 17 13:46:11 2010
@@ -34,6 +34,7 @@ public final class OldSectionTable exten
                       TextPieceTable tpt)
   {
     PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
+    CharIsBytes charConv = new CharIsBytes(tpt);
 
     int length = sedPlex.length();
 
@@ -49,7 +50,7 @@ public final class OldSectionTable exten
       // check for the optimization
       if (fileOffset == 0xffffffff)
       {
-        _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
+        _sections.add(new SEPX(sed, startAt, endAt, charConv, new byte[0]));
       }
       else
       {
@@ -58,8 +59,32 @@ public final class OldSectionTable exten
         byte[] buf = new byte[sepxSize];
         fileOffset += LittleEndian.SHORT_SIZE;
         System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
-        _sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
+        _sections.add(new SEPX(sed, startAt, endAt, charConv, buf));
       }
     }
   }
+  
+  private static class CharIsBytes implements CharIndexTranslator {
+     private TextPieceTable tpt;
+     private CharIsBytes(TextPieceTable tpt) {
+        this.tpt = tpt;
+     }
+
+     public int getCharIndex(int bytePos, int startCP) {
+        return bytePos;
+     }
+     public int getCharIndex(int bytePos) {
+        return bytePos;
+     }
+
+     public boolean isIndexInTable(int bytePos) {
+        return tpt.isIndexInTable(bytePos);
+     }
+     public int lookIndexBackward(int bytePos) {
+        return tpt.lookIndexBackward(bytePos);
+     }
+     public int lookIndexForward(int bytePos) {
+        return tpt.lookIndexForward(bytePos);
+     }
+  }
 }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java Fri Sep 17 13:46:11 2010
@@ -216,6 +216,8 @@ public class TextPieceTable implements C
 
             if (bytePos< pieceStart || bytePos > pieceEnd) {
                 toAdd = bytesLength;
+            } else if (bytePos > pieceStart && bytePos < pieceEnd) {
+               toAdd = (bytePos - pieceStart);
             } else {
                 toAdd = bytesLength - (pieceEnd - bytePos);
             }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java Fri Sep 17 13:46:11 2010
@@ -37,7 +37,8 @@ public final class SprmIterator
 
   public boolean hasNext()
   {
-    return _offset < _grpprl.length;
+    // A Sprm is at least 2 bytes long
+    return _offset < (_grpprl.length-1);
   }
 
   public SprmOperation next()

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java Fri Sep 17 13:46:11 2010
@@ -20,22 +20,68 @@ package org.apache.poi.hwpf;
 import junit.framework.Test;
 import junit.framework.TestSuite;
 
-import org.apache.poi.hwpf.model.*;
+import org.apache.poi.hwpf.extractor.TestWordExtractor;
+import org.apache.poi.hwpf.extractor.TestWordExtractorBugs;
+import org.apache.poi.hwpf.model.TestCHPBinTable;
+import org.apache.poi.hwpf.model.TestDocumentProperties;
+import org.apache.poi.hwpf.model.TestFileInformationBlock;
+import org.apache.poi.hwpf.model.TestFontTable;
+import org.apache.poi.hwpf.model.TestListTables;
+import org.apache.poi.hwpf.model.TestPAPBinTable;
+import org.apache.poi.hwpf.model.TestPlexOfCps;
+import org.apache.poi.hwpf.model.TestRevisionMarkAuthorTable;
+import org.apache.poi.hwpf.model.TestSavedByTable;
+import org.apache.poi.hwpf.model.TestSectionTable;
+import org.apache.poi.hwpf.model.TestStyleSheet;
+import org.apache.poi.hwpf.model.TestTextPieceTable;
+import org.apache.poi.hwpf.usermodel.TestBug46610;
+import org.apache.poi.hwpf.usermodel.TestHWPFOldDocument;
+import org.apache.poi.hwpf.usermodel.TestHeaderStories;
+import org.apache.poi.hwpf.usermodel.TestPictures;
+import org.apache.poi.hwpf.usermodel.TestProblems;
+import org.apache.poi.hwpf.usermodel.TestRange;
+import org.apache.poi.hwpf.usermodel.TestRangeDelete;
+import org.apache.poi.hwpf.usermodel.TestRangeInsertion;
+import org.apache.poi.hwpf.usermodel.TestRangeProperties;
+import org.apache.poi.hwpf.usermodel.TestRangeReplacement;
+import org.apache.poi.hwpf.usermodel.TestShapes;
 
 public final class AllHWPFTests {
 
 	public static Test suite() {
 		TestSuite suite = new TestSuite(AllHWPFTests.class.getName());
+
+		suite.addTestSuite(TestHWPFPictures.class);
+		suite.addTestSuite(TestHWPFRangeParts.class);
+
+		suite.addTestSuite(TestWordExtractor.class);
+		suite.addTestSuite(TestWordExtractorBugs.class);
+
 		suite.addTestSuite(TestCHPBinTable.class);
 		suite.addTestSuite(TestDocumentProperties.class);
 		suite.addTestSuite(TestFileInformationBlock.class);
 		suite.addTestSuite(TestFontTable.class);
+		suite.addTestSuite(TestListTables.class);
 		suite.addTestSuite(TestPAPBinTable.class);
 		suite.addTestSuite(TestPlexOfCps.class);
+		suite.addTestSuite(TestRevisionMarkAuthorTable.class);
+		suite.addTestSuite(TestSavedByTable.class);
 		suite.addTestSuite(TestSectionTable.class);
 		suite.addTestSuite(TestStyleSheet.class);
 		suite.addTestSuite(TestTextPieceTable.class);
-		suite.addTestSuite(TestListTables.class);
+
+		suite.addTestSuite(TestBug46610.class);
+		suite.addTestSuite(TestHeaderStories.class);
+		suite.addTestSuite(TestHWPFOldDocument.class);
+		suite.addTestSuite(TestPictures.class);
+		suite.addTestSuite(TestProblems.class);
+		suite.addTestSuite(TestRange.class);
+		suite.addTestSuite(TestRangeDelete.class);
+		suite.addTestSuite(TestRangeInsertion.class);
+		suite.addTestSuite(TestRangeProperties.class);
+		suite.addTestSuite(TestRangeReplacement.class);
+		suite.addTestSuite(TestShapes.class);
+
 		return suite;
 	}
 }

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java Fri Sep 17 13:46:11 2010
@@ -17,6 +17,7 @@
 package org.apache.poi.hwpf;
 
 import org.apache.poi.POIDataSamples;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 import java.io.*;
 
@@ -30,6 +31,14 @@ public class HWPFTestDataSamples {
             throw new RuntimeException(e);
         }
     }
+    public static HWPFOldDocument openOldSampleFile(String sampleFileName) {
+       try {
+           InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName);
+           return new HWPFOldDocument(new POIFSFileSystem(is));
+       } catch (IOException e) {
+           throw new RuntimeException(e);
+       }
+   }
     /**
      * Writes a spreadsheet to a <tt>ByteArrayOutputStream</tt> and reads it back
      * from a <tt>ByteArrayInputStream</tt>.<p/>

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java?rev=998131&r1=998130&r2=998131&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java Fri Sep 17 13:46:11 2010
@@ -52,7 +52,7 @@ public final class TestWordExtractor ext
 
 	// Well behaved document
 	private WordExtractor extractor;
-	// Corrupted document - can't do paragraph based stuff
+	// Slightly iffy document
 	private WordExtractor extractor2;
 	// A word doc embeded in an excel file
 	private String filename3;
@@ -93,8 +93,11 @@ public final class TestWordExtractor ext
 			assertEquals(p_text1[i], text[i]);
 		}
 
-		// On second one, should fall back
-		assertEquals(1, extractor2.getParagraphText().length);
+		// Lots of paragraphs with only a few lines in them
+		assertEquals(24, extractor2.getParagraphText().length);
+		assertEquals("as d\r\n", extractor2.getParagraphText()[16]);
+      assertEquals("as d\r\n", extractor2.getParagraphText()[17]);
+      assertEquals("as d\r\n", extractor2.getParagraphText()[18]);
 	}
 
 	/**
@@ -103,8 +106,11 @@ public final class TestWordExtractor ext
 	public void testGetText() {
 		assertEquals(p_text1_block, extractor.getText());
 
-		// On second one, should fall back to text piece
-		assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
+		// For the 2nd, should give similar answers for
+		//  the two methods, differing only in line endings
+		assertEquals(
+		      extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), 
+		      extractor2.getText().replaceAll("[\\r\\n]", ""));
 	}
 
 	/**

Added: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java?rev=998131&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java (added)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java Fri Sep 17 13:46:11 2010
@@ -0,0 +1,122 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.OldFileFormatException;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.HWPFTestCase;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+
+/**
+ * Tests for Word 6 and Word 95 support
+ */
+public final class TestHWPFOldDocument extends HWPFTestCase {
+   /**
+    * Test a simple Word 6 document
+    */
+   public void testWord6() throws Exception {
+      // Can't open as HWPFDocument
+      try {
+         HWPFTestDataSamples.openSampleFile("Word6.doc");
+         fail("Shouldn't be openable");
+      } catch(OldFileFormatException e) {}
+      
+      // Open
+      HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc");
+      
+      // Check
+      assertEquals(1, doc.getRange().numSections());
+      assertEquals(1, doc.getRange().numParagraphs());
+      assertEquals(1, doc.getRange().numCharacterRuns());
+      
+      assertEquals(
+            "The quick brown fox jumps over the lazy dog\r",
+            doc.getRange().getParagraph(0).text()
+      );
+   }
+   
+   /**
+    * Test a simple Word 95 document
+    */
+   public void testWord95() throws Exception {
+      // Can't open as HWPFDocument
+      try {
+         HWPFTestDataSamples.openSampleFile("Word95.doc");
+         fail("Shouldn't be openable");
+      } catch(OldFileFormatException e) {}
+      
+      // Open
+      HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word95.doc");
+      
+      // Check
+      assertEquals(1, doc.getRange().numSections());
+      assertEquals(7, doc.getRange().numParagraphs());
+      
+      assertEquals(
+            "The quick brown fox jumps over the lazy dog\r",
+            doc.getRange().getParagraph(0).text()
+      );
+      assertEquals("\r", doc.getRange().getParagraph(1).text());
+      assertEquals(
+            "Paragraph 2\r",
+            doc.getRange().getParagraph(2).text()
+      );
+      assertEquals("\r", doc.getRange().getParagraph(3).text());
+      assertEquals(
+            "Paragraph 3. Has some RED text and some " +
+            "BLUE BOLD text in it.\r",
+            doc.getRange().getParagraph(4).text()
+      );
+      assertEquals("\r", doc.getRange().getParagraph(5).text());
+      assertEquals(
+            "Last (4th) paragraph.\r",
+            doc.getRange().getParagraph(6).text()
+      );
+      
+      assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns());
+      assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns());
+      assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns());
+      assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns());
+      // Normal, red, normal, blue+bold, normal
+      assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns());
+      assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
+      // Normal, superscript for 4th, normal
+      assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
+   }
+   
+   /**
+    * Test a word document that has sections,
+    *  as well as the usual paragraph stuff.
+    */
+   public void testWord6Sections() throws Exception {
+      HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc");
+      
+      assertEquals(3, doc.getRange().numSections());
+      assertEquals(6, doc.getRange().numParagraphs());
+      
+      assertEquals(
+            "This is a test.\r",
+            doc.getRange().getParagraph(0).text()
+      );
+      assertEquals("\r", doc.getRange().getParagraph(1).text());
+      assertEquals("\u000c", doc.getRange().getParagraph(2).text()); // Section line?
+      assertEquals("This is a new section.\r", doc.getRange().getParagraph(3).text());
+      assertEquals("\u000c", doc.getRange().getParagraph(4).text()); // Section line?
+      assertEquals("\r", doc.getRange().getParagraph(5).text());
+   }
+}

Added: poi/trunk/test-data/document/Word6_sections.doc
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/document/Word6_sections.doc?rev=998131&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/document/Word6_sections.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org