You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by se...@apache.org on 2011/09/07 14:12:17 UTC
svn commit: r1166144 - in /poi/trunk/src: documentation/content/xdocs/
scratchpad/src/org/apache/poi/hwpf/model/
scratchpad/testcases/org/apache/poi/hwpf/usermodel/
Author: sergey
Date: Wed Sep 7 12:12:17 2011
New Revision: 1166144
URL: http://svn.apache.org/viewvc?rev=1166144&view=rev
Log:
fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 - 2003;
Replace byte->char translation with byte range -> char range_S_ translation for PAPX / CHPX tables
Modified:
poi/trunk/src/documentation/content/xdocs/status.xml
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=1166144&r1=1166143&r2=1166144&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Wed Sep 7 12:12:17 2011
@@ -34,6 +34,7 @@
<changes>
<release version="3.8-beta5" date="2011-??-??">
+ <action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
<action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
<action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
<action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java?rev=1166144&r1=1166143&r2=1166144&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java Wed Sep 7 12:12:17 2011
@@ -101,18 +101,21 @@ public class CHPBinTable
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
pageOffset, translator);
- int fkpSize = cfkp.size();
-
- for (int y = 0; y < fkpSize; y++)
- {
- final CHPX chpx = cfkp.getCHPX(y);
- if (chpx != null)
- _textRuns.add(chpx);
- }
+ for ( CHPX chpx : cfkp.getCHPXs() )
+ {
+ if ( chpx != null )
+ _textRuns.add( chpx );
+ }
}
logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _textRuns.size() ), " elements)" );
+
+ if ( _textRuns.isEmpty() )
+ {
+ logger.log( POILogger.WARN, "CHPX FKPs are empty" );
+ _textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
+ }
}
public void rebuild( ComplexFileTable complexFileTable )
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java?rev=1166144&r1=1166143&r2=1166144&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java Wed Sep 7 12:12:17 2011
@@ -18,6 +18,7 @@
package org.apache.poi.hwpf.model;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import org.apache.poi.hwpf.sprm.SprmBuffer;
@@ -82,15 +83,17 @@ public final class CHPFormattedDiskPage
int bytesStartAt = getStart( x );
int bytesEndAt = getEnd( x );
- int charStartAt = translator.getCharIndex( bytesStartAt );
- int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+ // int charStartAt = translator.getCharIndex( bytesStartAt );
+ // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+ // );
- // TODO: CHECK!
- // CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x
- // ) );
- CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer(
- getGrpprl( x ), 0 ) );
- _chpxList.add( chpx );
+ for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+ bytesEndAt ) )
+ {
+ CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
+ getGrpprl( x ), 0 ) );
+ _chpxList.add( chpx );
+ }
}
}
@@ -99,6 +102,11 @@ public final class CHPFormattedDiskPage
return _chpxList.get(index);
}
+ public List<CHPX> getCHPXs()
+ {
+ return Collections.unmodifiableList( _chpxList );
+ }
+
public void fill(List<CHPX> filler)
{
_chpxList.addAll(filler);
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java?rev=1166144&r1=1166143&r2=1166144&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java Wed Sep 7 12:12:17 2011
@@ -31,12 +31,16 @@ public interface CharIndexTranslator {
int getByteIndex( int charPos );
/**
- * Calculates the char index of the given byte index.
- * Look forward if index is not in table
- *
- * @param bytePos The character offset to check
+ * Calculates the char index of the given byte index. Look forward if index
+ * is not in table
+ *
+ * @param bytePos
+ * The character offset to check
* @return the char index
+ * @deprecated This API was based on the incorrect assumption that a single
+ * byte offset corresponds to a single char offset
*/
+ @Deprecated
int getCharIndex(int bytePos);
/**
@@ -46,16 +50,29 @@ public interface CharIndexTranslator {
* @param bytePos The character offset to check
* @param startCP look from this characted position
* @return the char index
+ * @deprecated This API was based on the incorrect assumption that a single
+ * byte offset corresponds to a single char offset
*/
+ @Deprecated
int getCharIndex(int bytePos, int startCP);
-
+
+ /**
+ * Finds the character ranges that include the specified byte range.
+ *
+ * @param startBytePosInclusive
+ * start byte range
+ * @param endBytePosExclusive
+ * end byte range
+ */
+ int[][] getCharIndexRanges( int startBytePosInclusive,
+ int endBytePosExclusive );
+
/**
* Check if index is in table
- *
+ *
* @param bytePos
* @return true if index in table, false if not
*/
-
boolean isIndexInTable(int bytePos);
/**
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java?rev=1166144&r1=1166143&r2=1166144&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java Wed Sep 7 12:12:17 2011
@@ -92,12 +92,8 @@ public class PAPBinTable
documentStream, dataStream, pageOffset,
charIndexTranslator );
- int fkpSize = pfkp.size();
-
- for ( int y = 0; y < fkpSize; y++ )
+ for ( PAPX papx : pfkp.getPAPXs() )
{
- PAPX papx = pfkp.getPAPX( y );
-
if ( papx != null )
_paragraphs.add( papx );
}
@@ -107,6 +103,12 @@ public class PAPBinTable
logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _paragraphs.size() ), " elements)" );
+
+ if ( _paragraphs.isEmpty() )
+ {
+ logger.log( POILogger.WARN, "PAPX FKPs are empty" );
+ _paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
+ }
}
public void rebuild( final StringBuilder docText,
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java?rev=1166144&r1=1166143&r2=1166144&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java Wed Sep 7 12:12:17 2011
@@ -23,6 +23,8 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
+import org.apache.poi.hwpf.sprm.SprmBuffer;
+
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
@@ -88,12 +90,20 @@ public final class PAPFormattedDiskPage
int bytesStartAt = getStart( x );
int bytesEndAt = getEnd( x );
- int charStartAt = translator.getCharIndex( bytesStartAt );
- int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+ // int charStartAt = translator.getCharIndex( bytesStartAt );
+ // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+ // );
+ // PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
+ // getParagraphHeight( x ), dataStream );
+ // _papxList.add( papx );
- PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
- getParagraphHeight( x ), dataStream );
- _papxList.add( papx );
+ for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+ bytesEndAt ) )
+ {
+ PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
+ getParagraphHeight( x ), dataStream );
+ _papxList.add( papx );
+ }
}
_fkp = null;
}
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java?rev=1166144&r1=1166143&r2=1166144&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java Wed Sep 7 12:12:17 2011
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
+import java.util.LinkedList;
import java.util.List;
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
@@ -107,8 +108,10 @@ public class TextPieceTable implements C
System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
// And now build the piece
- _textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
- pieces[x] ) );
+ final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
+ pieces[x] );
+
+ _textPieces.add( newTextPiece );
}
// In the interest of our sanity, now sort the text pieces
@@ -201,11 +204,13 @@ public class TextPieceTable implements C
return byteCount;
}
+ @Deprecated
public int getCharIndex( int bytePos )
{
return getCharIndex( bytePos, 0 );
}
+ @Deprecated
public int getCharIndex( int startBytePos, int startCP )
{
int charCount = 0;
@@ -253,6 +258,42 @@ public class TextPieceTable implements C
return charCount;
}
+ public int[][] getCharIndexRanges( int startBytePosInclusive,
+ int endBytePosExclusive )
+ {
+ List<int[]> result = new LinkedList<int[]>();
+ for ( TextPiece textPiece : _textPiecesFCOrder )
+ {
+ final int tpStart = textPiece.getPieceDescriptor()
+ .getFilePosition();
+ final int tpEnd = textPiece.getPieceDescriptor().getFilePosition()
+ + textPiece.bytesLength();
+ if ( startBytePosInclusive > tpEnd )
+ continue;
+ if ( endBytePosExclusive < tpStart )
+ break;
+
+ final int rangeStartBytes = Math.max( tpStart,
+ startBytePosInclusive );
+ final int rangeEndBytes = Math.min( tpEnd, endBytePosExclusive );
+ final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;
+
+ if ( rangeStartBytes > rangeEndBytes )
+ continue;
+
+ final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
+
+ final int rangeStartCp = textPiece.getStart()
+ + ( rangeStartBytes - tpStart ) / encodingMultiplier;
+ final int rangeEndCp = rangeStartCp + rangeLengthBytes
+ / encodingMultiplier;
+
+ result.add( new int[] { rangeStartCp, rangeEndCp } );
+ }
+
+ return result.toArray( new int[result.size()][] );
+ }
+
public int getCpMin()
{
return _cpMin;
@@ -377,24 +418,42 @@ public class TextPieceTable implements C
public int lookIndexForward( final int startBytePos )
{
- int bytePos = startBytePos;
- for ( TextPiece tp : _textPiecesFCOrder )
- {
- int pieceStart = tp.getPieceDescriptor().getFilePosition();
+ if ( _textPiecesFCOrder.isEmpty() )
+ throw new IllegalStateException( "Text pieces table is empty" );
- if ( bytePos >= pieceStart + tp.bytesLength() )
- {
- continue;
- }
+ if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
+ return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();
- if ( pieceStart > bytePos )
- {
- bytePos = pieceStart;
- }
+ if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
+ .getPieceDescriptor().getFilePosition() <= startBytePos )
+ return startBytePos;
- break;
+ int low = 0;
+ int high = _textPiecesFCOrder.size() - 1;
+
+ while ( low <= high )
+ {
+ int mid = ( low + high ) >>> 1;
+ final TextPiece textPiece = _textPiecesFCOrder.get( mid );
+ int midVal = textPiece.getPieceDescriptor().getFilePosition();
+
+ if ( midVal < startBytePos )
+ low = mid + 1;
+ else if ( midVal > startBytePos )
+ high = mid - 1;
+ else
+ // found piece with exact start
+ return textPiece.getPieceDescriptor().getFilePosition();
}
- return bytePos;
+ assert low == high;
+ assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
+ .getFilePosition() < startBytePos;
+ // last line can't be current, can it?
+ assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor()
+ .getFilePosition() > startBytePos;
+
+ // shifting to next piece start
+ return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition();
}
public byte[] writeTo( HWPFOutputStream docStream ) throws IOException
Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java?rev=1166144&r1=1166143&r2=1166144&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java Wed Sep 7 12:12:17 2011
@@ -227,6 +227,36 @@ public class TestBugs extends TestCase
}
/**
+ * Bug 44431 - HWPFDocument.write destroys fields
+ */
+ public void test44431_2()
+ {
+ HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
+ WordExtractor extractor1 = new WordExtractor( doc1 );
+
+ assertEquals( "File name=FieldsTest.doc\n" +
+ "\n" +
+ "\n" +
+ "STYLEREF test\n" +
+ "\n" +
+ "\n" +
+ "\n" +
+ "TEST TABLE OF CONTENTS\n" +
+ "\n" +
+ "Heading paragraph in next page\t2\n" +
+ "Another heading paragraph in further page\t3\n" +
+ "Another heading paragraph in further page\t3\n" +
+ "\n" +
+ "\n" +
+ "Heading paragraph in next page\n" +
+ "Another heading paragraph in further page\n" +
+ "\n" +
+ "\n" +
+ "\n" +
+ "Page 3 of 3", extractor1.getText() );
+ }
+
+ /**
* Bug 45473 - HWPF cannot read file after save
*/
public void test45473()
@@ -640,19 +670,20 @@ public class TestBugs extends TestCase
hwpfDocument.write( new ByteArrayOutputStream() );
}
-
-
/**
- * Bug 51678 - Extracting text from Bug51524.zip is slow
- * Bug 51524 - PapBinTable constructor is slow
+ * Bug 51678 - Extracting text from Bug51524.zip is slow Bug 51524 -
+ * PapBinTable constructor is slow
*/
public void test51678And51524()
{
- // YK: the test will run only if the poi.test.remote system property is set.
+ // YK: the test will run only if the poi.test.remote system property is
+ // set.
// TODO: refactor into something nicer!
- if(System.getProperty("poi.test.remote") != null) {
+ if ( System.getProperty( "poi.test.remote" ) != null )
+ {
String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
- HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );
+ HWPFDocument hwpfDocument = HWPFTestDataSamples
+ .openRemoteFile( href );
WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
wordExtractor.getText();
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org