You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2010/12/12 15:03:22 UTC
svn commit: r1044823 - in /pdfbox/trunk/pdfbox/src:
main/java/org/apache/pdfbox/util/
main/java/org/apache/pdfbox/util/operator/ test/java/org/apache/pdfbox/
test/java/org/apache/pdfbox/util/
Author: lehmi
Date: Sun Dec 12 14:03:21 2010
New Revision: 1044823
URL: http://svn.apache.org/viewvc?rev=1044823&view=rev
Log:
PDFBOX-893: added some improvements concerning text extraction and matrices as suggested by Neil McErlean
Added:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestMatrix.java
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/Matrix.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/Matrix.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/Matrix.java?rev=1044823&r1=1044822&r2=1044823&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/Matrix.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/Matrix.java Sun Dec 12 14:03:21 2010
@@ -26,19 +26,31 @@ import java.awt.geom.AffineTransform;
*/
public class Matrix implements Cloneable
{
- private float[] single =
+ static final float[] default_single =
{
1,0,0,
0,1,0,
0,0,1
};
+ private float[] single;
+
/**
* Constructor.
*/
public Matrix()
{
- //default constructor
+ single = new float[default_single.length];
+ reset();
+ }
+
+ /**
+ * This method resets the numbers in this Matrix to the original values, which are
+ * the values that a newly constructed Matrix would have.
+ */
+ public void reset()
+ {
+ System.arraycopy(default_single, 0, single, 0, default_single.length);
}
/**
@@ -144,22 +156,69 @@ public class Matrix implements Cloneable
*/
public Matrix multiply( Matrix b )
{
- Matrix result = new Matrix();
+ return this.multiply(b, new Matrix());
+ }
- if (b != null && b.single != null)
+ /**
+ * This method multiplies this Matrix with the specified other Matrix, storing the product in the specified
+ * result Matrix. By reusing Matrix instances like this, multiplication chains can be executed without having
+ * to create many temporary Matrix objects.
+ * <p/>
+ * It is allowed to have (other == this) or (result == this) or indeed (other == result) but if this is done,
+ * the backing float[] matrix values may be copied in order to ensure a correct product.
+ *
+ * @param other the second operand Matrix in the multiplication
+ * @param result the Matrix instance into which the result should be stored. If result is null, a new Matrix
+ * instance is created.
+ * @return the product of the two matrices.
+ */
+ public Matrix multiply( Matrix other, Matrix result )
+ {
+ if (result == null)
+ {
+ result = new Matrix();
+ }
+
+ if (other != null && other.single != null)
{
- float[] bMatrix = b.single;
- float[] resultMatrix = result.single;
- resultMatrix[0] = single[0] * bMatrix[0] + single[1] * bMatrix[3] + single[2] * bMatrix[6];
- resultMatrix[1] = single[0] * bMatrix[1] + single[1] * bMatrix[4] + single[2] * bMatrix[7];
- resultMatrix[2] = single[0] * bMatrix[2] + single[1] * bMatrix[5] + single[2] * bMatrix[8];
- resultMatrix[3] = single[3] * bMatrix[0] + single[4] * bMatrix[3] + single[5] * bMatrix[6];
- resultMatrix[4] = single[3] * bMatrix[1] + single[4] * bMatrix[4] + single[5] * bMatrix[7];
- resultMatrix[5] = single[3] * bMatrix[2] + single[4] * bMatrix[5] + single[5] * bMatrix[8];
- resultMatrix[6] = single[6] * bMatrix[0] + single[7] * bMatrix[3] + single[8] * bMatrix[6];
- resultMatrix[7] = single[6] * bMatrix[1] + single[7] * bMatrix[4] + single[8] * bMatrix[7];
- resultMatrix[8] = single[6] * bMatrix[2] + single[7] * bMatrix[5] + single[8] * bMatrix[8];
+ // the operands
+ float[] thisOperand = this.single;
+ float[] otherOperand = other.single;
+
+ // We're multiplying 2 sets of floats together to produce a third, but we allow
+ // any of these float[] instances to be the same objects.
+ // There is the possibility then to overwrite one of the operands with result values
+ // and therefore corrupt the result.
+
+ // If either of these operands are the same float[] instance as the result, then
+ // they need to be copied.
+
+ if (this == result)
+ {
+ final float[] thisOrigVals = new float[this.single.length];
+ System.arraycopy(this.single, 0, thisOrigVals, 0, this.single.length);
+
+ thisOperand = thisOrigVals;
+ }
+ if (other == result)
+ {
+ final float[] otherOrigVals = new float[other.single.length];
+ System.arraycopy(other.single, 0, otherOrigVals, 0, other.single.length);
+
+ otherOperand = otherOrigVals;
+ }
+
+ result.single[0] = thisOperand[0] * otherOperand[0] + thisOperand[1] * otherOperand[3] + thisOperand[2] * otherOperand[6];
+ result.single[1] = thisOperand[0] * otherOperand[1] + thisOperand[1] * otherOperand[4] + thisOperand[2] * otherOperand[7];
+ result.single[2] = thisOperand[0] * otherOperand[2] + thisOperand[1] * otherOperand[5] + thisOperand[2] * otherOperand[8];
+ result.single[3] = thisOperand[3] * otherOperand[0] + thisOperand[4] * otherOperand[3] + thisOperand[5] * otherOperand[6];
+ result.single[4] = thisOperand[3] * otherOperand[1] + thisOperand[4] * otherOperand[4] + thisOperand[5] * otherOperand[7];
+ result.single[5] = thisOperand[3] * otherOperand[2] + thisOperand[4] * otherOperand[5] + thisOperand[5] * otherOperand[8];
+ result.single[6] = thisOperand[6] * otherOperand[0] + thisOperand[7] * otherOperand[3] + thisOperand[8] * otherOperand[6];
+ result.single[7] = thisOperand[6] * otherOperand[1] + thisOperand[7] * otherOperand[4] + thisOperand[8] * otherOperand[7];
+ result.single[8] = thisOperand[6] * otherOperand[2] + thisOperand[7] * otherOperand[5] + thisOperand[8] * otherOperand[8];
}
+
return result;
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=1044823&r1=1044822&r2=1044823&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java Sun Dec 12 14:03:21 2010
@@ -381,6 +381,11 @@ public class PDFStreamEngine
float pageWidth = page.findMediaBox().getWidth();
Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
+ Matrix textXctm = new Matrix();
+ Matrix textMatrixEnd = new Matrix();
+ Matrix td = new Matrix();
+ Matrix tempMatrix = new Matrix();
+
int codeLength = 1;
for( int i=0; i<string.length; i+=codeLength)
{
@@ -434,14 +439,17 @@ public class PDFStreamEngine
{
spacingText += wordSpacingText;
}
+ textXctm = textMatrix.multiply(ctm, textXctm);
// Convert textMatrix to display units
- Matrix textMatrixStart = textStateParameters.multiply(textMatrix).multiply(ctm);
+ // We need to instantiate a new Matrix instance here as it is passed to the TextPosition constructor below.
+ Matrix textMatrixStart = textStateParameters.multiply(textXctm);
// TODO : tx should be set for horizontal text and ty for vertical text
// which seems to be specified in the font (not the direction in the matrix).
float tx = ((characterHorizontalDisplacementText)*fontSizeText)*horizontalScalingText;
float ty = 0;
- Matrix td = new Matrix();
+ // reset the matrix instead of creating a new one
+ td.reset();
td.setValue( 2, 0, tx );
td.setValue( 2, 1, ty );
@@ -450,16 +458,20 @@ public class PDFStreamEngine
// textMatrixEnd contains the coordinates of the end of the last glyph without
// taking characterSpacingText and spacingText into account, otherwise it'll be
// impossible to detect new words within text extraction
- Matrix textMatrixEnd = textStateParameters.multiply(td).multiply(textMatrix).multiply(ctm);
+ tempMatrix = textStateParameters.multiply(td, tempMatrix);
+ textMatrixEnd = tempMatrix.multiply(textXctm, textMatrixEnd);
+ final float endXPosition = textMatrixEnd.getXPosition();
+ final float endYPosition = textMatrixEnd.getYPosition();
// add some spacing to the text matrix (see comment above)
tx = ((characterHorizontalDisplacementText)*fontSizeText+characterSpacingText+spacingText)*horizontalScalingText;
td.setValue( 2, 0, tx );
- textMatrix = td.multiply( textMatrix );
+ textMatrix = td.multiply(textMatrix, textMatrix );
// determine the width of this character
// XXX: Note that if we handled vertical text, we should be using Y here
- float widthText = textMatrixEnd.getXPosition() - textMatrixStart.getXPosition();
+ float startXPosition = textMatrixStart.getXPosition();
+ float widthText = endXPosition - startXPosition;
//there are several cases where one character code will
//output multiple characters. For example "fi" or a
@@ -485,7 +497,8 @@ public class PDFStreamEngine
pageWidth,
pageHeight,
textMatrixStart,
- textMatrixEnd,
+ endXPosition,
+ endYPosition,
totalVerticalDisplacementDisp,
widthText,
spaceWidthDisp,
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java?rev=1044823&r1=1044822&r2=1044823&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java Sun Dec 12 14:03:21 2010
@@ -127,6 +127,8 @@ public class TextPosition
* @param currentFont The current for for this text position.
* @param fontSizeValue The new font size.
* @param fontSizeInPt The font size in pt units.
+ *
+ * @deprecated Use {@link #TextPosition(int, float, float, Matrix, float, float, float, float, float, String, PDFont, float, int)} instead.
*/
public TextPosition(
int pageRotation,
@@ -143,10 +145,48 @@ public class TextPosition
int fontSizeInPt
)
{
+ this(pageRotation, pageWidth, pageHeight, textPositionSt,
+ textPositionEnd.getXPosition(), textPositionEnd.getYPosition(),
+ maxFontH, individualWidth, spaceWidth, string, currentFont, fontSizeValue, fontSizeInPt);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param pageRotation rotation of the page that the text is located in
+ * @param pageWidth width of the page that the text is located in
+ * @param pageHeight height of the page that the text is located in
+ * @param textPositionSt TextMatrix for start of text (in display units)
+ * @param endX X coordinate of the end position of the text
+ * @param endY Y coordinate of the end position of the text
+ * @param maxFontH Maximum height of text (in display units)
+ * @param individualWidth The width of the given character/string. (in ? units)
+ * @param spaceWidth The width of the space character. (in display units)
+ * @param string The character to be displayed.
+ * @param currentFont The current font for this text position.
+ * @param fontSizeValue The new font size.
+ * @param fontSizeInPt The font size in pt units.
+ */
+ public TextPosition(
+ int pageRotation,
+ float pageWidth,
+ float pageHeight,
+ Matrix textPositionSt,
+ float endX,
+ float endY,
+ float maxFontH,
+ float individualWidth,
+ float spaceWidth,
+ String string,
+ PDFont currentFont,
+ float fontSizeValue,
+ int fontSizeInPt
+ )
+ {
this.textPos = textPositionSt;
- this.endX = textPositionEnd.getXPosition();
- this.endY = textPositionEnd.getYPosition();
+ this.endX = endX;
+ this.endY = endY;
this.rot = pageRotation;
// make sure it is 0 to 270 and no negative numbers
@@ -166,6 +206,7 @@ public class TextPosition
this.fontSize = fontSizeValue;
this.fontSizePt = fontSizeInPt;
}
+
/**
* Return the string of characters stored in this object.
*
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java?rev=1044823&r1=1044822&r2=1044823&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java Sun Dec 12 14:03:21 2010
@@ -55,7 +55,7 @@ public class ShowTextGlyph extends Opera
adjustment=-(adjustment/1000)*horizontalScaling*fontsize;
// TODO vertical writing mode
adjMatrix.setValue( 2, 0, adjustment );
- context.setTextMatrix( adjMatrix.multiply(context.getTextMatrix()) );
+ context.setTextMatrix( adjMatrix.multiply(context.getTextMatrix(), adjMatrix) );
}
else if( next instanceof COSString )
{
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java?rev=1044823&r1=1044822&r2=1044823&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java Sun Dec 12 14:03:21 2010
@@ -27,6 +27,7 @@ import org.apache.pdfbox.pdmodel.TestPDD
import org.apache.pdfbox.pdmodel.TestPDDocumentInformation;
import org.apache.pdfbox.pdmodel.interactive.form.TestFields;
import org.apache.pdfbox.util.TestDateUtil;
+import org.apache.pdfbox.util.TestMatrix;
/**
* This is a holder for all test cases in the pdfbox system.
@@ -67,6 +68,7 @@ public class TestAll extends TestCase
{
TestSuite suite = new TestSuite();
suite.addTest( TestDateUtil.suite() );
+ suite.addTest( TestMatrix.suite() );
suite.addTestSuite( TestFilters.class );
suite.addTest( TestFDF.suite() );
suite.addTest( TestFields.suite() );
Added: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestMatrix.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestMatrix.java?rev=1044823&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestMatrix.java (added)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestMatrix.java Sun Dec 12 14:03:21 2010
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util;
+
+import java.io.IOException;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+/**
+ * Test the {@link Matrix} class.
+ * @author Neil McErlean
+ * @since 1.4.0
+ */
+public class TestMatrix extends TestCase
+{
+ /**
+ * Test class constructor.
+ *
+ * @param name The name of the test class.
+ *
+ * @throws IOException If there is an error creating the test.
+ */
+ public TestMatrix( String name ) throws IOException
+ {
+ super( name );
+ }
+
+ public void testConstructionAndCopy() throws Exception
+ {
+ Matrix m1 = new Matrix();
+ assertMatrixIsPristine(m1);
+
+ Matrix m2 = m1.copy();
+ assertNotSame(m1, m2);
+ assertMatrixIsPristine(m2);
+ }
+
+ public void testMultiplication() throws Exception
+ {
+ // This matrix will not change - we use it to drive the various multiplications.
+ final Matrix testMatrix = new Matrix();
+
+ // Create matrix with values
+ // [ 0, 1, 2
+ // 1, 2, 3
+ // 2, 3, 4]
+ for (int x = 0; x < 3; x++)
+ {
+ for (int y = 0; y < 3; y++)
+ {
+ testMatrix.setValue(x, y, x + y);
+ }
+ }
+
+ Matrix m1 = testMatrix.copy();
+ Matrix m2 = testMatrix.copy();
+
+ // Multiply two matrices together producing a new result matrix.
+ Matrix product = m1.multiply(m2);
+
+ assertNotSame(m1, product);
+ assertNotSame(m2, product);
+
+ // Operand 1 should not have changed
+ assertMatrixValuesEqualTo(new float[] {0, 1, 2,
+ 1, 2, 3,
+ 2, 3, 4}, m1);
+ // Operand 2 should not have changed
+ assertMatrixValuesEqualTo(new float[] {0, 1, 2,
+ 1, 2, 3,
+ 2, 3, 4}, m2);
+ assertMatrixValuesEqualTo(new float[] {5, 8, 11,
+ 8, 14, 20,
+ 11, 20, 29}, product);
+ product.reset();
+ assertMatrixIsPristine(product);
+
+
+
+ // Multiply two matrices together with the result being written to a third matrix
+ // (Any existing values there will be overwritten).
+ Matrix resultMatrix = new Matrix();
+
+ Matrix retVal = m1.multiply(m2, resultMatrix);
+ assertSame(retVal, resultMatrix);
+ // Operand 1 should not have changed
+ assertMatrixValuesEqualTo(new float[] {0, 1, 2,
+ 1, 2, 3,
+ 2, 3, 4}, m1);
+ // Operand 2 should not have changed
+ assertMatrixValuesEqualTo(new float[] {0, 1, 2,
+ 1, 2, 3,
+ 2, 3, 4}, m2);
+ assertMatrixValuesEqualTo(new float[] {5, 8, 11,
+ 8, 14, 20,
+ 11, 20, 29}, resultMatrix);
+
+
+
+ // Multiply two matrices together with the result being written into the other matrix
+ retVal = m1.multiply(m2, m2);
+ assertSame(retVal, m2);
+ // Operand 1 should not have changed
+ assertMatrixValuesEqualTo(new float[] {0, 1, 2,
+ 1, 2, 3,
+ 2, 3, 4}, m1);
+ assertMatrixValuesEqualTo(new float[] {5, 8, 11,
+ 8, 14, 20,
+ 11, 20, 29}, retVal);
+
+
+
+ // Multiply two matrices together with the result being written into 'this' matrix
+ m1 = testMatrix.copy();
+ m2 = testMatrix.copy();
+
+ retVal = m1.multiply(m2, m1);
+ assertSame(retVal, m1);
+ // Operand 2 should not have changed
+ assertMatrixValuesEqualTo(new float[] {0, 1, 2,
+ 1, 2, 3,
+ 2, 3, 4}, m2);
+ assertMatrixValuesEqualTo(new float[] {5, 8, 11,
+ 8, 14, 20,
+ 11, 20, 29}, retVal);
+
+
+
+ // Multiply the same matrix with itself with the result being written into 'this' matrix
+ m1 = testMatrix.copy();
+
+ retVal = m1.multiply(m1, m1);
+ assertSame(retVal, m1);
+ assertMatrixValuesEqualTo(new float[] {5, 8, 11,
+ 8, 14, 20,
+ 11, 20, 29}, retVal);
+ }
+
+ /**
+ * This method asserts that the matrix values for the given {@link Matrix} object are equal
+ * to the pristine, or original, values.
+ * @param m the Matrix to test.
+ */
+ private void assertMatrixIsPristine(Matrix m)
+ {
+ assertMatrixValuesEqualTo(new float[] {1 ,0 ,0,
+ 0, 1, 0,
+ 0, 0, 1}, m);
+ }
+
+ /**
+ * This method asserts that the matrix values for the given {@link Matrix} object have
+ * the specified values.
+ * @param values the expected values
+ * @param m the matrix to test
+ */
+ private void assertMatrixValuesEqualTo(float[] values, Matrix m) {
+ float delta = 0.00001f;
+ for (int i = 0; i < values.length; i++)
+ {
+ // Need to convert a straight index into a (row, column) co-ordinate.
+ int row = (int)Math.floor(i / 3);
+ int column = i % 3;
+ StringBuilder failureMsg = new StringBuilder();
+ failureMsg.append("Incorrect value for matrix[")
+ .append(row).append(",").append(column).append("]");
+ assertEquals(failureMsg.toString(), values[i], m.getValue(row, column), delta);
+ }
+ }
+
+ /**
+ * Set the tests in the suite for this test class.
+ *
+ * @return the Suite.
+ */
+ public static Test suite()
+ {
+ return new TestSuite( TestMatrix.class );
+ }
+
+ /**
+ * Command line execution.
+ *
+ * @param args Command line arguments.
+ */
+ public static void main( String[] args )
+ {
+ String[] arg = {TestMatrix.class.getName() };
+ junit.textui.TestRunner.main( arg );
+ }
+}