You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ca...@apache.org on 2009/01/13 16:17:32 UTC
svn commit: r734151 - in /incubator/pdfbox/trunk: ./ external/
src/main/java/org/apache/pdfbox/examples/util/
src/main/java/org/apache/pdfbox/pdfviewer/
src/main/java/org/apache/pdfbox/util/
src/main/java/org/apache/pdfbox/util/operator/ test/input/
Author: carrier
Date: Tue Jan 13 07:17:01 2009
New Revision: 734151
URL: http://svn.apache.org/viewvc?rev=734151&view=rev
Log:
Fix for PDFBOX-377 along with cleanup to make method names more consistent
Added:
incubator/pdfbox/trunk/external/icu4j-4_0.jar (with props)
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java (with props)
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java (with props)
Modified:
incubator/pdfbox/trunk/README.txt
incubator/pdfbox/trunk/build.xml
incubator/pdfbox/trunk/pom.xml
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
incubator/pdfbox/trunk/test/input/10101-AR.pdf.txt
incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
incubator/pdfbox/trunk/test/input/cweb.pdf.txt
Modified: incubator/pdfbox/trunk/README.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/README.txt?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/README.txt (original)
+++ incubator/pdfbox/trunk/README.txt Tue Jan 13 07:17:01 2009
@@ -30,6 +30,15 @@
in your classpath. The easiest solution is to simply include the
apache-pdfbox-x.x.x.jar in your classpath.
+3. You get text that has the correct characters, but in the wrong
+ order. This might be because you have not enabled sorting. The text
+ in PDF files is stored in chunks and the chunks do not need to be stored
+ in the order that they are displayed on a page. By default, PDFBox does
+ not sort the text. Also, if you have text in a language that reads right to left
+ (such as Arabic or Hebrew), make sure you have the ICU4J jar file in your
+ classpath. This library is needed to properly handle right to left text.
+
+
See the issue tracker at https://issues.apache.org/jira/browse/PDFBOX for
the full list of known issues and requested features.
Modified: incubator/pdfbox/trunk/build.xml
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/build.xml?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/build.xml (original)
+++ incubator/pdfbox/trunk/build.xml Tue Jan 13 07:17:01 2009
@@ -69,6 +69,7 @@
<property name="jai_core.jar" value="${jar.dir}/${jai_core.name}.jar" />
<property name="junit.jar" value="${jar.dir}/junit.jar" />
+ <property name="icu4j.jar" value="${jar.dir}/icu4j-4_0.jar" />
<property name="testoutput.dir" value="test/output"/>
@@ -116,6 +117,7 @@
<junit printsummary="off" fork="on" dir=".">
<sysproperty key="org.apache.pdfbox.util.TextStripper.file" value="${file}"/>
<classpath>
+ <pathelement path="${junit.jar}" />
<pathelement path="${ant.jar}" />
<pathelement path="${lucene.jar}" />
<pathelement path="${lucene-demo.jar}" />
@@ -125,6 +127,7 @@
<pathelement path="${bcmail.jar}" />
<pathelement path="${jai_codec.jar}" />
<pathelement path="${jai_core.jar}" />
+ <pathelement path="${icu4j.jar}" />
<pathelement path="${dest.dir}"/>
<pathelement path="${resources.dir}" />
</classpath>
@@ -140,6 +143,7 @@
<sysproperty key="org.apache.pdfbox.util.TextStripper.file" value="${file}"/>
<classpath>
+ <pathelement path="${junit.jar}" />
<pathelement path="${ant.jar}" />
<pathelement path="${lucene.jar}" />
<pathelement path="${lucene-demo.jar}" />
@@ -149,6 +153,7 @@
<pathelement path="${bcmail.jar}" />
<pathelement path="${jai_codec.jar}" />
<pathelement path="${jai_core.jar}" />
+ <pathelement path="${icu4j.jar}" />
<pathelement path="${dest.dir}"/>
<pathelement path="${resources.dir}" />
</classpath>
@@ -160,6 +165,7 @@
<target name="test-junit" depends="clean,compile" description="run junit tests">
<junit printsummary="off" fork="on" dir=".">
<classpath>
+ <pathelement path="${junit.jar}" />
<pathelement path="${ant.jar}" />
<pathelement path="${lucene.jar}" />
<pathelement path="${lucene-demo.jar}" />
@@ -286,6 +292,7 @@
<include name="${jai_codec.jar}" />
<include name="${jai_core.jar}" />
<include name="${checkstyle.jar}" />
+ <include name="${icu4j.jar}" />
<include name="${bin.dir}/**/*" />
<include name="pom.xml" />
<include name="build.xml" />
Added: incubator/pdfbox/trunk/external/icu4j-4_0.jar
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/external/icu4j-4_0.jar?rev=734151&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/pdfbox/trunk/external/icu4j-4_0.jar
------------------------------------------------------------------------------
svn:executable = *
Propchange: incubator/pdfbox/trunk/external/icu4j-4_0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: incubator/pdfbox/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/pom.xml?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/pom.xml (original)
+++ incubator/pdfbox/trunk/pom.xml Tue Jan 13 07:17:01 2009
@@ -92,6 +92,13 @@
</dependency>
<dependency>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j</artifactId>
+ <version>3.8</version>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.5</version>
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java Tue Jan 13 07:17:01 2009
@@ -105,11 +105,11 @@
/**
* A method provided as an event interface to allow a subclass to perform
- * some specific functionality when a character needs to be displayed.
+ * some specific functionality when text needs to be processed
*
- * @param text The character to be displayed.
+ * @param text The text to be processed
*/
- protected void showCharacter( TextPosition text )
+ protected void processTextPosition( TextPosition text )
{
System.out.println( "String[" + text.getX() + "," +
text.getY() + " fs=" + text.getFontSize() + " xscale=" +
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java Tue Jan 13 07:17:01 2009
@@ -130,11 +130,11 @@
/**
* You should override this method if you want to perform an action when a
- * string is being shown.
+ * text is being processed.
*
- * @param text The string to display.
+ * @param text The text to process
*/
- protected void showCharacter( TextPosition text )
+ protected void processTextPosition( TextPosition text )
{
//should use colorspaces for the font color but for now assume that
//the font color is black
Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java?rev=734151&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java Tue Jan 13 07:17:01 2009
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util;
+
+import com.ibm.icu.text.Bidi;
+import com.ibm.icu.text.Normalizer;
+
+/**
+ * This class wraps the ICU4J Bidi and Normalizer classes. TextNormalize
+ * will call this only if the ICU4J library exists in the classpath.
+ */
+public class ICU4JImpl {
+ Bidi bidi;
+
+ public ICU4JImpl() {
+ bidi = new Bidi();
+
+ /* We do not use bidi.setInverse() because that uses
+ * Bidi.REORDER_INVERSE_NUMBERS_AS_L, which caused problems
+ * in some test files. For example, a file had a line of:
+ * 0 1 / ARABIC
+ * and the 0 and 1 were reversed in the end result.
+ * REORDER_INVERSE_LIKE_DIRECT is the inverse Bidi mode
+ * that more closely reflects the Unicode spec.
+ */
+ bidi.setReorderingMode(Bidi.REORDER_INVERSE_LIKE_DIRECT);
+ }
+
+ /**
+ * Takes a line of text in presentation order and converts it to logical order.
+ * @see TextNormalize#makeLineLogicalOrder(String, boolean)
+ *
+ */
+ public String makeLineLogicalOrder(String a_str, boolean a_isRtlDominant) {
+ bidi.setPara(a_str, a_isRtlDominant?Bidi.RTL:Bidi.LTR, null);
+
+ /* Set the mirror flag so that parentheses and other mirror symbols
+ * are properly reversed, when needed. With this removed, lines
+ * such as (CBA) in the PDF file will come out like )ABC( in logical
+ * order.
+ */
+ return bidi.writeReordered(Bidi.DO_MIRRORING);
+ }
+
+ /**
+ * Normalize presentation forms of characters to their separate parts.
+ * @see TextNormalize#normalizePres(String)
+ *
+ * @param a_str String to normalize
+ * @return Normalized form
+ */
+ public String normalizePres(String a_str) {
+ String retStr = "";
+ for (int i = 0; i < a_str.length(); i++) {
+ /* We only normalize if the codepoint is in a given range. Otherwise,
+ * NFKC converts too many things that would cause confusion. For example,
+ * it converts the micro symbol in extended latin to the value in the greek
+ * script.
+ */
+ if (((a_str.codePointAt(i) >= 0xFB00) && (a_str.codePointAt(i) <= 0xFDFF)) ||
+ ((a_str.codePointAt(i) >= 0xFE70) && (a_str.codePointAt(i) <= 0xFEFF))) {
+ retStr += Normalizer.normalize(a_str.charAt(i), Normalizer.NFKC);
+ }
+ else {
+ retStr += a_str.charAt(i);
+ }
+ }
+ return retStr;
+ }
+}
Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java
------------------------------------------------------------------------------
svn:executable = *
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java Tue Jan 13 07:17:01 2009
@@ -224,26 +224,44 @@
}
/**
+ * @deprecated Use {@link #processTextPosition(TextPosition)} instead.
+ * @see #processTextPosition(TextPosition)
+ */
+ protected void showCharacter( TextPosition text )
+ {
+ processTextPosition(text);
+ }
+
+ /**
* A method provided as an event interface to allow a subclass to perform
- * some specific functionality when a character needs to be displayed.
+ * some specific functionality when text needs to be processed.
*
- * @param text The character to be displayed.
+ * @param text The text to be processed.
*/
- protected void showCharacter( TextPosition text )
+ protected void processTextPosition( TextPosition text )
{
//subclasses can override to provide specific functionality.
}
+
+ /**
+ * @deprecated Use {@link #processEncodedText(byte[])} instead.
+ * @see #processEncodedText(byte[])
+ */
+ public void showString( byte[] string ) throws IOException {
+ processEncodedText(string);
+ }
+
/**
- * You should override this method if you want to perform an action when a
- * string is being shown.
+ * Process encoded text from the PDF Stream.
+ * You should override this method if you want to perform an action when
+ * encoded text is being processed.
*
- * @param string The string to display.
+ * @param string The encoded text
*
- * @throws IOException If there is an error showing the string
+ * @throws IOException If there is an error processing the string
*/
-
- public void showString( byte[] string ) throws IOException
+ public void processEncodedText( byte[] string ) throws IOException
{
/* Note on variable names. There are three different units being used
* in this code. Character sizes are given in glyph units, text locations
@@ -425,7 +443,8 @@
// convert textMatrix at the end of the string to display units
Matrix textMatrixEndDisp = initialMatrix.multiply( textMatrix ).multiply( ctm );
- showCharacter(
+ // process the decoded text
+ processTextPosition(
new TextPosition(
page,
textMatrixStDisp,
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Tue Jan 13 07:17:01 2009
@@ -68,7 +68,7 @@
buf.append("</title>");
buf.append("</head>");
buf.append("<body>\n");
- getOutput().write(buf.toString());
+ super.writeString(buf.toString());
}
/**
@@ -82,9 +82,17 @@
}
/**
+ * @deprecated
* {@inheritDoc}
*/
- protected void flushText() throws IOException
+ protected void flushText() throws IOException {
+ writePage();
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ protected void writePage() throws IOException
{
Iterator textIter = getCharactersByArticle().iterator();
@@ -94,7 +102,7 @@
writeHeader();
onFirstPage = false;
}
- super.flushText();
+ super.writePage();
}
/**
@@ -102,7 +110,7 @@
*/
public void endDocument(PDDocument pdf) throws IOException
{
- output.write("</body></html>");
+ super.writeString("</body></html>");
}
/**
@@ -170,7 +178,7 @@
{
if (! suppressParagraphs)
{
- getOutput().write("<p>");
+ super.writeString("<p>");
}
}
/**
@@ -182,58 +190,65 @@
{
if (! suppressParagraphs)
{
- getOutput().write("</p>");
+ super.writeString("</p>");
}
}
/**
- * {@inheritDoc}
+ * Write a string to the output stream and escape some HTML characters
*/
- protected void writeCharacters(TextPosition position ) throws IOException
+ protected void writeString(String chars) throws IOException
{
- if (position == beginTitle)
- {
- output.write("<H1>");
- suppressParagraphs = true;
- }
- if (position == afterEndTitle)
- {
- output.write("</H1>"); // end title and start first paragraph
- suppressParagraphs = false;
- }
-
- String chars = position.getCharacter();
-
for (int i = 0; i < chars.length(); i++)
{
char c = chars.charAt(i);
if ((c < 32) || (c > 126))
{
int charAsInt = c;
- output.write("&#" + charAsInt + ";");
+ super.writeString("&#" + charAsInt + ";");
}
else
{
switch (c)
{
- case 34:
- output.write(""");
- break;
- case 38:
- output.write("&");
- break;
- case 60:
- output.write("<");
- break;
- case 62:
- output.write(">");
- break;
- default:
- output.write(c);
+ case 34:
+ super.writeString(""");
+ break;
+ case 38:
+ super.writeString("&");
+ break;
+ case 60:
+ super.writeString("<");
+ break;
+ case 62:
+ super.writeString(">");
+ break;
+ default:
+ super.writeString(String.valueOf(c));
}
}
}
}
+
+ /**
+ * {@inheritDoc}
+ */
+ protected void writeCharacters(TextPosition position ) throws IOException
+ {
+ if (position == beginTitle)
+ {
+ super.writeString("<H1>");
+ suppressParagraphs = true;
+ }
+ if (position == afterEndTitle)
+ {
+ super.writeString("</H1>"); // end title and start first paragraph
+ suppressParagraphs = false;
+ }
+
+ writeString(position.getCharacter());
+ }
+
/**
* @return Returns the suppressParagraphs.
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Tue Jan 13 07:17:01 2009
@@ -46,6 +46,10 @@
* formatting and such. Please note; it is up to clients of this class to verify that
* a specific user has the correct permissions to extract text from the
* PDF document.
+ *
+ * The basic flow of this process is that we get a document and use a series of
+ * processXXX() functions that work on smaller and smaller chunks of the page.
+ * Eventually, we fully process each page and then print it.
*
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @version $Revision: 1.70 $
@@ -94,6 +98,12 @@
protected Writer output;
/**
+ * The normalizer is used to remove text ligatures/presentation forms
+ * and to correct the direction of right to left text, such as Arabic and Hebrew.
+ */
+ private TextNormalize normalize = null;
+
+ /**
* Instantiate a new PDFTextStripper object. This object will load properties from
* Resources/PDFTextStripper.properties.
* @throws IOException If there is an error loading the properties.
@@ -318,7 +328,7 @@
characterListMapping.clear();
processStream( page, page.findResources(), content );
- flushText();
+ writePage();
endPage( page );
}
@@ -373,11 +383,22 @@
}
/**
- * This will print the text to the output stream.
+ * @deprecated Use {@link #writePage()} instead.
+ * @see #writePage()
+ */
+ protected void flushText() throws IOException {
+ writePage();
+ }
+
+ /**
+ * This will print the text of the processed page to "output".
+ * It will estimate, based on the coordinates of the text, where
+ * newlines and word spacings should be placed. The text will be
+ * sorted only if that feature was enabled.
*
* @throws IOException If there is an error writing the text.
*/
- protected void flushText() throws IOException
+ protected void writePage() throws IOException
{
float maxYForLine = -1;
float minYTopForLine = Float.MAX_VALUE;
@@ -389,6 +410,11 @@
float maxHeightForLine = -1;
//float lastHeightForLine = -1;
TextPosition lastPosition = null;
+
+ if (normalize == null) {
+ normalize = new TextNormalize();
+ }
+
for( int i = 0; i < charactersByArticle.size(); i++)
{
startParagraph();
@@ -400,6 +426,63 @@
}
Iterator textIter = textList.iterator();
+
+ /* Before we can display the text, we need to do some normalizing.
+ * Arabic and Hebrew text is right to left and is typically stored
+ * in its logical format, which means that the rightmost character is
+ * stored first, followed by the second character from the right etc.
+ * However, PDF stores the text in presentation form, which is left to
+ * right. We need to do some normalization to convert the PDF data to
+ * the proper logical output format.
+ *
+ * Note that if we did not sort the text, then the output of reversing the
+ * text is undefined and can sometimes produce worse output than not trying
+ * to reverse the order. Sorting should be done for these languages.
+ * */
+
+ /* First step is to determine if we have any right to left text, and
+ * if so, is it dominant. */
+ int ltrCnt = 0;
+ int rtlCnt = 0;
+
+ while( textIter.hasNext() )
+ {
+ TextPosition position = (TextPosition)textIter.next();
+ String stringValue = position.getCharacter();
+
+ for (int a = 0; a < stringValue.length(); a++) {
+ byte dir = Character.getDirectionality(stringValue.charAt(a));
+ if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) ||
+ (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) ||
+ (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE )) {
+ ltrCnt++;
+ }
+ else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ) ||
+ (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) ||
+ (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) ||
+ (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE )) {
+ rtlCnt++;
+ }
+ }
+ }
+
+ // choose the dominant direction
+ boolean isRtlDominant = false;
+ if (rtlCnt > ltrCnt) {
+ isRtlDominant = true;
+ }
+
+ // we will later use this to skip reordering
+ boolean hasRtl = false;
+ if (rtlCnt > 0)
+ hasRtl = true;
+
+ /* Now cycle through to print the text.
+ * We queue up a line at a time before we print so that we can convert
+ * the line from presentation form to logical form (if needed). */
+ String lineStr = "";
+
+ textIter = textList.iterator(); // start from the beginning again
while( textIter.hasNext() )
{
TextPosition position = (TextPosition)textIter.next();
@@ -501,7 +584,22 @@
if( ( !overlap( positionY, positionHeight, maxYForLine, maxHeightForLine ) ))
//maxYForLine - minYTopForLine)))
{
- processLineSeparator( position );
+ // If we have RTL text on the page, change the direction
+ if (hasRtl)
+ lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
+
+ /* normalize string to remove presentation forms.
+ * Note that this must come after the line direction
+ * conversion because the process looks ahead to the next
+ * logical character.
+ */
+ lineStr = normalize.normalizePres(lineStr);
+
+ writeString(lineStr);
+ lineStr = "";
+
+ writeLineSeparator( );
+
endOfLastTextX = -1;
expectedStartOfNextWordX = -1;
maxYForLine = -1;
@@ -512,12 +610,12 @@
}
- if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX < positionX &&
- //only bother adding a space if the last character was not a space
- lastPosition.getCharacter() != null &&
- !lastPosition.getCharacter().endsWith( " " ) )
- {
- processWordSeparator( lastPosition, position );
+ if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX < positionX &&
+ //only bother adding a space if the last character was not a space
+ lastPosition.getCharacter() != null &&
+ !lastPosition.getCharacter().endsWith( " " ) )
+ {
+ lineStr += getWordSeparator();
}
else
{
@@ -535,9 +633,10 @@
endOfLastTextX = positionX + positionWidth;
//endOfLastTextY = positionY;
+ // add it to the list
if (characterValue != null)
{
- writeCharacters( position );
+ lineStr += characterValue;
}
else
{
@@ -549,36 +648,81 @@
//lastHeightForLine = position.getHeight();
lastWordSpacing = wordSpacing;
}
+
+ // print the final line
+ if (lineStr.length() > 0) {
+ if (hasRtl)
+ lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
+
+ // normalize string to remove presentation forms
+ lineStr = normalize.normalizePres(lineStr);
+
+ writeString(lineStr);
+ }
+
endParagraph();
}
+ writePageSeperator();;
+ }
+ private boolean overlap( float y1, float height1, float y2, float height2 )
+ {
+ return within( y1, y2, .1f) || (y2 <= y1 && y2 >= y1-height1) ||
+ (y1 <= y2 && y1 >= y2-height2);
+ }
+
+ /**
+ * Write the page separator value to the output stream
+ * @throws IOException
+ */
+ protected void writePageSeperator() throws IOException
+ {
// RDD - newline at end of flush - required for end of page (so that the top
// of the next page starts on its own line.
//
output.write(getPageSeparator());
-
output.flush();
}
-
- private boolean overlap( float y1, float height1, float y2, float height2 )
+
+ /**
+ * Write the line separator value to the output stream
+ * @throws IOException
+ */
+ protected void writeLineSeparator( ) throws IOException
{
- return within( y1, y2, .1f) || (y2 <= y1 && y2 >= y1-height1) ||
- (y1 <= y2 && y1 >= y2-height2);
+ output.write(getLineSeparator());
}
-
+
+ /**
+ * @deprecated Use {@link #writeLineSeparator()} instead.
+ * @see #writeLineSeparator()
+ */
protected void processLineSeparator( TextPosition currentText ) throws IOException
{
- output.write(getLineSeparator());
+ writeLineSeparator();
}
- protected void processWordSeparator( TextPosition lastText, TextPosition currentText ) throws IOException
+ /**
+ * Write the word separator value to the output stream
+ * @throws IOException
+ */
+ protected void writeWordSeparator() throws IOException
{
output.write(getWordSeparator());
}
+
+ /**
+ * @deprecated Use {@link #writeWordSeparator()} instead.
+ * @see #writeWordSeparator()
+ */
+ protected void processWordSeparator( TextPosition lastText, TextPosition currentText ) throws IOException
+ {
+ writeWordSeparator();
+ }
/**
- * Write the string to the output stream.
+ * Write the string in TextPosition to the output stream.
*
* @param text The text to write to the stream.
* @throws IOException If there is an error when writing the text.
@@ -587,6 +731,17 @@
{
output.write( text.getCharacter() );
}
+
+ /**
+ * Write a Java string to the output stream.
+ *
+ * @param text The text to write to the stream.
+ * @throws IOException If there is an error when writing the text.
+ */
+ protected void writeString( String text ) throws IOException
+ {
+ output.write( text );
+ }
/**
* This will determine of two floating point numbers are within a specified variance.
@@ -600,14 +755,25 @@
return second > first - variance && second < first + variance;
}
+
/**
- * This will show add a character to the list of characters to be printed to
- * the text file.
- *
- * @param text The description of the character to display.
+ * @deprecated
+ * {@inheritDoc}
*/
protected void showCharacter( TextPosition text )
{
+ processTextPosition(text);
+ }
+
+ /**
+ * This will process a TextPosition object and add the
+ * text to the list of characters on a page. It takes care of
+ * overlapping text.
+ *
+ * @param text The text to process.
+ */
+ protected void processTextPosition( TextPosition text )
+ {
boolean showCharacter = true;
if( suppressDuplicateOverlappingText )
{
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java Tue Jan 13 07:17:01 2009
@@ -116,9 +116,17 @@
}
/**
+ * @deprecated
* {@inheritDoc}
*/
- protected void showCharacter( TextPosition text )
+ protected void showCharacter( TextPosition text ) {
+ processTextPosition(text);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ protected void processTextPosition( TextPosition text )
{
Iterator regionIter = regionArea.keySet().iterator();
while( regionIter.hasNext() )
@@ -128,17 +136,26 @@
if( rect.contains( text.getX(), text.getY() ) )
{
charactersByArticle = (Vector)regionCharacterList.get( region );
- super.showCharacter( text );
+ super.processTextPosition( text );
}
}
}
+
+ /**
+ * @deprecated
+ * {@inheritDoc}
+ */
+ protected void flushText() throws IOException {
+ writePage();
+ }
+
/**
- * This will print the text to the output stream.
+ * This will print the processed page text to the output stream.
*
* @throws IOException If there is an error writing the text.
*/
- protected void flushText() throws IOException
+ protected void writePage() throws IOException
{
Iterator regionIter = regionArea.keySet().iterator();
while( regionIter.hasNext() )
@@ -146,7 +163,7 @@
String region = (String)regionIter.next();
charactersByArticle = (Vector)regionCharacterList.get( region );
output = (StringWriter)regionText.get( region );
- super.flushText();
+ super.writePage();
}
}
}
Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java?rev=734151&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java Tue Jan 13 07:17:01 2009
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util;
+
+/**
+ * This class allows a caller to normalize text in various ways.
+ * It will load the ICU4J jar file if it is defined on the classpath.
+ *
+ */
+public class TextNormalize {
+ private ICU4JImpl icu4j = null;
+
+ public TextNormalize() {
+ findICU4J();
+ }
+
+
+ private void findICU4J() {
+ // see if we can load the icu4j classes from the classpath
+ try {
+ this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Bidi");
+ this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Normalizer");
+ icu4j = new ICU4JImpl();
+ } catch (ClassNotFoundException e) {
+ icu4j = null;
+ }
+ }
+
+
+ /**
+ * Takes a line of text in presentation order and converts it to logical order.
+ * For most text other than Arabic and Hebrew, the presentation and logical
+ * orders are the same. However, for Arabic and Hebrew, they are different and
+ * if the text involves both RTL and LTR text then the Unicode BIDI algorithm
+ * must be used to determine how to map between them.
+ *
+ * @param a_str Presentation form of line to convert (i.e. left most char is first char)
+ * @param a_isRtlDominant true if the PAGE has a dominant right to left ordering
+ * @return Logical form of string (or original string if ICU4J library is not on classpath)
+ */
+ public String makeLineLogicalOrder(String a_str, boolean a_isRtlDominant) {
+ if (icu4j != null) {
+ return icu4j.makeLineLogicalOrder(a_str, a_isRtlDominant);
+ }
+ else {
+ return a_str;
+ }
+ }
+
+ /**
+ * Normalize the presentation forms of characters in the string.
+ * For example, convert the single "fi" ligature to "f" and "i".
+ *
+ * @param a_str String to normalize
+ * @return Normalized string (or original string if ICU4J library is not on classpath)
+ */
+ public String normalizePres(String a_str) {
+ if (icu4j != null) {
+ return icu4j.normalizePres(a_str);
+ }
+ else {
+ return a_str;
+ }
+ }
+}
Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java
------------------------------------------------------------------------------
svn:executable = *
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java Tue Jan 13 07:17:01 2009
@@ -47,7 +47,7 @@
public void process(PDFOperator operator, List arguments) throws IOException
{
COSString string = (COSString)arguments.get( 0 );
- context.showString( string.getBytes() );
+ context.processEncodedText( string.getBytes() );
}
}
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java Tue Jan 13 07:17:01 2009
@@ -63,7 +63,7 @@
}
else if( next instanceof COSString )
{
- context.showString( ((COSString)next).getBytes() );
+ context.processEncodedText( ((COSString)next).getBytes() );
}
else
{
Modified: incubator/pdfbox/trunk/test/input/10101-AR.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/10101-AR.pdf.txt?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
Binary files - no diff available.
Modified: incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
Binary files - no diff available.
Modified: incubator/pdfbox/trunk/test/input/cweb.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/cweb.pdf.txt?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
Binary files - no diff available.
Re: svn commit: r734151 - in /incubator/pdfbox/trunk: ./ external/ src/main/java/org/apache/pdfbox/examples/util/ src/main/java/org/apache/pdfbox/pdfviewer/ src/main/java/org/apache/pdfbox/util/ src/main/java/org/apache/pdfbox/util/operator/ test/input/
Posted by Jeremias Maerki <de...@jeremias-maerki.ch>.
FYI, this change introduced a Java 1.5 dependency in ICU4JImpl.
Please set up your PDFBox working copies so they are compiled using Java
1.4 in your IDEs. Thanks.
On 13.01.2009 16:17:32 carrier wrote:
> Author: carrier
> Date: Tue Jan 13 07:17:01 2009
> New Revision: 734151
>
> URL: http://svn.apache.org/viewvc?rev=734151&view=rev
> Log:
> Fix for PDFBOX-377 along with cleanup to make method names more consistent
>
> Added:
> incubator/pdfbox/trunk/external/icu4j-4_0.jar (with props)
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java (with props)
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java (with props)
> Modified:
> incubator/pdfbox/trunk/README.txt
> incubator/pdfbox/trunk/build.xml
> incubator/pdfbox/trunk/pom.xml
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java
> incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
> incubator/pdfbox/trunk/test/input/10101-AR.pdf.txt
> incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
> incubator/pdfbox/trunk/test/input/cweb.pdf.txt
>
<snip/>
Jeremias Maerki