You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2011/02/12 19:47:06 UTC
svn commit: r1070125 -
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Author: lehmi
Date: Sat Feb 12 18:47:06 2011
New Revision: 1070125
URL: http://svn.apache.org/viewvc?rev=1070125&view=rev
Log:
PDFBOX-956: improved suppress duplicates algorithm when extracting text as proposed by Kevin Jackson
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1070125&r1=1070124&r2=1070125&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sat Feb 12 18:47:06 2011
@@ -26,7 +26,11 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.NavigableMap;
+import java.util.NavigableSet;
import java.util.Properties;
+import java.util.TreeMap;
+import java.util.TreeSet;
import java.util.Vector;
import java.util.regex.Pattern;
@@ -155,7 +159,8 @@ public class PDFTextStripper extends PDF
*/
protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
- private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
+ private Map<String, NavigableMap<Float, NavigableSet<Float>>> characterListMapping =
+ new HashMap<String, NavigableMap<Float, NavigableSet<Float>>>();
/**
* encoding that text will be written in (or null).
@@ -880,10 +885,10 @@ public class PDFTextStripper extends PDF
String textCharacter = text.getCharacter();
float textX = text.getX();
float textY = text.getY();
- List<TextPosition> sameTextCharacters = (List<TextPosition>)characterListMapping.get( textCharacter );
+ NavigableMap<Float, NavigableSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
if( sameTextCharacters == null )
{
- sameTextCharacters = new ArrayList<TextPosition>();
+ sameTextCharacters = new TreeMap<Float, NavigableSet<Float>>();
characterListMapping.put( textCharacter, sameTextCharacters );
}
@@ -900,27 +905,29 @@ public class PDFTextStripper extends PDF
//
boolean suppressCharacter = false;
float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
- for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
- {
- TextPosition character = sameTextCharacters.get( i );
- String charCharacter = character.getCharacter();
- float charX = character.getX();
- float charY = character.getY();
- //only want to suppress
-
- if( charCharacter != null &&
- //charCharacter.equals( textCharacter ) &&
- within( charX, textX, tolerance ) &&
- within( charY,
- textY,
- tolerance ) )
+
+ NavigableMap<Float, NavigableSet<Float>> xMatches =
+ sameTextCharacters.subMap(textX - tolerance , false, textX + tolerance , false);
+ for (NavigableSet<Float> xMatch : xMatches.values())
+ {
+ NavigableSet<Float> yMatches =
+ xMatch.subSet(textY - tolerance , false, textY + tolerance , false);
+ if (!yMatches.isEmpty())
{
suppressCharacter = true;
+ break;
}
}
+
if( !suppressCharacter )
{
- sameTextCharacters.add( text );
+ NavigableSet<Float> ySet = sameTextCharacters.get(textX);
+ if (ySet == null)
+ {
+ ySet = new TreeSet<Float>();
+ sameTextCharacters.put( textX, ySet );
+ }
+ ySet.add( textY );
showCharacter = true;
}
}
Re: svn commit: r1070125 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Posted by Jeremias Maerki <de...@jeremias-maerki.ch>.
Andreas,
I'm afraid this breaks compatibility with Java 5: java.util.NavigableMap and
java.util.NavigableSet were only introduced in Java 6. We're still targeting
Java 5, aren't we? README.txt says so, anyway.
Thanks!
On 12.02.2011 19:47:06 lehmi wrote:
> Author: lehmi
> Date: Sat Feb 12 18:47:06 2011
> New Revision: 1070125
>
> URL: http://svn.apache.org/viewvc?rev=1070125&view=rev
> Log:
> PDFBOX-956: improved suppress duplicates algorithm when extracting text as proposed by Kevin Jackson
>
> Modified:
> pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
>
> Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
> URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1070125&r1=1070124&r2=1070125&view=diff
> ==============================================================================
> --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
> +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sat Feb 12 18:47:06 2011
> @@ -26,7 +26,11 @@ import java.util.Iterator;
> import java.util.LinkedList;
> import java.util.List;
> import java.util.Map;
> +import java.util.NavigableMap;
> +import java.util.NavigableSet;
> import java.util.Properties;
> +import java.util.TreeMap;
> +import java.util.TreeSet;
> import java.util.Vector;
> import java.util.regex.Pattern;
>
> @@ -155,7 +159,8 @@ public class PDFTextStripper extends PDF
> */
> protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
>
> - private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
> + private Map<String, NavigableMap<Float, NavigableSet<Float>>> characterListMapping =
> + new HashMap<String, NavigableMap<Float, NavigableSet<Float>>>();
>
> /**
> * encoding that text will be written in (or null).
> @@ -880,10 +885,10 @@ public class PDFTextStripper extends PDF
> String textCharacter = text.getCharacter();
> float textX = text.getX();
> float textY = text.getY();
> - List<TextPosition> sameTextCharacters = (List<TextPosition>)characterListMapping.get( textCharacter );
> + NavigableMap<Float, NavigableSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
> if( sameTextCharacters == null )
> {
> - sameTextCharacters = new ArrayList<TextPosition>();
> + sameTextCharacters = new TreeMap<Float, NavigableSet<Float>>();
> characterListMapping.put( textCharacter, sameTextCharacters );
> }
>
> @@ -900,27 +905,29 @@ public class PDFTextStripper extends PDF
> //
> boolean suppressCharacter = false;
> float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
> - for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
> - {
> - TextPosition character = sameTextCharacters.get( i );
> - String charCharacter = character.getCharacter();
> - float charX = character.getX();
> - float charY = character.getY();
> - //only want to suppress
> -
> - if( charCharacter != null &&
> - //charCharacter.equals( textCharacter ) &&
> - within( charX, textX, tolerance ) &&
> - within( charY,
> - textY,
> - tolerance ) )
> +
> + NavigableMap<Float, NavigableSet<Float>> xMatches =
> + sameTextCharacters.subMap(textX - tolerance , false, textX + tolerance , false);
> + for (NavigableSet<Float> xMatch : xMatches.values())
> + {
> + NavigableSet<Float> yMatches =
> + xMatch.subSet(textY - tolerance , false, textY + tolerance , false);
> + if (!yMatches.isEmpty())
> {
> suppressCharacter = true;
> + break;
> }
> }
> +
> if( !suppressCharacter )
> {
> - sameTextCharacters.add( text );
> + NavigableSet<Float> ySet = sameTextCharacters.get(textX);
> + if (ySet == null)
> + {
> + ySet = new TreeSet<Float>();
> + sameTextCharacters.put( textX, ySet );
> + }
> + ySet.add( textY );
> showCharacter = true;
> }
> }
>
Jeremias Maerki