You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by Jeremias Maerki <de...@jeremias-maerki.ch> on 2011/02/16 11:44:20 UTC
Re: svn commit: r1070125 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Andreas,
I'm afraid this breaks compatibility with Java 5: java.util.NavigableMap
and java.util.NavigableSet were only introduced in Java 6. We're still
targeting Java 5, aren't we? README.txt says so, anyway.
Thanks!
On 12.02.2011 19:47:06 lehmi wrote:
> Author: lehmi
> Date: Sat Feb 12 18:47:06 2011
> New Revision: 1070125
>
> URL: http://svn.apache.org/viewvc?rev=1070125&view=rev
> Log:
> PDFBOX-956: improved suppress duplicates algorithm when extracting text as proposed by Kevin Jackson
>
> Modified:
> pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
>
> Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
> URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1070125&r1=1070124&r2=1070125&view=diff
> ==============================================================================
> --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
> +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sat Feb 12 18:47:06 2011
> @@ -26,7 +26,11 @@ import java.util.Iterator;
> import java.util.LinkedList;
> import java.util.List;
> import java.util.Map;
> +import java.util.NavigableMap;
> +import java.util.NavigableSet;
> import java.util.Properties;
> +import java.util.TreeMap;
> +import java.util.TreeSet;
> import java.util.Vector;
> import java.util.regex.Pattern;
>
> @@ -155,7 +159,8 @@ public class PDFTextStripper extends PDF
> */
> protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
>
> - private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
> + private Map<String, NavigableMap<Float, NavigableSet<Float>>> characterListMapping =
> + new HashMap<String, NavigableMap<Float, NavigableSet<Float>>>();
>
> /**
> * encoding that text will be written in (or null).
> @@ -880,10 +885,10 @@ public class PDFTextStripper extends PDF
> String textCharacter = text.getCharacter();
> float textX = text.getX();
> float textY = text.getY();
> - List<TextPosition> sameTextCharacters = (List<TextPosition>)characterListMapping.get( textCharacter );
> + NavigableMap<Float, NavigableSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
> if( sameTextCharacters == null )
> {
> - sameTextCharacters = new ArrayList<TextPosition>();
> + sameTextCharacters = new TreeMap<Float, NavigableSet<Float>>();
> characterListMapping.put( textCharacter, sameTextCharacters );
> }
>
> @@ -900,27 +905,29 @@ public class PDFTextStripper extends PDF
> //
> boolean suppressCharacter = false;
> float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
> - for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
> - {
> - TextPosition character = sameTextCharacters.get( i );
> - String charCharacter = character.getCharacter();
> - float charX = character.getX();
> - float charY = character.getY();
> - //only want to suppress
> -
> - if( charCharacter != null &&
> - //charCharacter.equals( textCharacter ) &&
> - within( charX, textX, tolerance ) &&
> - within( charY,
> - textY,
> - tolerance ) )
> +
> + NavigableMap<Float, NavigableSet<Float>> xMatches =
> + sameTextCharacters.subMap(textX - tolerance , false, textX + tolerance , false);
> + for (NavigableSet<Float> xMatch : xMatches.values())
> + {
> + NavigableSet<Float> yMatches =
> + xMatch.subSet(textY - tolerance , false, textY + tolerance , false);
> + if (!yMatches.isEmpty())
> {
> suppressCharacter = true;
> + break;
> }
> }
> +
> if( !suppressCharacter )
> {
> - sameTextCharacters.add( text );
> + NavigableSet<Float> ySet = sameTextCharacters.get(textX);
> + if (ySet == null)
> + {
> + ySet = new TreeSet<Float>();
> + sameTextCharacters.put( textX, ySet );
> + }
> + ySet.add( textY );
> showCharacter = true;
> }
> }
>
Jeremias Maerki