You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by Jeremias Maerki <de...@jeremias-maerki.ch> on 2011/02/16 11:44:20 UTC

Re: svn commit: r1070125 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Andreas,
I'm afraid this breaks compatibility with Java 5: java.util.NavigableMap
and java.util.NavigableSet were only introduced in Java 6. We're still
targeting Java 5, aren't we? README.txt says so, anyway.

Thanks!

On 12.02.2011 19:47:06 lehmi wrote:
> Author: lehmi
> Date: Sat Feb 12 18:47:06 2011
> New Revision: 1070125
> 
> URL: http://svn.apache.org/viewvc?rev=1070125&view=rev
> Log:
> PDFBOX-956: improved suppress duplicates algorithm when extracting text as proposed by Kevin Jackson
> 
> Modified:
>     pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
> 
> Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
> URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1070125&r1=1070124&r2=1070125&view=diff
> ==============================================================================
> --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
> +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sat Feb 12 18:47:06 2011
> @@ -26,7 +26,11 @@ import java.util.Iterator;
>  import java.util.LinkedList;
>  import java.util.List;
>  import java.util.Map;
> +import java.util.NavigableMap;
> +import java.util.NavigableSet;
>  import java.util.Properties;
> +import java.util.TreeMap;
> +import java.util.TreeSet;
>  import java.util.Vector;
>  import java.util.regex.Pattern;
>  
> @@ -155,7 +159,8 @@ public class PDFTextStripper extends PDF
>       */
>      protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
>  
> -    private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
> +    private Map<String, NavigableMap<Float, NavigableSet<Float>>> characterListMapping =
> +        new HashMap<String, NavigableMap<Float, NavigableSet<Float>>>();
>  
>      /**
>       * encoding that text will be written in (or null).
> @@ -880,10 +885,10 @@ public class PDFTextStripper extends PDF
>              String textCharacter = text.getCharacter();
>              float textX = text.getX();
>              float textY = text.getY();
> -            List<TextPosition> sameTextCharacters = (List<TextPosition>)characterListMapping.get( textCharacter );
> +            NavigableMap<Float, NavigableSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
>              if( sameTextCharacters == null )
>              {
> -                sameTextCharacters = new ArrayList<TextPosition>();
> +                sameTextCharacters = new TreeMap<Float, NavigableSet<Float>>();
>                  characterListMapping.put( textCharacter, sameTextCharacters );
>              }
>  
> @@ -900,27 +905,29 @@ public class PDFTextStripper extends PDF
>              //
>              boolean suppressCharacter = false;
>              float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
> -            for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
> -            {
> -                TextPosition character = sameTextCharacters.get( i );
> -                String charCharacter = character.getCharacter();
> -                float charX = character.getX();
> -                float charY = character.getY();
> -                //only want to suppress
> -
> -                if( charCharacter != null &&
> -                        //charCharacter.equals( textCharacter ) &&
> -                        within( charX, textX, tolerance ) &&
> -                        within( charY,
> -                                textY,
> -                                tolerance ) )
> +            
> +            NavigableMap<Float, NavigableSet<Float>> xMatches =
> +                sameTextCharacters.subMap(textX - tolerance , false, textX + tolerance , false);
> +            for (NavigableSet<Float> xMatch : xMatches.values()) 
> +            {
> +                NavigableSet<Float> yMatches =
> +                    xMatch.subSet(textY - tolerance , false, textY + tolerance , false);
> +                if (!yMatches.isEmpty()) 
>                  {
>                      suppressCharacter = true;
> +                    break;
>                  }
>              }
> +
>              if( !suppressCharacter )
>              {
> -                sameTextCharacters.add( text );
> +                NavigableSet<Float> ySet = sameTextCharacters.get(textX);
> +                if (ySet == null) 
> +                {
> +                    ySet = new TreeSet<Float>();
> +                    sameTextCharacters.put( textX,  ySet );
> +                }
> +                ySet.add( textY );
>                  showCharacter = true;
>              }
>          }
> 




Jeremias Maerki