You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2011/02/12 19:47:06 UTC

svn commit: r1070125 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Author: lehmi
Date: Sat Feb 12 18:47:06 2011
New Revision: 1070125

URL: http://svn.apache.org/viewvc?rev=1070125&view=rev
Log:
PDFBOX-956: improved suppress duplicates algorithm when extracting text as proposed by Kevin Jackson

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1070125&r1=1070124&r2=1070125&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sat Feb 12 18:47:06 2011
@@ -26,7 +26,11 @@ import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.NavigableMap;
+import java.util.NavigableSet;
 import java.util.Properties;
+import java.util.TreeMap;
+import java.util.TreeSet;
 import java.util.Vector;
 import java.util.regex.Pattern;
 
@@ -155,7 +159,8 @@ public class PDFTextStripper extends PDF
      */
     protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
 
-    private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
+    private Map<String, NavigableMap<Float, NavigableSet<Float>>> characterListMapping =
+        new HashMap<String, NavigableMap<Float, NavigableSet<Float>>>();
 
     /**
      * encoding that text will be written in (or null).
@@ -880,10 +885,10 @@ public class PDFTextStripper extends PDF
             String textCharacter = text.getCharacter();
             float textX = text.getX();
             float textY = text.getY();
-            List<TextPosition> sameTextCharacters = (List<TextPosition>)characterListMapping.get( textCharacter );
+            NavigableMap<Float, NavigableSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
             if( sameTextCharacters == null )
             {
-                sameTextCharacters = new ArrayList<TextPosition>();
+                sameTextCharacters = new TreeMap<Float, NavigableSet<Float>>();
                 characterListMapping.put( textCharacter, sameTextCharacters );
             }
 
@@ -900,27 +905,29 @@ public class PDFTextStripper extends PDF
             //
             boolean suppressCharacter = false;
             float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
-            for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
-            {
-                TextPosition character = sameTextCharacters.get( i );
-                String charCharacter = character.getCharacter();
-                float charX = character.getX();
-                float charY = character.getY();
-                //only want to suppress
-
-                if( charCharacter != null &&
-                        //charCharacter.equals( textCharacter ) &&
-                        within( charX, textX, tolerance ) &&
-                        within( charY,
-                                textY,
-                                tolerance ) )
+            
+            NavigableMap<Float, NavigableSet<Float>> xMatches =
+                sameTextCharacters.subMap(textX - tolerance , false, textX + tolerance , false);
+            for (NavigableSet<Float> xMatch : xMatches.values()) 
+            {
+                NavigableSet<Float> yMatches =
+                    xMatch.subSet(textY - tolerance , false, textY + tolerance , false);
+                if (!yMatches.isEmpty()) 
                 {
                     suppressCharacter = true;
+                    break;
                 }
             }
+
             if( !suppressCharacter )
             {
-                sameTextCharacters.add( text );
+                NavigableSet<Float> ySet = sameTextCharacters.get(textX);
+                if (ySet == null) 
+                {
+                    ySet = new TreeSet<Float>();
+                    sameTextCharacters.put( textX,  ySet );
+                }
+                ySet.add( textY );
                 showCharacter = true;
             }
         }



Re: svn commit: r1070125 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Posted by Jeremias Maerki <de...@jeremias-maerki.ch>.
Andreas,
I'm afraid this breaks compatibility with Java 5: the java.util.Navigable*
interfaces (NavigableMap, NavigableSet) were only introduced in Java 6.
We're still targeting Java 5, aren't we? README.txt says so, anyway.

Thanks!

On 12.02.2011 19:47:06 lehmi wrote:
> Author: lehmi
> Date: Sat Feb 12 18:47:06 2011
> New Revision: 1070125
> 
> URL: http://svn.apache.org/viewvc?rev=1070125&view=rev
> Log:
> PDFBOX-956: improved suppress duplicates algorithm when extracting text as proposed by Kevin Jackson
> 
> Modified:
>     pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
> 
> Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
> URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1070125&r1=1070124&r2=1070125&view=diff
> ==============================================================================
> --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
> +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sat Feb 12 18:47:06 2011
> @@ -26,7 +26,11 @@ import java.util.Iterator;
>  import java.util.LinkedList;
>  import java.util.List;
>  import java.util.Map;
> +import java.util.NavigableMap;
> +import java.util.NavigableSet;
>  import java.util.Properties;
> +import java.util.TreeMap;
> +import java.util.TreeSet;
>  import java.util.Vector;
>  import java.util.regex.Pattern;
>  
> @@ -155,7 +159,8 @@ public class PDFTextStripper extends PDF
>       */
>      protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
>  
> -    private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
> +    private Map<String, NavigableMap<Float, NavigableSet<Float>>> characterListMapping =
> +        new HashMap<String, NavigableMap<Float, NavigableSet<Float>>>();
>  
>      /**
>       * encoding that text will be written in (or null).
> @@ -880,10 +885,10 @@ public class PDFTextStripper extends PDF
>              String textCharacter = text.getCharacter();
>              float textX = text.getX();
>              float textY = text.getY();
> -            List<TextPosition> sameTextCharacters = (List<TextPosition>)characterListMapping.get( textCharacter );
> +            NavigableMap<Float, NavigableSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
>              if( sameTextCharacters == null )
>              {
> -                sameTextCharacters = new ArrayList<TextPosition>();
> +                sameTextCharacters = new TreeMap<Float, NavigableSet<Float>>();
>                  characterListMapping.put( textCharacter, sameTextCharacters );
>              }
>  
> @@ -900,27 +905,29 @@ public class PDFTextStripper extends PDF
>              //
>              boolean suppressCharacter = false;
>              float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
> -            for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
> -            {
> -                TextPosition character = sameTextCharacters.get( i );
> -                String charCharacter = character.getCharacter();
> -                float charX = character.getX();
> -                float charY = character.getY();
> -                //only want to suppress
> -
> -                if( charCharacter != null &&
> -                        //charCharacter.equals( textCharacter ) &&
> -                        within( charX, textX, tolerance ) &&
> -                        within( charY,
> -                                textY,
> -                                tolerance ) )
> +            
> +            NavigableMap<Float, NavigableSet<Float>> xMatches =
> +                sameTextCharacters.subMap(textX - tolerance , false, textX + tolerance , false);
> +            for (NavigableSet<Float> xMatch : xMatches.values()) 
> +            {
> +                NavigableSet<Float> yMatches =
> +                    xMatch.subSet(textY - tolerance , false, textY + tolerance , false);
> +                if (!yMatches.isEmpty()) 
>                  {
>                      suppressCharacter = true;
> +                    break;
>                  }
>              }
> +
>              if( !suppressCharacter )
>              {
> -                sameTextCharacters.add( text );
> +                NavigableSet<Float> ySet = sameTextCharacters.get(textX);
> +                if (ySet == null) 
> +                {
> +                    ySet = new TreeSet<Float>();
> +                    sameTextCharacters.put( textX,  ySet );
> +                }
> +                ySet.add( textY );
>                  showCharacter = true;
>              }
>          }
> 




Jeremias Maerki