You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2014/09/26 22:41:00 UTC

svn commit: r1627880 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox: text/TextNormalize.java text/TextPosition.java util/PDFMarkedContentExtractor.java util/PDFTextStripper.java

Author: jahewson
Date: Fri Sep 26 20:41:00 2014
New Revision: 1627880

URL: http://svn.apache.org/r1627880
Log:
PDFBOX-2384: Refactor TextNormalize class

Removed:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java?rev=1627880&r1=1627879&r2=1627880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java Fri Sep 26 20:41:00 2014
@@ -16,6 +16,8 @@
  */
 package org.apache.pdfbox.text;
 
+import java.text.Normalizer;
+import java.util.HashMap;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.pdfbox.util.Matrix;
 
@@ -26,6 +28,50 @@ import org.apache.pdfbox.util.Matrix;
  */
 public final class TextPosition
 {
+    private static final HashMap<Integer, String> DIACRITICS = createDiacritics();
+
+    // Builds the table that maps spacing (non-combining) diacritic code points to the String
+    // holding their related combining character from the Combining Diacritical Marks block.
+    // These are values that the Unicode spec claims are equivalent but that are not mapped by
+    // the NFKC normalization form. Determined by going through the Combining Diacritical Marks
+    // section of the Unicode spec and identifying which characters the normalization misses.
+    private static HashMap<Integer, String> createDiacritics()
+    {
+        HashMap<Integer, String> map = new HashMap<Integer, String>();
+        map.put(0x0060, "\u0300"); // ASCII grave accent -> combining grave accent
+        map.put(0x02CB, "\u0300");
+        map.put(0x0027, "\u0301"); // ASCII apostrophe -> combining acute accent
+        map.put(0x02B9, "\u0301");
+        map.put(0x02CA, "\u0301");
+        map.put(0x005e, "\u0302"); // ASCII circumflex accent -> combining circumflex accent
+        map.put(0x02C6, "\u0302");
+        map.put(0x007E, "\u0303"); // ASCII tilde -> combining tilde
+        map.put(0x02C9, "\u0304");
+        map.put(0x00B0, "\u030A"); // degree sign -> combining ring above
+        map.put(0x02BA, "\u030B");
+        map.put(0x02C7, "\u030C");
+        map.put(0x02C8, "\u030D");
+        map.put(0x0022, "\u030E"); // ASCII quotation mark -> combining double vertical line above
+        map.put(0x02BB, "\u0312");
+        map.put(0x02BC, "\u0313");
+        map.put(0x0486, "\u0313");
+        map.put(0x055A, "\u0313");
+        map.put(0x02BD, "\u0314");
+        map.put(0x0485, "\u0314");
+        map.put(0x0559, "\u0314");
+        map.put(0x02D4, "\u031D");
+        map.put(0x02D5, "\u031E");
+        map.put(0x02D6, "\u031F");
+        map.put(0x02D7, "\u0320");
+        map.put(0x02B2, "\u0321");
+        map.put(0x02CC, "\u0329");
+        map.put(0x02B7, "\u032B");
+        map.put(0x02CD, "\u0331");
+        map.put(0x005F, "\u0332"); // ASCII low line (underscore) -> combining low line
+        map.put(0x204E, "\u0359");
+        return map;
+    }
+
     // text matrix for the start of the text object, coordinates are in display units
     // and have not been adjusted
     private final Matrix textMatrix;
@@ -473,9 +519,8 @@ public final class TextPosition
      * contains() method to test if two objects overlap.
      *
      * @param diacritic TextPosition to merge into the current TextPosition.
-     * @param normalize Instance of TextNormalize class to be used to normalize diacritic
      */
-    public void mergeDiacritic(TextPosition diacritic, TextNormalize normalize)
+    public void mergeDiacritic(TextPosition diacritic)
     {
         if (diacritic.getUnicode().length() > 1)
         {
@@ -501,7 +546,7 @@ public final class TextPosition
             {
                 if (i == 0)
                 {
-                    insertDiacritic(i, diacritic, normalize);
+                    insertDiacritic(i, diacritic);
                 }
                 else
                 {
@@ -513,11 +558,11 @@ public final class TextPosition
 
                     if (percentage1 >= percentage2)
                     {
-                        insertDiacritic(i, diacritic, normalize);
+                        insertDiacritic(i, diacritic);
                     }
                     else
                     {
-                        insertDiacritic(i - 1, diacritic, normalize);
+                        insertDiacritic(i - 1, diacritic);
                     }
                 }
                 wasAdded = true;
@@ -526,20 +571,20 @@ public final class TextPosition
             // character the diacritic belongs to
             else if (diacXStart < currCharXStart && diacXEnd > currCharXEnd)
             {
-                insertDiacritic(i, diacritic, normalize);
+                insertDiacritic(i, diacritic);
                 wasAdded = true;
             }
             // otherwise, The diacritic modifies this character because its completely
             // contained by the character width
             else if (diacXStart >= currCharXStart && diacXEnd <= currCharXEnd)
             {
-                insertDiacritic(i, diacritic, normalize);
+                insertDiacritic(i, diacritic);
                 wasAdded = true;
             }
             // last character in the TextPosition so we add diacritic to the end
             else if (diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == strLen - 1)
             {
-                insertDiacritic(i, diacritic, normalize);
+                insertDiacritic(i, diacritic);
                 wasAdded = true;
             }
 
@@ -554,9 +599,8 @@ public final class TextPosition
      *
      * @param i current character
      * @param diacritic The diacritic TextPosition
-     * @param normalize Instance of TextNormalize class to be used to normalize diacritic
      */
-    private void insertDiacritic(int i, TextPosition diacritic, TextNormalize normalize)
+    private void insertDiacritic(int i, TextPosition diacritic)
     {
         // we add the diacritic to the right or left of the character depending on the direction
         // of the character. Note that this is only required because the text is currently stored in
@@ -574,7 +618,7 @@ public final class TextPosition
             dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING ||
             dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
         {
-            sb.append(normalize.normalizeDiacritic(diacritic.getUnicode()));
+            sb.append(combineDiacritic(diacritic.getUnicode()));
             widths2[i] = 0;
             sb.append(unicode.charAt(i));
             widths2[i + 1] = widths[i];
@@ -583,7 +627,7 @@ public final class TextPosition
         {
             sb.append(unicode.charAt(i));
             widths2[i] = widths[i];
-            sb.append(normalize.normalizeDiacritic(diacritic.getUnicode()));
+            sb.append(combineDiacritic(diacritic.getUnicode()));
             widths2[i + 1] = 0;
         }
 
@@ -596,6 +640,29 @@ public final class TextPosition
     }
 
     /**
+     * Combine the diacritic, for example, convert non-combining diacritic characters to their
+     * combining counterparts.
+     *
+     * @param str Diacritic text to convert; must be non-empty, only its first code point is looked up
+     * @return The mapped combining character, or else the trimmed NFKC normalization of str
+     */
+    private String combineDiacritic(String str)
+    {
+        // the table is keyed by the first code point; codePointAt(0) throws on an empty string
+        int codePoint = str.codePointAt(0);
+
+        // prefer the hand-built table, which covers characters NFKC does not map to a combining form
+        if (DIACRITICS.containsKey(codePoint))
+        {
+            return DIACRITICS.get(codePoint);
+        }
+        else
+        {
+            return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
+        }
+    }
+
+    /**
      * @return True if the current character is a diacritic char.
      */
     public boolean isDiacritic()
@@ -609,7 +676,8 @@ public final class TextPosition
         return type == Character.NON_SPACING_MARK ||
                type == Character.MODIFIER_SYMBOL ||
                type == Character.MODIFIER_LETTER;
-    }
+
+  }
 
     /**
      * Show the string data for this text position.

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java?rev=1627880&r1=1627879&r2=1627880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java Fri Sep 26 20:41:00 2014
@@ -28,7 +28,6 @@ import org.apache.pdfbox.cos.COSDictiona
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
 import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.text.TextNormalize;
 import org.apache.pdfbox.text.TextPosition;
 import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequence;
 import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
@@ -47,14 +46,7 @@ public class PDFMarkedContentExtractor e
     private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
 
     /**
-     * The normalizer is used to remove text ligatures/presentation forms
-     * and to correct the direction of right to left text, such as Arabic and Hebrew.
-     */
-    private TextNormalize normalize = null;
-
-    /**
-     * Instantiate a new PDFTextStripper object. Will not do anything special to convert
-     * the text to a more encoding-specific output.
+     * Instantiate a new PDFMarkedContentExtractor object.
      */
     public PDFMarkedContentExtractor() throws IOException
     {
@@ -73,8 +65,6 @@ public class PDFMarkedContentExtractor e
         addOperator(new EndMarkedContentSequence());
         // todo: DP - Marked Content Point
         // todo: MP - Marked Content Point with Properties
-
-        this.normalize = new TextNormalize();
     }
 
     /**
@@ -209,13 +199,13 @@ public class PDFMarkedContentExtractor e
                 TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
                 if(text.isDiacritic() && previousTextPosition.contains(text))
                 {
-                    previousTextPosition.mergeDiacritic(text, this.normalize);
+                    previousTextPosition.mergeDiacritic(text);
                 }
                 /* If the previous TextPosition was the diacritic, merge it into this
                  * one and remove it from the list. */
                 else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
                 {
-                    text.mergeDiacritic(previousTextPosition, this.normalize);
+                    text.mergeDiacritic(previousTextPosition);
                     textList.remove(textList.size()-1);
                     textList.add(text);
                 }

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1627880&r1=1627879&r2=1627880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Fri Sep 26 20:41:00 2014
@@ -19,6 +19,7 @@ package org.apache.pdfbox.util;
 import java.io.IOException;
 import java.io.StringWriter;
 import java.io.Writer;
+import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
@@ -44,7 +45,6 @@ import org.apache.pdfbox.pdmodel.encrypt
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
-import org.apache.pdfbox.text.TextNormalize;
 import org.apache.pdfbox.text.TextPosition;
 import org.apache.pdfbox.text.TextPositionComparator;
 
@@ -167,25 +167,17 @@ public class PDFTextStripper extends PDF
     protected Writer output;
 
     /**
-     * The normalizer is used to remove text ligatures/presentation forms
-     * and to correct the direction of right to left text, such as Arabic and Hebrew.
-     */
-    private TextNormalize normalize = null;
-
-    /**
      * True if we started a paragraph but haven't ended it yet.
      */
     private boolean inParagraph;
 
     /**
-     * Instantiate a new PDFTextStripper object. Will not do
-     * anything special to convert the text to a more encoding-specific output.
+     * Instantiate a new PDFTextStripper object.
      *
      * @throws IOException If there is an error loading the properties.
      */
     public PDFTextStripper() throws IOException
     {
-        normalize = new TextNormalize();
     }
 
     /**
@@ -934,13 +926,13 @@ public class PDFTextStripper extends PDF
                 TextPosition previousTextPosition = textList.get(textList.size() - 1);
                 if (text.isDiacritic() && previousTextPosition.contains(text))
                 {
-                    previousTextPosition.mergeDiacritic(text, normalize);
+                    previousTextPosition.mergeDiacritic(text);
                 }
                 // If the previous TextPosition was the diacritic, merge it into this
                 // one and remove it from the list.
                 else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
                 {
-                    text.mergeDiacritic(previousTextPosition, normalize);
+                    text.mergeDiacritic(previousTextPosition);
                     textList.remove(textList.size()-1);
                     textList.add(text);
                 }
@@ -1783,7 +1775,61 @@ public class PDFTextStripper extends PDF
      */
     private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions)
     {
-        return new WordWithTextPositions(normalize.normalizePresentationForm(word), wordPositions);
+        return new WordWithTextPositions(normalizeWord(word), wordPositions);
+    }
+
+    /**
+     * Normalize certain Unicode presentation forms using NFKC. For example, convert the
+     * single "fi" ligature to "f" and "i". Also normalizes Arabic and Hebrew presentation forms.
+     *
+     * @param word Word to normalize; characters outside the handled ranges are copied unchanged
+     * @return Normalized word; the given String itself is returned when no character is in range
+     */
+    private String normalizeWord(String word)
+    {
+        StringBuilder builder = null; // created lazily, only once a character needs normalizing
+        int p = 0; // start of the pending run that has not been copied to builder yet
+        int q = 0; // index of the character currently being examined
+        int strLength = word.length();
+        for (; q < strLength; q++)
+        {
+            // We only normalize if the character is in a given range.
+            // Otherwise, NFKC converts too many things that would cause
+            // confusion. For example, it converts the micro symbol in
+            // extended Latin to the value in the Greek script. We normalize the Alphabetic
+            // Presentation Forms and the Arabic Presentation Forms A and B (ranges tested below).
+            char c = word.charAt(q);
+            if (0xFB00 <= c && c <= 0xFDFF || 0xFE70 <= c && c <= 0xFEFF)
+            {
+                if (builder == null)
+                {
+                    builder = new StringBuilder(strLength * 2);
+                }
+                builder.append(word.substring(p, q));
+                // Some fonts map U+FDF2 differently than the Unicode spec and add an extra
+                // U+0627 character to compensate. If U+0627 or U+FE8D precedes U+FDF2, a fixed
+                // replacement is appended instead of the NFKC form, removing the extra character.
+                if(c == 0xFDF2 && q > 0 && (word.charAt(q-1) == 0x0627 || word.charAt(q-1) == 0xFE8D))
+                {
+                    builder.append("\u0644\u0644\u0647");
+                }
+                else
+                {
+                    // Trim because some decompositions have an extra space, such as U+FC5E
+                    builder.append(Normalizer.normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim());
+                }
+                p = q + 1;
+            }
+        }
+        if (builder == null)
+        {
+            return word;
+        }
+        else
+        {
+            builder.append(word.substring(p, q)); // copy the trailing run that needed no change
+            return builder.toString();
+        }
     }
 
     /**