You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2014/09/26 22:41:00 UTC
svn commit: r1627880 - in
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox:
text/TextNormalize.java text/TextPosition.java
util/PDFMarkedContentExtractor.java util/PDFTextStripper.java
Author: jahewson
Date: Fri Sep 26 20:41:00 2014
New Revision: 1627880
URL: http://svn.apache.org/r1627880
Log:
PDFBOX-2384: Refactor TextNormalize class
Removed:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java?rev=1627880&r1=1627879&r2=1627880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java Fri Sep 26 20:41:00 2014
@@ -16,6 +16,8 @@
*/
package org.apache.pdfbox.text;
+import java.text.Normalizer;
+import java.util.HashMap;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.Matrix;
@@ -26,6 +28,50 @@ import org.apache.pdfbox.util.Matrix;
*/
public final class TextPosition
{
+ private static final HashMap<Integer, String> DIACRITICS = createDiacritics();
+
+ // Adds non-decomposing diacritics to the hash with their related combining character.
+ // These are values that the Unicode spec claims are equivalent but which are not mapped by the
+ // NFKC normalization form. Determined by going through the Combining Diacritical Marks
+ // section of the Unicode spec and identifying which characters are not mapped to by the
+ // normalization.
+ private static HashMap<Integer, String> createDiacritics()
+ {
+ HashMap<Integer, String> map = new HashMap<Integer, String>();
+ map.put(0x0060, "\u0300");
+ map.put(0x02CB, "\u0300");
+ map.put(0x0027, "\u0301");
+ map.put(0x02B9, "\u0301");
+ map.put(0x02CA, "\u0301");
+ map.put(0x005e, "\u0302");
+ map.put(0x02C6, "\u0302");
+ map.put(0x007E, "\u0303");
+ map.put(0x02C9, "\u0304");
+ map.put(0x00B0, "\u030A");
+ map.put(0x02BA, "\u030B");
+ map.put(0x02C7, "\u030C");
+ map.put(0x02C8, "\u030D");
+ map.put(0x0022, "\u030E");
+ map.put(0x02BB, "\u0312");
+ map.put(0x02BC, "\u0313");
+ map.put(0x0486, "\u0313");
+ map.put(0x055A, "\u0313");
+ map.put(0x02BD, "\u0314");
+ map.put(0x0485, "\u0314");
+ map.put(0x0559, "\u0314");
+ map.put(0x02D4, "\u031D");
+ map.put(0x02D5, "\u031E");
+ map.put(0x02D6, "\u031F");
+ map.put(0x02D7, "\u0320");
+ map.put(0x02B2, "\u0321");
+ map.put(0x02CC, "\u0329");
+ map.put(0x02B7, "\u032B");
+ map.put(0x02CD, "\u0331");
+ map.put(0x005F, "\u0332");
+ map.put(0x204E, "\u0359");
+ return map;
+ }
+
// text matrix for the start of the text object, coordinates are in display units
// and have not been adjusted
private final Matrix textMatrix;
@@ -473,9 +519,8 @@ public final class TextPosition
* contains() method to test if two objects overlap.
*
* @param diacritic TextPosition to merge into the current TextPosition.
- * @param normalize Instance of TextNormalize class to be used to normalize diacritic
*/
- public void mergeDiacritic(TextPosition diacritic, TextNormalize normalize)
+ public void mergeDiacritic(TextPosition diacritic)
{
if (diacritic.getUnicode().length() > 1)
{
@@ -501,7 +546,7 @@ public final class TextPosition
{
if (i == 0)
{
- insertDiacritic(i, diacritic, normalize);
+ insertDiacritic(i, diacritic);
}
else
{
@@ -513,11 +558,11 @@ public final class TextPosition
if (percentage1 >= percentage2)
{
- insertDiacritic(i, diacritic, normalize);
+ insertDiacritic(i, diacritic);
}
else
{
- insertDiacritic(i - 1, diacritic, normalize);
+ insertDiacritic(i - 1, diacritic);
}
}
wasAdded = true;
@@ -526,20 +571,20 @@ public final class TextPosition
// character the diacritic belongs to
else if (diacXStart < currCharXStart && diacXEnd > currCharXEnd)
{
- insertDiacritic(i, diacritic, normalize);
+ insertDiacritic(i, diacritic);
wasAdded = true;
}
// otherwise, The diacritic modifies this character because its completely
// contained by the character width
else if (diacXStart >= currCharXStart && diacXEnd <= currCharXEnd)
{
- insertDiacritic(i, diacritic, normalize);
+ insertDiacritic(i, diacritic);
wasAdded = true;
}
// last character in the TextPosition so we add diacritic to the end
else if (diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == strLen - 1)
{
- insertDiacritic(i, diacritic, normalize);
+ insertDiacritic(i, diacritic);
wasAdded = true;
}
@@ -554,9 +599,8 @@ public final class TextPosition
*
* @param i current character
* @param diacritic The diacritic TextPosition
- * @param normalize Instance of TextNormalize class to be used to normalize diacritic
*/
- private void insertDiacritic(int i, TextPosition diacritic, TextNormalize normalize)
+ private void insertDiacritic(int i, TextPosition diacritic)
{
// we add the diacritic to the right or left of the character depending on the direction
// of the character. Note that this is only required because the text is currently stored in
@@ -574,7 +618,7 @@ public final class TextPosition
dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING ||
dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
{
- sb.append(normalize.normalizeDiacritic(diacritic.getUnicode()));
+ sb.append(combineDiacritic(diacritic.getUnicode()));
widths2[i] = 0;
sb.append(unicode.charAt(i));
widths2[i + 1] = widths[i];
@@ -583,7 +627,7 @@ public final class TextPosition
{
sb.append(unicode.charAt(i));
widths2[i] = widths[i];
- sb.append(normalize.normalizeDiacritic(diacritic.getUnicode()));
+ sb.append(combineDiacritic(diacritic.getUnicode()));
widths2[i + 1] = 0;
}
@@ -596,6 +640,29 @@ public final class TextPosition
}
/**
+ * Combine the diacritic, for example, convert non-combining diacritic characters to their
+ * combining counterparts.
+ *
+ * @param str String to normalize
+ * @return Normalized string
+ */
+ private String combineDiacritic(String str)
+ {
+ // Unicode contains special combining forms of the diacritic characters which we want to use
+ int codePoint = str.codePointAt(0);
+
+ // convert the characters which Unicode normalization does not map to a combining form
+ if (DIACRITICS.containsKey(codePoint))
+ {
+ return DIACRITICS.get(codePoint);
+ }
+ else
+ {
+ return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
+ }
+ }
+
+ /**
* @return True if the current character is a diacritic char.
*/
public boolean isDiacritic()
@@ -609,7 +676,8 @@ public final class TextPosition
return type == Character.NON_SPACING_MARK ||
type == Character.MODIFIER_SYMBOL ||
type == Character.MODIFIER_LETTER;
- }
+
+ }
/**
* Show the string data for this text position.
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java?rev=1627880&r1=1627879&r2=1627880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java Fri Sep 26 20:41:00 2014
@@ -28,7 +28,6 @@ import org.apache.pdfbox.cos.COSDictiona
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.text.TextNormalize;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequence;
import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
@@ -47,14 +46,7 @@ public class PDFMarkedContentExtractor e
private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
/**
- * The normalizer is used to remove text ligatures/presentation forms
- * and to correct the direction of right to left text, such as Arabic and Hebrew.
- */
- private TextNormalize normalize = null;
-
- /**
- * Instantiate a new PDFTextStripper object. Will not do anything special to convert
- * the text to a more encoding-specific output.
+ * Instantiate a new PDFMarkedContentExtractor object.
*/
public PDFMarkedContentExtractor() throws IOException
{
@@ -73,8 +65,6 @@ public class PDFMarkedContentExtractor e
addOperator(new EndMarkedContentSequence());
// todo: DP - Marked Content Point
// todo: MP - Marked Content Point with Properties
-
- this.normalize = new TextNormalize();
}
/**
@@ -209,13 +199,13 @@ public class PDFMarkedContentExtractor e
TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
if(text.isDiacritic() && previousTextPosition.contains(text))
{
- previousTextPosition.mergeDiacritic(text, this.normalize);
+ previousTextPosition.mergeDiacritic(text);
}
/* If the previous TextPosition was the diacritic, merge it into this
* one and remove it from the list. */
else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
{
- text.mergeDiacritic(previousTextPosition, this.normalize);
+ text.mergeDiacritic(previousTextPosition);
textList.remove(textList.size()-1);
textList.add(text);
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1627880&r1=1627879&r2=1627880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Fri Sep 26 20:41:00 2014
@@ -19,6 +19,7 @@ package org.apache.pdfbox.util;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
+import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -44,7 +45,6 @@ import org.apache.pdfbox.pdmodel.encrypt
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
-import org.apache.pdfbox.text.TextNormalize;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.text.TextPositionComparator;
@@ -167,25 +167,17 @@ public class PDFTextStripper extends PDF
protected Writer output;
/**
- * The normalizer is used to remove text ligatures/presentation forms
- * and to correct the direction of right to left text, such as Arabic and Hebrew.
- */
- private TextNormalize normalize = null;
-
- /**
* True if we started a paragraph but haven't ended it yet.
*/
private boolean inParagraph;
/**
- * Instantiate a new PDFTextStripper object. Will not do
- * anything special to convert the text to a more encoding-specific output.
+ * Instantiate a new PDFTextStripper object.
*
* @throws IOException If there is an error loading the properties.
*/
public PDFTextStripper() throws IOException
{
- normalize = new TextNormalize();
}
/**
@@ -934,13 +926,13 @@ public class PDFTextStripper extends PDF
TextPosition previousTextPosition = textList.get(textList.size() - 1);
if (text.isDiacritic() && previousTextPosition.contains(text))
{
- previousTextPosition.mergeDiacritic(text, normalize);
+ previousTextPosition.mergeDiacritic(text);
}
// If the previous TextPosition was the diacritic, merge it into this
// one and remove it from the list.
else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
{
- text.mergeDiacritic(previousTextPosition, normalize);
+ text.mergeDiacritic(previousTextPosition);
textList.remove(textList.size()-1);
textList.add(text);
}
@@ -1783,7 +1775,61 @@ public class PDFTextStripper extends PDF
*/
private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions)
{
- return new WordWithTextPositions(normalize.normalizePresentationForm(word), wordPositions);
+ return new WordWithTextPositions(normalizeWord(word), wordPositions);
+ }
+
+ /**
+ * Normalize certain Unicode characters. For example, convert the
+ * single "fi" ligature to "f" and "i". Also normalizes Arabic and Hebrew presentation forms.
+ *
+ * @param word Word to normalize
+ * @return Normalized word
+ */
+ private String normalizeWord(String word)
+ {
+ StringBuilder builder = null;
+ int p = 0;
+ int q = 0;
+ int strLength = word.length();
+ for (; q < strLength; q++)
+ {
+ // We only normalize if the codepoint is in a given range.
+ // Otherwise, NFKC converts too many things that would cause
+ // confusion. For example, it converts the micro symbol in
+ // extended Latin to the value in the Greek script. We normalize
+ // the Unicode Alphabetic and Arabic A&B Presentation forms.
+ char c = word.charAt(q);
+ if (0xFB00 <= c && c <= 0xFDFF || 0xFE70 <= c && c <= 0xFEFF)
+ {
+ if (builder == null)
+ {
+ builder = new StringBuilder(strLength * 2);
+ }
+ builder.append(word.substring(p, q));
+ // Some fonts map U+FDF2 differently than the Unicode spec.
+ // They add an extra U+0627 character to compensate.
+ // This removes the extra character for those fonts.
+ if(c == 0xFDF2 && q > 0 && (word.charAt(q-1) == 0x0627 || word.charAt(q-1) == 0xFE8D))
+ {
+ builder.append("\u0644\u0644\u0647");
+ }
+ else
+ {
+ // Trim because some decompositions have an extra space, such as U+FC5E
+ builder.append(Normalizer.normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim());
+ }
+ p = q + 1;
+ }
+ }
+ if (builder == null)
+ {
+ return word;
+ }
+ else
+ {
+ builder.append(word.substring(p, q));
+ return builder.toString();
+ }
}
/**