You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2009/07/28 20:25:33 UTC
svn commit: r798640 [2/2] - in
/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox:
pdmodel/graphics/color/ pdmodel/graphics/xobject/ persistence/util/ util/
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java?rev=798640&r1=798639&r2=798640&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java Tue Jul 28 18:25:32 2009
@@ -22,29 +22,37 @@
* This class allows a caller to normalize text in various ways.
* It will load the ICU4J jar file if it is defined on the classpath.
*
+ * @author <a href="mailto:carrier@digital-evidence.org">Brian Carrier</a>
+ * @version $Revision: 1.0 $
*/
-public class TextNormalize {
+public class TextNormalize
+{
private ICU4JImpl icu4j = null;
- private static final HashMap diacHash = new HashMap();
- private String encoding;
+ private static final HashMap DIACHASH = new HashMap();
+ private String outputEncoding;
/**
*
* @param encoding The Encoding that the text will eventually be written as (or null)
*/
- public TextNormalize(String encoding) {
+ public TextNormalize(String encoding)
+ {
findICU4J();
populateDiacHash();
- this.encoding = encoding;
+ this.outputEncoding = encoding;
}
- private void findICU4J() {
+ private void findICU4J()
+ {
// see if we can load the icu4j classes from the classpath
- try {
+ try
+ {
this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Bidi");
this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Normalizer");
icu4j = new ICU4JImpl();
- } catch (ClassNotFoundException e) {
+ }
+ catch (ClassNotFoundException e)
+ {
icu4j = null;
}
}
@@ -56,38 +64,39 @@
* the Unicode spec and identifying which characters are not mapped to by
* the normalization.
*/
- private void populateDiacHash(){
- diacHash.put(new Integer(0x0060), "\u0300");
- diacHash.put(new Integer(0x02CB), "\u0300");
- diacHash.put(new Integer(0x0027), "\u0301");
- diacHash.put(new Integer(0x02B9), "\u0301");
- diacHash.put(new Integer(0x02CA), "\u0301");
- diacHash.put(new Integer(0x005e), "\u0302");
- diacHash.put(new Integer(0x02C6), "\u0302");
- diacHash.put(new Integer(0x007E), "\u0303");
- diacHash.put(new Integer(0x02C9), "\u0304");
- diacHash.put(new Integer(0x00B0), "\u030A");
- diacHash.put(new Integer(0x02BA), "\u030B");
- diacHash.put(new Integer(0x02C7), "\u030C");
- diacHash.put(new Integer(0x02C8), "\u030D");
- diacHash.put(new Integer(0x0022), "\u030E");
- diacHash.put(new Integer(0x02BB), "\u0312");
- diacHash.put(new Integer(0x02BC), "\u0313");
- diacHash.put(new Integer(0x0486), "\u0313");
- diacHash.put(new Integer(0x055A), "\u0313");
- diacHash.put(new Integer(0x02BD), "\u0314");
- diacHash.put(new Integer(0x0485), "\u0314");
- diacHash.put(new Integer(0x0559), "\u0314");
- diacHash.put(new Integer(0x02D4), "\u031D");
- diacHash.put(new Integer(0x02D5), "\u031E");
- diacHash.put(new Integer(0x02D6), "\u031F");
- diacHash.put(new Integer(0x02D7), "\u0320");
- diacHash.put(new Integer(0x02B2), "\u0321");
- diacHash.put(new Integer(0x02CC), "\u0329");
- diacHash.put(new Integer(0x02B7), "\u032B");
- diacHash.put(new Integer(0x02CD), "\u0331");
- diacHash.put(new Integer(0x005F), "\u0332");
- diacHash.put(new Integer(0x204E), "\u0359");
+ private void populateDiacHash()
+ {
+ DIACHASH.put(new Integer(0x0060), "\u0300");
+ DIACHASH.put(new Integer(0x02CB), "\u0300");
+ DIACHASH.put(new Integer(0x0027), "\u0301");
+ DIACHASH.put(new Integer(0x02B9), "\u0301");
+ DIACHASH.put(new Integer(0x02CA), "\u0301");
+ DIACHASH.put(new Integer(0x005e), "\u0302");
+ DIACHASH.put(new Integer(0x02C6), "\u0302");
+ DIACHASH.put(new Integer(0x007E), "\u0303");
+ DIACHASH.put(new Integer(0x02C9), "\u0304");
+ DIACHASH.put(new Integer(0x00B0), "\u030A");
+ DIACHASH.put(new Integer(0x02BA), "\u030B");
+ DIACHASH.put(new Integer(0x02C7), "\u030C");
+ DIACHASH.put(new Integer(0x02C8), "\u030D");
+ DIACHASH.put(new Integer(0x0022), "\u030E");
+ DIACHASH.put(new Integer(0x02BB), "\u0312");
+ DIACHASH.put(new Integer(0x02BC), "\u0313");
+ DIACHASH.put(new Integer(0x0486), "\u0313");
+ DIACHASH.put(new Integer(0x055A), "\u0313");
+ DIACHASH.put(new Integer(0x02BD), "\u0314");
+ DIACHASH.put(new Integer(0x0485), "\u0314");
+ DIACHASH.put(new Integer(0x0559), "\u0314");
+ DIACHASH.put(new Integer(0x02D4), "\u031D");
+ DIACHASH.put(new Integer(0x02D5), "\u031E");
+ DIACHASH.put(new Integer(0x02D6), "\u031F");
+ DIACHASH.put(new Integer(0x02D7), "\u0320");
+ DIACHASH.put(new Integer(0x02B2), "\u0321");
+ DIACHASH.put(new Integer(0x02CC), "\u0329");
+ DIACHASH.put(new Integer(0x02B7), "\u032B");
+ DIACHASH.put(new Integer(0x02CD), "\u0331");
+ DIACHASH.put(new Integer(0x005F), "\u0332");
+ DIACHASH.put(new Integer(0x204E), "\u0359");
}
/**
@@ -97,16 +106,19 @@
* if the text involves both RTL and LTR text then the Unicode BIDI algorithm
* must be used to determine how to map between them.
*
- * @param a_str Presentation form of line to convert (i.e. left most char is first char)
- * @param a_isRtlDominant true if the PAGE has a dominant right to left ordering
+ * @param str Presentation form of line to convert (i.e. left most char is first char)
+ * @param isRtlDominant true if the PAGE has a dominant right to left ordering
* @return Logical form of string (or original string if ICU4J library is not on classpath)
*/
- public String makeLineLogicalOrder(String a_str, boolean a_isRtlDominant) {
- if (icu4j != null) {
- return icu4j.makeLineLogicalOrder(a_str, a_isRtlDominant);
- }
- else {
- return a_str;
+ public String makeLineLogicalOrder(String str, boolean isRtlDominant)
+ {
+ if (icu4j != null)
+ {
+ return icu4j.makeLineLogicalOrder(str, isRtlDominant);
+ }
+ else
+ {
+ return str;
}
}
@@ -114,15 +126,18 @@
* Normalize the presentation forms of characters in the string.
* For example, convert the single "fi" ligature to "f" and "i".
*
- * @param a_str String to normalize
+ * @param str String to normalize
* @return Normalized string (or original string if ICU4J library is not on classpath)
*/
- public String normalizePres(String a_str) {
- if (icu4j != null) {
- return icu4j.normalizePres(a_str);
- }
- else {
- return a_str;
+ public String normalizePres(String str)
+ {
+ if (icu4j != null)
+ {
+ return icu4j.normalizePres(str);
+ }
+ else
+ {
+ return str;
}
}
@@ -131,29 +146,35 @@
* convert non-combining diacritic characters to their combining
* counterparts.
*
- * @param a_str String to normalize
+ * @param str String to normalize
* @return Normalized string (or original string if ICU4J library is not on classpath)
*/
- public String normalizeDiac(String a_str){
+ public String normalizeDiac(String str)
+ {
/*
* Unicode contains special combining forms of the diacritic characters
* and we want to use these.
*/
- if(encoding != null && encoding.toUpperCase().startsWith("UTF")){
- Integer c = new Integer(a_str.charAt(0));
+ if(outputEncoding != null && outputEncoding.toUpperCase().startsWith("UTF"))
+ {
+ Integer c = new Integer(str.charAt(0));
// convert the characters not defined in the Unicode spec
- if(diacHash.containsKey(c)){
- return (String)diacHash.get(c);
+ if(DIACHASH.containsKey(c))
+ {
+ return (String)DIACHASH.get(c);
}
- else if (icu4j != null) {
- return icu4j.normalizeDiac(a_str);
+ else if (icu4j != null)
+ {
+ return icu4j.normalizeDiac(str);
}
- else {
- return a_str;
+ else
+ {
+ return str;
}
}
- else{
- return a_str;
+ else
+ {
+ return str;
}
}
}
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java?rev=798640&r1=798639&r2=798640&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java Tue Jul 28 18:25:32 2009
@@ -44,9 +44,12 @@
private String str;
private PDFont font;
private float fontSize;
- private int fontSizeInPt;
+ private int fontSizePt;
private float wordSpacing; // word spacing value, in display units
+ /**
+ * Constructor.
+ */
protected TextPosition()
{
@@ -64,7 +67,7 @@
* @param string The character to be displayed.
* @param currentFont The current font for this text position.
* @param fontSizeValue The new font size.
- * @param fontSizeValue The font size in pt units.
+ * @param fontSizeInPt The font size in pt units.
* @param ws The word spacing parameter (in display units)
*/
public TextPosition(
@@ -89,7 +92,9 @@
this.rot = page.findRotation();
// make sure it is 0 to 270 and no negative numbers
if(this.rot < 0)
+ {
rot += 360;
+ }
this.maxTextHeight = maxFontH;
this.pageHeight = page.findMediaBox().getHeight();
@@ -100,7 +105,7 @@
this.str = string;
this.font = currentFont;
this.fontSize = fontSizeValue;
- this.fontSizeInPt = fontSizeInPt;
+ this.fontSizePt = fontSizeInPt;
this.wordSpacing = ws;
}
@@ -115,11 +120,12 @@
}
/**
- * Return the Matrix textPos stored in this object
+ * Return the Matrix textPos stored in this object.
*
* @return The Matrix containing all information about the starting text position
*/
- public Matrix getTextPos() {
+ public Matrix getTextPos()
+ {
return textPos;
}
@@ -128,7 +134,8 @@
* based on its text matrix.
* @return The direction of the text (0, 90, 180, or 270)
*/
- public float getDir() {
+ public float getDir()
+ {
float a = textPos.getValue(0,0);
float b = textPos.getValue(0,1);
float c = textPos.getValue(1,0);
@@ -137,20 +144,27 @@
// 12 0 left to right
// 0 12
if ((a > 0) && (Math.abs(b) < d) && (Math.abs(c) < a) && (d > 0))
+ {
return 0;
+ }
// -12 0 right to left (upside down)
// 0 -12
else if ((a < 0) && (Math.abs(b) < Math.abs(d)) && (Math.abs(c) < Math.abs(a)) && (d < 0))
+ {
return 180;
+ }
// 0 12 up
// -12 0
else if ((Math.abs(a) < Math.abs(c)) && (b > 0) && (c < 0) && (Math.abs(d) < b))
+ {
return 90;
+ }
// 0 -12 down
// 12 0
else if ((Math.abs(a) < c) && (b < 0) && (c > 0) && (Math.abs(d) < Math.abs(b)))
+ {
return 270;
-
+ }
return 0;
}
@@ -159,21 +173,28 @@
* the given rotation amount. The rotation adjusts where the 0,0
* location is relative to the text.
*
- * @param a_rot Rotation to apply (0, 90, 180, or 270). 0 will perform no adjustments.
+ * @param rotation Rotation to apply (0, 90, 180, or 270). 0 will perform no adjustments.
* @return X coordinate
*/
- private float getX_rot(float a_rot)
+ private float getXRot(float rotation)
{
- if (a_rot == 0)
+ if (rotation == 0)
+ {
return textPos.getValue(2,0);
- else if (a_rot == 90)
+ }
+ else if (rotation == 90)
+ {
return textPos.getValue(2,1);
- else if (a_rot == 180)
+ }
+ else if (rotation == 180)
+ {
return pageWidth - textPos.getValue(2,0);
- else if (a_rot == 270)
+ }
+ else if (rotation == 270)
+ {
return pageHeight - textPos.getValue(2,1);
- else
- return 0;
+ }
+ return 0;
}
/**
@@ -185,7 +206,7 @@
*/
public float getX()
{
- return getX_rot(rot);
+ return getXRot(rot);
}
/**
@@ -195,29 +216,37 @@
*
* @return The x coordinate of the text.
*/
- public float getXDirAdj() {
- return getX_rot(getDir());
+ public float getXDirAdj()
+ {
+ return getXRot(getDir());
}
/**
* This will get the y position of the character with 0,0 in lower left.
* This will be adjusted by the given rotation.
- * @param a_rot Rotation to apply to text to adjust the 0,0 location (0,90,180,270)
+ * @param rotation Rotation to apply to text to adjust the 0,0 location (0,90,180,270)
*
* @return The y coordinate of the text
*/
- private float getY_ll_rot(float a_rot)
+ private float getYLowerLeftRot(float rotation)
{
- if (a_rot == 0)
+ if (rotation == 0)
+ {
return textPos.getValue(2,1);
- else if (a_rot == 90)
+ }
+ else if (rotation == 90)
+ {
return pageWidth - textPos.getValue(2,0);
- else if (a_rot == 180)
+ }
+ else if (rotation == 180)
+ {
return pageHeight - textPos.getValue(2,1);
- else if (a_rot == 270)
+ }
+ else if (rotation == 270)
+ {
return textPos.getValue(2,0);
- else
- return 0;
+ }
+ return 0;
}
/**
@@ -229,9 +258,13 @@
public float getY()
{
if ((rot == 0) || (rot == 180))
- return pageHeight - getY_ll_rot(rot);
- else
- return pageWidth - getY_ll_rot(rot);
+ {
+ return pageHeight - getYLowerLeftRot(rot);
+ }
+ else
+ {
+ return pageWidth - getYLowerLeftRot(rot);
+ }
}
/**
@@ -245,9 +278,13 @@
float dir = getDir();
// some PDFBox code assumes that the 0,0 point is in upper left, not lower left
if ((dir == 0) || (dir == 180))
- return pageHeight - getY_ll_rot(dir);
+ {
+ return pageHeight - getYLowerLeftRot(dir);
+ }
else
- return pageWidth - getY_ll_rot(dir);
+ {
+ return pageWidth - getYLowerLeftRot(dir);
+ }
}
@@ -255,15 +292,17 @@
/**
* Get the length or width of the text, based on a given rotation.
*
- * @param a_rot Rotation that was used to determine coordinates (0,90,180,270)
+ * @param rotation Rotation that was used to determine coordinates (0,90,180,270)
* @return Width of text in display units
*/
- private float getWidth_rot(float a_rot)
+ private float getWidthRot(float rotation)
{
- if ((a_rot == 90) || (a_rot == 270)) {
+ if ((rotation == 90) || (rotation == 270))
+ {
return Math.abs(endY - textPos.getYPosition());
}
- else {
+ else
+ {
return Math.abs(endX - textPos.getXPosition());
}
}
@@ -273,8 +312,9 @@
*
* @return The width of the text in display units.
*/
- public float getWidth() {
- return getWidth_rot(rot);
+ public float getWidth()
+ {
+ return getWidthRot(rot);
}
/**
@@ -282,8 +322,9 @@
*
* @return The width of the text in display units.
*/
- public float getWidthDirAdj() {
- return getWidth_rot(getDir());
+ public float getWidthDirAdj()
+ {
+ return getWidthRot(getDir());
}
/**
@@ -291,7 +332,8 @@
*
* @return The maximum height of all characters in this string.
*/
- public float getHeight() {
+ public float getHeight()
+ {
return maxTextHeight;
}
@@ -300,7 +342,8 @@
*
* @return The maximum height of all characters in this string.
*/
- public float getHeightDir() {
+ public float getHeightDir()
+ {
// this is not really a rotation-dependent calculation, but this is defined for symmetry.
return maxTextHeight;
}
@@ -317,14 +360,14 @@
}
/**
- * This will get the font size in pt
+ * This will get the font size in pt.
* To get this size we have to multiply the pdf-fontsize and the scaling from the textmatrix
*
* @return The font size in pt.
*/
public float getFontSizeInPt()
{
- return fontSizeInPt;
+ return fontSizePt;
}
/**
@@ -414,7 +457,8 @@
/*
* No X overlap at all so return as soon as possible.
*/
- if(tp2Xend <= thisXstart || tp2Xstart >= thisXend){
+ if(tp2Xend <= thisXstart || tp2Xstart >= thisXend)
+ {
return false;
}
/*
@@ -423,19 +467,22 @@
* top of TextPosition
*/
if((tp2.getYDirAdj() + tp2.getHeightDir() < getYDirAdj()) ||
- (tp2.getYDirAdj() > getYDirAdj() + getHeightDir())){
+ (tp2.getYDirAdj() > getYDirAdj() + getHeightDir()))
+ {
return false;
}
/* We're going to calculate the percentage of overlap. If it's less
* than a 15% x-coordinate overlap then we'll return false because it's negligible.
* .15 was determined by trial and error in the regression test files.
*/
- else if((tp2Xstart > thisXstart) && (tp2Xend > thisXend)){
+ else if((tp2Xstart > thisXstart) && (tp2Xend > thisXend))
+ {
double overlap = thisXend - tp2Xstart;
double overlapPercent = overlap/getWidthDirAdj();
return (overlapPercent > .15);
}
- else if((tp2Xstart < thisXstart) && (tp2Xend < thisXend)){
+ else if((tp2Xstart < thisXstart) && (tp2Xend < thisXend))
+ {
double overlap = tp2Xend - thisXstart;
double overlapPercent = overlap/getWidthDirAdj();
return (overlapPercent > .15);
@@ -453,10 +500,12 @@
* @param diacritic TextPosition to merge into the current TextPosition.
* @param normalize Instance of TextNormalize class to be used to normalize diacritic
*/
- public void mergeDiacritic (TextPosition diacritic, TextNormalize normalize)
+ public void mergeDiacritic(TextPosition diacritic, TextNormalize normalize)
{
- if (diacritic.getCharacter().length() > 1)
+ if (diacritic.getCharacter().length() > 1)
+ {
return;
+ }
float diacXStart = diacritic.getXDirAdj();
float diacXEnd = diacXStart + diacritic.widths[0];
@@ -466,8 +515,8 @@
int strLen = str.length();
boolean wasAdded = false;
- for (int i = 0; i < strLen && wasAdded == false; i++) {
-
+ for (int i = 0; i < strLen && !wasAdded; i++)
+ {
float currCharXEnd = currCharXStart + widths[i];
/*
@@ -475,21 +524,26 @@
* the current character and the previous character. If no previous character,
* just append the diacritic after the current one.
*/
- if(diacXStart < currCharXStart && diacXEnd <= currCharXEnd){
- if(i == 0){
+ if(diacXStart < currCharXStart && diacXEnd <= currCharXEnd)
+ {
+ if(i == 0)
+ {
insertDiacritic(i, diacritic, normalize);
}
- else{
+ else
+ {
float distanceOverlapping1 = diacXEnd - currCharXStart;
float percentage1 = distanceOverlapping1/widths[i];
float distanceOverlapping2 = currCharXStart - diacXStart;
float percentage2 = distanceOverlapping2/widths[i-1];
- if(percentage1 >= percentage2){
+ if(percentage1 >= percentage2)
+ {
insertDiacritic(i, diacritic, normalize);
}
- else{
+ else
+ {
insertDiacritic(i-1, diacritic, normalize);
}
}
@@ -497,20 +551,23 @@
}
//diacritic completely covers this character and therefore we assume that
//this is the character the diacritic belongs to
- else if(diacXStart < currCharXStart && diacXEnd > currCharXEnd){
+ else if(diacXStart < currCharXStart && diacXEnd > currCharXEnd)
+ {
insertDiacritic(i, diacritic, normalize);
wasAdded = true;
}
//Otherwise, the diacritic modifies this character because it's completely
//contained by the character width
- else if(diacXStart >= currCharXStart && diacXEnd <= currCharXEnd) {
+ else if(diacXStart >= currCharXStart && diacXEnd <= currCharXEnd)
+ {
insertDiacritic(i, diacritic, normalize);
wasAdded = true;
}
/*
* Last character in the TextPosition so we add diacritic to the end
*/
- else if(diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == (strLen - 1)){
+ else if(diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == (strLen - 1))
+ {
insertDiacritic(i, diacritic, normalize);
wasAdded = true;
}
@@ -528,7 +585,8 @@
* @param diacritic The diacritic TextPosition
* @param normalize Instance of TextNormalize class to be used to normalize diacritic
*/
- private void insertDiacritic(int i, TextPosition diacritic, TextNormalize normalize){
+ private void insertDiacritic(int i, TextPosition diacritic, TextNormalize normalize)
+ {
/* we add the diacritic to the right or left of the character
* depending on the direction of the character. Note that this
* is only required because the text is currently stored in
@@ -545,13 +603,15 @@
if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
- || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
+ || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE))
+ {
buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
widths2[i] = 0;
buf.append(str.charAt(i));
widths2[i+1] = widths[i];
}
- else {
+ else
+ {
buf.append(str.charAt(i));
widths2[i] = widths[i];
buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
@@ -570,7 +630,8 @@
*
* @return True if the current character is a diacritic char.
*/
- public boolean isDiacritic() {
+ public boolean isDiacritic()
+ {
String cText = this.getCharacter();
return (cText.length() == 1 && (Character.getType(cText.charAt(0)) == Character.NON_SPACING_MARK
|| Character.getType(cText.charAt(0)) == Character.MODIFIER_SYMBOL
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java?rev=798640&r1=798639&r2=798640&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java Tue Jul 28 18:25:32 2009
@@ -40,10 +40,14 @@
/* Only compare text that is in the same direction. */
if (pos1.getDir() < pos2.getDir())
- return -1;
+ {
+ return -1;
+ }
else if (pos1.getDir() > pos2.getDir())
- return 1;
-
+ {
+ return 1;
+ }
+
// Get the text direction adjusted coordinates
float x1 = pos1.getXDirAdj();
float x2 = pos2.getXDirAdj();