You are viewing a plain text version of this content. The canonical link for it is here.
Posted to fop-commits@xmlgraphics.apache.org by gm...@apache.org on 2004/09/06 20:03:12 UTC
cvs commit: xml-fop/src/java/org/apache/fop/tools/anttasks SerializeHyphPattern.java
gmazza 2004/09/06 11:03:12
Modified: . build.xml
src/java/org/apache/fop/layoutmgr LineLayoutManager.java
src/java/org/apache/fop/tools/anttasks
SerializeHyphPattern.java
Added: src/java/org/apache/fop/hyphenation ByteVector.java
CharVector.java Hyphen.java Hyphenation.java
HyphenationException.java HyphenationTree.java
Hyphenator.java PatternConsumer.java
PatternParser.java TernaryTree.java
Removed: src/java/org/apache/fop/layout/hyphenation ByteVector.java
CharVector.java Hyphen.java Hyphenation.java
HyphenationException.java HyphenationTree.java
Hyphenator.java PatternConsumer.java
PatternParser.java TernaryTree.java
Log:
Moved hyphenation package to org.apache.fop.hyphenation
Revision Changes Path
1.110 +0 -2 xml-fop/build.xml
Index: build.xml
===================================================================
RCS file: /home/cvs/xml-fop/build.xml,v
retrieving revision 1.109
retrieving revision 1.110
diff -u -r1.109 -r1.110
--- build.xml 14 Aug 2004 18:50:37 -0000 1.109
+++ build.xml 6 Sep 2004 18:03:12 -0000 1.110
@@ -792,8 +792,6 @@
<group title="Layout">
<package name="org.apache.fop.layoutmgr"/>
<package name="org.apache.fop.layoutmgr.*"/>
- <package name="org.apache.fop.layout"/>
- <package name="org.apache.fop.layout.*"/>
</group>
<group title="Area Tree">
<package name="org.apache.fop.area"/>
1.1 xml-fop/src/java/org/apache/fop/hyphenation/ByteVector.java
Index: ByteVector.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: ByteVector.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
import java.io.Serializable;
/**
* This class implements a simple byte vector with access to the
* underlying array.
*
* @author Carlos Villegas <ca...@uniscope.co.jp>
*/
public class ByteVector implements Serializable {
/**
* Capacity increment size
*/
private static final int DEFAULT_BLOCK_SIZE = 2048;
private int blockSize;
/**
* The encapsulated array
*/
private byte[] array;
/**
* Points to next free item
*/
private int n;
public ByteVector() {
this(DEFAULT_BLOCK_SIZE);
}
public ByteVector(int capacity) {
if (capacity > 0) {
blockSize = capacity;
} else {
blockSize = DEFAULT_BLOCK_SIZE;
}
array = new byte[blockSize];
n = 0;
}
public ByteVector(byte[] a) {
blockSize = DEFAULT_BLOCK_SIZE;
array = a;
n = 0;
}
public ByteVector(byte[] a, int capacity) {
if (capacity > 0) {
blockSize = capacity;
} else {
blockSize = DEFAULT_BLOCK_SIZE;
}
array = a;
n = 0;
}
public byte[] getArray() {
return array;
}
/**
* return number of items in array
*/
public int length() {
return n;
}
/**
* returns current capacity of array
*/
public int capacity() {
return array.length;
}
public void put(int index, byte val) {
array[index] = val;
}
public byte get(int index) {
return array[index];
}
/**
* This is to implement memory allocation in the array. Like malloc().
*/
public int alloc(int size) {
int index = n;
int len = array.length;
if (n + size >= len) {
byte[] aux = new byte[len + blockSize];
System.arraycopy(array, 0, aux, 0, len);
array = aux;
}
n += size;
return index;
}
public void trimToSize() {
if (n < array.length) {
byte[] aux = new byte[n];
System.arraycopy(array, 0, aux, 0, n);
array = aux;
}
}
}
1.1 xml-fop/src/java/org/apache/fop/hyphenation/CharVector.java
Index: CharVector.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: CharVector.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
import java.io.Serializable;
/**
* This class implements a simple char vector with access to the
* underlying array.
*
* @author Carlos Villegas <ca...@uniscope.co.jp>
*/
public class CharVector implements Cloneable, Serializable {
/**
* Capacity increment size
*/
private static final int DEFAULT_BLOCK_SIZE = 2048;
private int blockSize;
/**
* The encapsulated array
*/
private char[] array;
/**
* Points to next free item
*/
private int n;
public CharVector() {
this(DEFAULT_BLOCK_SIZE);
}
public CharVector(int capacity) {
if (capacity > 0) {
blockSize = capacity;
} else {
blockSize = DEFAULT_BLOCK_SIZE;
}
array = new char[blockSize];
n = 0;
}
public CharVector(char[] a) {
blockSize = DEFAULT_BLOCK_SIZE;
array = a;
n = a.length;
}
public CharVector(char[] a, int capacity) {
if (capacity > 0) {
blockSize = capacity;
} else {
blockSize = DEFAULT_BLOCK_SIZE;
}
array = a;
n = a.length;
}
/**
* Reset Vector but don't resize or clear elements
*/
public void clear() {
n = 0;
}
public Object clone() {
CharVector cv = new CharVector((char[])array.clone(), blockSize);
cv.n = this.n;
return cv;
}
public char[] getArray() {
return array;
}
/**
* return number of items in array
*/
public int length() {
return n;
}
/**
* returns current capacity of array
*/
public int capacity() {
return array.length;
}
public void put(int index, char val) {
array[index] = val;
}
public char get(int index) {
return array[index];
}
public int alloc(int size) {
int index = n;
int len = array.length;
if (n + size >= len) {
char[] aux = new char[len + blockSize];
System.arraycopy(array, 0, aux, 0, len);
array = aux;
}
n += size;
return index;
}
public void trimToSize() {
if (n < array.length) {
char[] aux = new char[n];
System.arraycopy(array, 0, aux, 0, n);
array = aux;
}
}
}
1.1 xml-fop/src/java/org/apache/fop/hyphenation/Hyphen.java
Index: Hyphen.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: Hyphen.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
import java.io.Serializable;
/**
* This class represents a hyphen. A 'full' hyphen is made of 3 parts:
* the pre-break text, post-break text and no-break. If no line-break
* is generated at this position, the no-break text is used, otherwise,
* pre-break and post-break are used. Typically, pre-break is equal to
* the hyphen character and the others are empty. However, this general
* scheme allows support for cases in some languages where words change
* spelling if they're split across lines, like german's 'backen' which
* hyphenates 'bak-ken'. BTW, this comes from TeX.
*
* @author Carlos Villegas <ca...@uniscope.co.jp>
*/
public class Hyphen implements Serializable {
public String preBreak;
public String noBreak;
public String postBreak;
Hyphen(String pre, String no, String post) {
preBreak = pre;
noBreak = no;
postBreak = post;
}
Hyphen(String pre) {
preBreak = pre;
noBreak = null;
postBreak = null;
}
public String toString() {
if (noBreak == null
&& postBreak == null
&& preBreak != null
&& preBreak.equals("-")) {
return "-";
}
StringBuffer res = new StringBuffer("{");
res.append(preBreak);
res.append("}{");
res.append(postBreak);
res.append("}{");
res.append(noBreak);
res.append('}');
return res.toString();
}
}
1.1 xml-fop/src/java/org/apache/fop/hyphenation/Hyphenation.java
Index: Hyphenation.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: Hyphenation.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
/**
* This class represents a hyphenated word.
*
* @author Carlos Villegas <ca...@uniscope.co.jp>
*/
public class Hyphenation {
private int[] hyphenPoints;
private String word;
/**
* number of hyphenation points in word
*/
private int len;
/**
* rawWord as made of alternating strings and {@link Hyphen Hyphen}
* instances
*/
Hyphenation(String word, int[] points) {
this.word = word;
hyphenPoints = points;
len = points.length;
}
/**
* @return the number of hyphenation points in the word
*/
public int length() {
return len;
}
/**
* @return the pre-break text, not including the hyphen character
*/
public String getPreHyphenText(int index) {
return word.substring(0, hyphenPoints[index]);
}
/**
* @return the post-break text
*/
public String getPostHyphenText(int index) {
return word.substring(hyphenPoints[index]);
}
/**
* @return the hyphenation points
*/
public int[] getHyphenationPoints() {
return hyphenPoints;
}
public String toString() {
StringBuffer str = new StringBuffer();
int start = 0;
for (int i = 0; i < len; i++) {
str.append(word.substring(start, hyphenPoints[i]) + "-");
start = hyphenPoints[i];
}
str.append(word.substring(start));
return str.toString();
}
}
1.1 xml-fop/src/java/org/apache/fop/hyphenation/HyphenationException.java
Index: HyphenationException.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: HyphenationException.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
/**
* @author Carlos Villegas <ca...@uniscope.co.jp>
* (todo) Derive from FOPException
*/
public class HyphenationException extends Exception {
/**
* @see java.lang.Throwable#Throwable(String)
*/
public HyphenationException(String msg) {
super(msg);
}
}
1.1 xml-fop/src/java/org/apache/fop/hyphenation/HyphenationTree.java
Index: HyphenationTree.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: HyphenationTree.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
/**
* This tree structure stores the hyphenation patterns in an efficient
* way for fast lookup. It provides the provides the method to
* hyphenate a word.
*
* @author Carlos Villegas <ca...@uniscope.co.jp>
*/
public class HyphenationTree extends TernaryTree
implements PatternConsumer, Serializable {
/**
* value space: stores the inteletter values
*/
protected ByteVector vspace;
/**
* This map stores hyphenation exceptions
*/
protected HashMap stoplist;
/**
* This map stores the character classes
*/
protected TernaryTree classmap;
/**
* Temporary map to store interletter values on pattern loading.
*/
private transient TernaryTree ivalues;
public HyphenationTree() {
stoplist = new HashMap(23); // usually a small table
classmap = new TernaryTree();
vspace = new ByteVector();
vspace.alloc(1); // this reserves index 0, which we don't use
}
/**
* Packs the values by storing them in 4 bits, two values into a byte
* Values range is from 0 to 9. We use zero as terminator,
* so we'll add 1 to the value.
* @param values a string of digits from '0' to '9' representing the
* interletter values.
* @return the index into the vspace array where the packed values
* are stored.
*/
protected int packValues(String values) {
int i, n = values.length();
int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
int offset = vspace.alloc(m);
byte[] va = vspace.getArray();
for (i = 0; i < n; i++) {
int j = i >> 1;
byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f);
if ((i & 1) == 1) {
va[j + offset] = (byte)(va[j + offset] | v);
} else {
va[j + offset] = (byte)(v << 4); // big endian
}
}
va[m - 1 + offset] = 0; // terminator
return offset;
}
protected String unpackValues(int k) {
StringBuffer buf = new StringBuffer();
byte v = vspace.get(k++);
while (v != 0) {
char c = (char)((v >>> 4) - 1 + '0');
buf.append(c);
c = (char)(v & 0x0f);
if (c == 0) {
break;
}
c = (char)(c - 1 + '0');
buf.append(c);
v = vspace.get(k++);
}
return buf.toString();
}
/**
* Read hyphenation patterns from an XML file.
*/
public void loadPatterns(String filename) throws HyphenationException {
PatternParser pp = new PatternParser(this);
ivalues = new TernaryTree();
pp.parse(filename);
// patterns/values should be now in the tree
// let's optimize a bit
trimToSize();
vspace.trimToSize();
classmap.trimToSize();
// get rid of the auxiliary map
ivalues = null;
}
public String findPattern(String pat) {
int k = super.find(pat);
if (k >= 0) {
return unpackValues(k);
}
return "";
}
/**
* String compare, returns 0 if equal or
* t is a substring of s
*/
protected int hstrcmp(char[] s, int si, char[] t, int ti) {
for (; s[si] == t[ti]; si++, ti++) {
if (s[si] == 0) {
return 0;
}
}
if (t[ti] == 0) {
return 0;
}
return s[si] - t[ti];
}
protected byte[] getValues(int k) {
StringBuffer buf = new StringBuffer();
byte v = vspace.get(k++);
while (v != 0) {
char c = (char)((v >>> 4) - 1);
buf.append(c);
c = (char)(v & 0x0f);
if (c == 0) {
break;
}
c = (char)(c - 1);
buf.append(c);
v = vspace.get(k++);
}
byte[] res = new byte[buf.length()];
for (int i = 0; i < res.length; i++) {
res[i] = (byte)buf.charAt(i);
}
return res;
}
/**
* <p>Search for all possible partial matches of word starting
* at index an update interletter values. In other words, it
* does something like:</p>
* <code>
* for(i=0; i<patterns.length; i++) {
* if ( word.substring(index).startsWidth(patterns[i]) )
* update_interletter_values(patterns[i]);
* }
* </code>
* <p>But it is done in an efficient way since the patterns are
* stored in a ternary tree. In fact, this is the whole purpose
* of having the tree: doing this search without having to test
* every single pattern. The number of patterns for languages
* such as English range from 4000 to 10000. Thus, doing thousands
* of string comparisons for each word to hyphenate would be
* really slow without the tree. The tradeoff is memory, but
* using a ternary tree instead of a trie, almost halves the
* the memory used by Lout or TeX. It's also faster than using
* a hash table</p>
* @param word null terminated word to match
* @param index start index from word
* @param il interletter values array to update
*/
protected void searchPatterns(char[] word, int index, byte[] il) {
byte[] values;
int i = index;
char p, q;
char sp = word[i];
p = root;
while (p > 0 && p < sc.length) {
if (sc[p] == 0xFFFF) {
if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
values = getValues(eq[p]); // data pointer is in eq[]
int j = index;
for (int k = 0; k < values.length; k++) {
if (j < il.length && values[k] > il[j]) {
il[j] = values[k];
}
j++;
}
}
return;
}
int d = sp - sc[p];
if (d == 0) {
if (sp == 0) {
break;
}
sp = word[++i];
p = eq[p];
q = p;
// look for a pattern ending at this position by searching for
// the null char ( splitchar == 0 )
while (q > 0 && q < sc.length) {
if (sc[q] == 0xFFFF) { // stop at compressed branch
break;
}
if (sc[q] == 0) {
values = getValues(eq[q]);
int j = index;
for (int k = 0; k < values.length; k++) {
if (j < il.length && values[k] > il[j]) {
il[j] = values[k];
}
j++;
}
break;
} else {
q = lo[q];
/**
* actually the code should be:
* q = sc[q] < 0 ? hi[q] : lo[q];
* but java chars are unsigned
*/
}
}
} else {
p = d < 0 ? lo[p] : hi[p];
}
}
}
/**
* Hyphenate word and return a Hyphenation object.
* @param word the word to be hyphenated
* @param remainCharCount Minimum number of characters allowed
* before the hyphenation point.
* @param pushCharCount Minimum number of characters allowed after
* the hyphenation point.
* @return a {@link Hyphenation Hyphenation} object representing
* the hyphenated word or null if word is not hyphenated.
*/
public Hyphenation hyphenate(String word, int remainCharCount,
int pushCharCount) {
char[] w = word.toCharArray();
return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
}
/**
* w = "****nnllllllnnn*****",
* where n is a non-letter, l is a letter,
* all n may be absent, the first n is at offset,
* the first l is at offset + iIgnoreAtBeginning;
* word = ".llllll.'\0'***",
* where all l in w are copied into word.
* In the first part of the routine len = w.length,
* in the second part of the routine len = word.length.
* Three indices are used:
* index(w), the index in w,
* index(word), the index in word,
* letterindex(word), the index in the letter part of word.
* The following relations exist:
* index(w) = offset + i - 1
* index(word) = i - iIgnoreAtBeginning
* letterindex(word) = index(word) - 1
* (see first loop).
* It follows that:
* index(w) - index(word) = offset - 1 + iIgnoreAtBeginning
* index(w) = letterindex(word) + offset + iIgnoreAtBeginning
*/
/**
* Hyphenate word and return an array of hyphenation points.
* @param w char array that contains the word
* @param offset Offset to first character in word
* @param len Length of word
* @param remainCharCount Minimum number of characters allowed
* before the hyphenation point.
* @param pushCharCount Minimum number of characters allowed after
* the hyphenation point.
* @return a {@link Hyphenation Hyphenation} object representing
* the hyphenated word or null if word is not hyphenated.
*/
public Hyphenation hyphenate(char[] w, int offset, int len,
int remainCharCount, int pushCharCount) {
int i;
char[] word = new char[len + 3];
// normalize word
char[] c = new char[2];
int iIgnoreAtBeginning = 0;
int iLength = len;
boolean bEndOfLetters = false;
for (i = 1; i <= len; i++) {
c[0] = w[offset + i - 1];
int nc = classmap.find(c, 0);
if (nc < 0) { // found a non-letter character ...
if (i == (1 + iIgnoreAtBeginning)) {
// ... before any letter character
iIgnoreAtBeginning ++;
} else {
// ... after a letter character
bEndOfLetters = true;
}
iLength --;
} else {
if (!bEndOfLetters) {
word[i - iIgnoreAtBeginning] = (char)nc;
} else {
return null;
}
}
}
len = iLength;
if (len < (remainCharCount + pushCharCount)) {
// word is too short to be hyphenated
return null;
}
int[] result = new int[len + 1];
int k = 0;
// check exception list first
String sw = new String(word, 1, len);
if (stoplist.containsKey(sw)) {
// assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
ArrayList hw = (ArrayList)stoplist.get(sw);
int j = 0;
for (i = 0; i < hw.size(); i++) {
Object o = hw.get(i);
// j = index(sw) = letterindex(word)?
// result[k] = corresponding index(w)
if (o instanceof String) {
j += ((String)o).length();
if (j >= remainCharCount && j < (len - pushCharCount)) {
result[k++] = j + iIgnoreAtBeginning;
}
}
}
} else {
// use algorithm to get hyphenation points
word[0] = '.'; // word start marker
word[len + 1] = '.'; // word end marker
word[len + 2] = 0; // null terminated
byte[] il = new byte[len + 3]; // initialized to zero
for (i = 0; i < len + 1; i++) {
searchPatterns(word, i, il);
}
// hyphenation points are located where interletter value is odd
// i is letterindex(word),
// i + 1 is index(word),
// result[k] = corresponding index(w)
for (i = 0; i < len; i++) {
if (((il[i + 1] & 1) == 1) && i >= remainCharCount
&& i <= (len - pushCharCount)) {
result[k++] = i + iIgnoreAtBeginning;
}
}
}
if (k > 0) {
// trim result array
int[] res = new int[k];
System.arraycopy(result, 0, res, 0, k);
return new Hyphenation(new String(w, offset, len), res);
} else {
return null;
}
}
/**
* Add a character class to the tree. It is used by
* {@link PatternParser PatternParser} as callback to
* add character classes. Character classes define the
* valid word characters for hyphenation. If a word contains
* a character not defined in any of the classes, it is not hyphenated.
* It also defines a way to normalize the characters in order
* to compare them with the stored patterns. Usually pattern
* files use only lower case characters, in this case a class
* for letter 'a', for example, should be defined as "aA", the first
* character being the normalization char.
*/
public void addClass(String chargroup) {
if (chargroup.length() > 0) {
char equivChar = chargroup.charAt(0);
char[] key = new char[2];
key[1] = 0;
for (int i = 0; i < chargroup.length(); i++) {
key[0] = chargroup.charAt(i);
classmap.insert(key, 0, equivChar);
}
}
}
/**
* Add an exception to the tree. It is used by
* {@link PatternParser PatternParser} class as callback to
* store the hyphenation exceptions.
* @param word normalized word
* @param hyphenatedword a vector of alternating strings and
* {@link Hyphen hyphen} objects.
*/
public void addException(String word, ArrayList hyphenatedword) {
stoplist.put(word, hyphenatedword);
}
/**
* Add a pattern to the tree. Mainly, to be used by
* {@link PatternParser PatternParser} class as callback to
* add a pattern to the tree.
* @param pattern the hyphenation pattern
* @param ivalue interletter weight values indicating the
* desirability and priority of hyphenating at a given point
* within the pattern. It should contain only digit characters.
* (i.e. '0' to '9').
*/
public void addPattern(String pattern, String ivalue) {
int k = ivalues.find(ivalue);
if (k <= 0) {
k = packValues(ivalue);
ivalues.insert(ivalue, (char)k);
}
insert(pattern, (char)k);
}
public void printStats() {
System.out.println("Value space size = "
+ Integer.toString(vspace.length()));
super.printStats();
}
public static void main(String[] argv) throws Exception {
HyphenationTree ht = null;
int minCharCount = 2;
BufferedReader in =
new BufferedReader(new java.io.InputStreamReader(System.in));
while (true) {
System.out.print("l:\tload patterns from XML\n"
+ "L:\tload patterns from serialized object\n"
+ "s:\tset minimun character count\n"
+ "w:\twrite hyphenation tree to object file\n"
+ "h:\thyphenate\n"
+ "f:\tfind pattern\n"
+ "b:\tbenchmark\n"
+ "q:\tquit\n\n"
+ "Command:");
String token = in.readLine().trim();
if (token.equals("f")) {
System.out.print("Pattern: ");
token = in.readLine().trim();
System.out.println("Values: " + ht.findPattern(token));
} else if (token.equals("s")) {
System.out.print("Minimun value: ");
token = in.readLine().trim();
minCharCount = Integer.parseInt(token);
} else if (token.equals("l")) {
ht = new HyphenationTree();
System.out.print("XML file name: ");
token = in.readLine().trim();
ht.loadPatterns(token);
} else if (token.equals("L")) {
ObjectInputStream ois = null;
System.out.print("Object file name: ");
token = in.readLine().trim();
try {
ois = new ObjectInputStream(new FileInputStream(token));
ht = (HyphenationTree)ois.readObject();
} catch (Exception e) {
e.printStackTrace();
} finally {
if (ois != null) {
try {
ois.close();
} catch (IOException e) {
//ignore
}
}
}
} else if (token.equals("w")) {
System.out.print("Object file name: ");
token = in.readLine().trim();
ObjectOutputStream oos = null;
try {
oos = new ObjectOutputStream(new FileOutputStream(token));
oos.writeObject(ht);
} catch (Exception e) {
e.printStackTrace();
} finally {
if (oos != null) {
try {
oos.flush();
} catch (IOException e) {
//ignore
}
try {
oos.close();
} catch (IOException e) {
//ignore
}
}
}
} else if (token.equals("h")) {
System.out.print("Word: ");
token = in.readLine().trim();
System.out.print("Hyphenation points: ");
System.out.println(ht.hyphenate(token, minCharCount,
minCharCount));
} else if (token.equals("b")) {
if (ht == null) {
System.out.println("No patterns has been loaded.");
break;
}
System.out.print("Word list filename: ");
token = in.readLine().trim();
long starttime = 0;
int counter = 0;
try {
BufferedReader reader =
new BufferedReader(new FileReader(token));
String line;
starttime = System.currentTimeMillis();
while ((line = reader.readLine()) != null) {
// System.out.print("\nline: ");
Hyphenation hyp = ht.hyphenate(line, minCharCount,
minCharCount);
if (hyp != null) {
String hword = hyp.toString();
// System.out.println(line);
// System.out.println(hword);
} else {
// System.out.println("No hyphenation");
}
counter++;
}
} catch (Exception ioe) {
System.out.println("Exception " + ioe);
ioe.printStackTrace();
}
long endtime = System.currentTimeMillis();
long result = endtime - starttime;
System.out.println(counter + " words in " + result
+ " Millisekunden hyphenated");
} else if (token.equals("q")) {
break;
}
}
}
}
1.1 xml-fop/src/java/org/apache/fop/hyphenation/Hyphenator.java
Index: Hyphenator.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: Hyphenator.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.Hashtable;
/**
* This class is the main entry point to the hyphenation package.
* You can use only the static methods or create an instance.
*
* @author Carlos Villegas <ca...@uniscope.co.jp>
*/
public class Hyphenator {
/**@todo Don't use statics */
private static Hashtable hyphenTrees = new Hashtable();
private HyphenationTree hyphenTree = null;
private int remainCharCount = 2;
private int pushCharCount = 2;
private static boolean errorDump = false;
public Hyphenator(String lang, String country, int leftMin,
int rightMin) {
hyphenTree = getHyphenationTree(lang, country);
remainCharCount = leftMin;
pushCharCount = rightMin;
}
public static HyphenationTree getHyphenationTree(String lang,
String country) {
String key = lang;
// check whether the country code has been used
if (country != null && !country.equals("none")) {
key += "_" + country;
}
// first try to find it in the cache
if (hyphenTrees.containsKey(key)) {
return (HyphenationTree)hyphenTrees.get(key);
}
if (hyphenTrees.containsKey(lang)) {
return (HyphenationTree)hyphenTrees.get(lang);
}
HyphenationTree hTree = getFopHyphenationTree(key);
if (hTree == null) {
String hyphenDir = "/hyph";
if (hyphenDir != null) {
hTree = getUserHyphenationTree(key, hyphenDir);
}
}
// put it into the pattern cache
if (hTree != null) {
hyphenTrees.put(key, hTree);
} else {
/**@todo Proper logging please */
//log.error("Couldn't find hyphenation pattern "
// + key);
}
return hTree;
}
private static InputStream getResourceStream(String key) {
InputStream is = null;
// Try to use Context Class Loader to load the properties file.
try {
java.lang.reflect.Method getCCL =
Thread.class.getMethod("getContextClassLoader", new Class[0]);
if (getCCL != null) {
ClassLoader contextClassLoader =
(ClassLoader)getCCL.invoke(Thread.currentThread(),
new Object[0]);
is = contextClassLoader.getResourceAsStream("hyph/" + key
+ ".hyp");
}
} catch (Exception e) {
//ignore, fallback further down
}
if (is == null) {
is = Hyphenator.class.getResourceAsStream("/hyph/" + key
+ ".hyp");
}
return is;
}
public static HyphenationTree getFopHyphenationTree(String key) {
HyphenationTree hTree = null;
ObjectInputStream ois = null;
InputStream is = null;
try {
is = getResourceStream(key);
if (is == null) {
if (key.length() == 5) {
is = getResourceStream(key.substring(0, 2));
if (is != null) {
//log.error("Couldn't find hyphenation pattern "
// + key
// + "\nusing general language pattern "
// + key.substring(0, 2)
// + " instead.");
} else {
if (errorDump) {
//log.error("Couldn't find precompiled "
// + "fop hyphenation pattern "
// + key + ".hyp");
}
return null;
}
} else {
if (errorDump) {
//log.error("Couldn't find precompiled "
// + "fop hyphenation pattern "
// + key + ".hyp");
}
return null;
}
}
ois = new ObjectInputStream(is);
hTree = (HyphenationTree)ois.readObject();
} catch (Exception e) {
/**@todo proper logging please */
e.printStackTrace();
} finally {
if (ois != null) {
try {
ois.close();
} catch (IOException e) {
//log.error("can't close hyphenation object stream");
}
}
}
return hTree;
}
/**
* load tree from serialized file or xml file
* using configuration settings
*/
public static HyphenationTree getUserHyphenationTree(String key,
String hyphenDir) {
HyphenationTree hTree = null;
// I use here the following convention. The file name specified in
// the configuration is taken as the base name. First we try
// name + ".hyp" assuming a serialized HyphenationTree. If that fails
// we try name + ".xml", assumming a raw hyphenation pattern file.
// first try serialized object
File hyphenFile = new File(hyphenDir, key + ".hyp");
if (hyphenFile.exists()) {
ObjectInputStream ois = null;
try {
ois = new ObjectInputStream(new BufferedInputStream(
new FileInputStream(hyphenFile)));
hTree = (HyphenationTree)ois.readObject();
} catch (Exception e) {
/**@todo Proper logging please */
e.printStackTrace();
} finally {
if (ois != null) {
try {
ois.close();
} catch (IOException e) {
//ignore
}
}
}
return hTree;
} else {
// try the raw XML file
hyphenFile = new File(hyphenDir, key + ".xml");
if (hyphenFile.exists()) {
hTree = new HyphenationTree();
if (errorDump) {
//log.error("reading " + hyphenDir + key
// + ".xml");
}
try {
hTree.loadPatterns(hyphenFile.getPath());
if (errorDump) {
System.out.println("Stats: ");
hTree.printStats();
}
return hTree;
} catch (HyphenationException ex) {
if (errorDump) {
//log.error("Can't load user patterns "
// + "from xml file " + hyphenDir
// + key + ".xml");
}
return null;
}
} else {
if (errorDump) {
//log.error("Tried to load "
// + hyphenFile.toString()
// + "\nCannot find compiled nor xml file for "
// + "hyphenation pattern" + key);
}
return null;
}
}
}
public static Hyphenation hyphenate(String lang, String country,
String word, int leftMin,
int rightMin) {
HyphenationTree hTree = getHyphenationTree(lang, country);
if (hTree == null) {
//log.error("Error building hyphenation tree for language "
// + lang);
return null;
}
return hTree.hyphenate(word, leftMin, rightMin);
}
public static Hyphenation hyphenate(String lang, String country,
char[] word, int offset, int len,
int leftMin, int rightMin) {
HyphenationTree hTree = getHyphenationTree(lang, country);
if (hTree == null) {
//log.error("Error building hyphenation tree for language "
// + lang);
return null;
}
return hTree.hyphenate(word, offset, len, leftMin, rightMin);
}
public void setMinRemainCharCount(int min) {
remainCharCount = min;
}
public void setMinPushCharCount(int min) {
pushCharCount = min;
}
public void setLanguage(String lang, String country) {
hyphenTree = getHyphenationTree(lang, country);
}
public Hyphenation hyphenate(char[] word, int offset, int len) {
if (hyphenTree == null) {
return null;
}
return hyphenTree.hyphenate(word, offset, len, remainCharCount,
pushCharCount);
}
public Hyphenation hyphenate(String word) {
if (hyphenTree == null) {
return null;
}
return hyphenTree.hyphenate(word, remainCharCount, pushCharCount);
}
}
1.1 xml-fop/src/java/org/apache/fop/hyphenation/PatternConsumer.java
Index: PatternConsumer.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: PatternConsumer.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
import java.util.ArrayList;
/**
* This interface is used to connect the XML pattern file parser to
* the hyphenation tree.
*
* @author Carlos Villegas <ca...@uniscope.co.jp>
*/
public interface PatternConsumer {
/**
* Add a character class.
* A character class defines characters that are considered
* equivalent for the purpose of hyphenation (e.g. "aA"). It
* usually means to ignore case.
* @param chargroup character group
*/
void addClass(String chargroup);
/**
* Add a hyphenation exception. An exception replaces the
* result obtained by the algorithm for cases for which this
* fails or the user wants to provide his own hyphenation.
* A hyphenatedword is a vector of alternating String's and
* {@link Hyphen Hyphen} instances
*/
void addException(String word, ArrayList hyphenatedword);
/**
* Add hyphenation patterns.
* @param pattern the pattern
* @param values interletter values expressed as a string of
* digit characters.
*/
void addPattern(String pattern, String values);
}
1.1 xml-fop/src/java/org/apache/fop/hyphenation/PatternParser.java
Index: PatternParser.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: PatternParser.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
// SAX
import org.xml.sax.XMLReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.Attributes;
// Java
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.net.URL;
/**
* A SAX document handler to read and parse hyphenation patterns
* from a XML file.
*
* @author Carlos Villegas <ca...@uniscope.co.jp>
*/
public class PatternParser extends DefaultHandler implements PatternConsumer {
XMLReader parser;
int currElement;
PatternConsumer consumer;
StringBuffer token;
ArrayList exception;
char hyphenChar;
String errMsg;
static final int ELEM_CLASSES = 1;
static final int ELEM_EXCEPTIONS = 2;
static final int ELEM_PATTERNS = 3;
static final int ELEM_HYPHEN = 4;
public PatternParser() throws HyphenationException {
token = new StringBuffer();
parser = createParser();
parser.setContentHandler(this);
parser.setErrorHandler(this);
hyphenChar = '-'; // default
}
public PatternParser(PatternConsumer consumer)
throws HyphenationException {
this();
this.consumer = consumer;
}
public void setConsumer(PatternConsumer consumer) {
this.consumer = consumer;
}
public void parse(String filename) throws HyphenationException {
InputSource uri = fileInputSource(filename);
try {
parser.parse(uri);
} catch (SAXException e) {
throw new HyphenationException(errMsg);
} catch (IOException e) {
throw new HyphenationException(e.getMessage());
} catch (NullPointerException e) {
throw new HyphenationException("SAX parser not available");
}
}
/**
* creates a SAX parser, using the value of org.xml.sax.parser
* defaulting to org.apache.xerces.parsers.SAXParser
*
* @return the created SAX parser
*/
static XMLReader createParser() throws HyphenationException {
String parserClassName = System.getProperty("org.xml.sax.parser");
if (parserClassName == null) {
parserClassName = "org.apache.xerces.parsers.SAXParser";
}
// System.out.println("using SAX parser " + parserClassName);
try {
return (XMLReader)Class.forName(parserClassName).newInstance();
} catch (ClassNotFoundException e) {
throw new HyphenationException("Could not find "
+ parserClassName);
} catch (InstantiationException e) {
throw new HyphenationException("Could not instantiate "
+ parserClassName);
} catch (IllegalAccessException e) {
throw new HyphenationException("Could not access "
+ parserClassName);
} catch (ClassCastException e) {
throw new HyphenationException(parserClassName
+ " is not a SAX driver");
}
}
/**
* create an InputSource from a file name
*
* @param filename the name of the file
* @return the InputSource created
*/
protected static InputSource fileInputSource(String filename)
throws HyphenationException {
/* this code adapted from James Clark's in XT */
File file = new File(filename);
String path = file.getAbsolutePath();
String fSep = System.getProperty("file.separator");
if (fSep != null && fSep.length() == 1) {
path = path.replace(fSep.charAt(0), '/');
}
if (path.length() > 0 && path.charAt(0) != '/') {
path = '/' + path;
}
try {
return new InputSource(new URL("file", null, path).toString());
} catch (java.net.MalformedURLException e) {
throw new HyphenationException("unexpected MalformedURLException");
}
}
protected String readToken(StringBuffer chars) {
String word;
boolean space = false;
int i;
for (i = 0; i < chars.length(); i++) {
if (Character.isWhitespace(chars.charAt(i))) {
space = true;
} else {
break;
}
}
if (space) {
// chars.delete(0,i);
for (int countr = i; countr < chars.length(); countr++) {
chars.setCharAt(countr - i, chars.charAt(countr));
}
chars.setLength(chars.length() - i);
if (token.length() > 0) {
word = token.toString();
token.setLength(0);
return word;
}
}
space = false;
for (i = 0; i < chars.length(); i++) {
if (Character.isWhitespace(chars.charAt(i))) {
space = true;
break;
}
}
token.append(chars.toString().substring(0, i));
// chars.delete(0,i);
for (int countr = i; countr < chars.length(); countr++) {
chars.setCharAt(countr - i, chars.charAt(countr));
}
chars.setLength(chars.length() - i);
if (space) {
word = token.toString();
token.setLength(0);
return word;
}
token.append(chars);
return null;
}
protected static String getPattern(String word) {
StringBuffer pat = new StringBuffer();
int len = word.length();
for (int i = 0; i < len; i++) {
if (!Character.isDigit(word.charAt(i))) {
pat.append(word.charAt(i));
}
}
return pat.toString();
}
protected ArrayList normalizeException(ArrayList ex) {
ArrayList res = new ArrayList();
for (int i = 0; i < ex.size(); i++) {
Object item = ex.get(i);
if (item instanceof String) {
String str = (String)item;
StringBuffer buf = new StringBuffer();
for (int j = 0; j < str.length(); j++) {
char c = str.charAt(j);
if (c != hyphenChar) {
buf.append(c);
} else {
res.add(buf.toString());
buf.setLength(0);
char[] h = new char[1];
h[0] = hyphenChar;
// we use here hyphenChar which is not necessarily
// the one to be printed
res.add(new Hyphen(new String(h), null, null));
}
}
if (buf.length() > 0) {
res.add(buf.toString());
}
} else {
res.add(item);
}
}
return res;
}
protected String getExceptionWord(ArrayList ex) {
StringBuffer res = new StringBuffer();
for (int i = 0; i < ex.size(); i++) {
Object item = ex.get(i);
if (item instanceof String) {
res.append((String)item);
} else {
if (((Hyphen)item).noBreak != null) {
res.append(((Hyphen)item).noBreak);
}
}
}
return res.toString();
}
protected static String getInterletterValues(String pat) {
StringBuffer il = new StringBuffer();
String word = pat + "a"; // add dummy letter to serve as sentinel
int len = word.length();
for (int i = 0; i < len; i++) {
char c = word.charAt(i);
if (Character.isDigit(c)) {
il.append(c);
i++;
} else {
il.append('0');
}
}
return il.toString();
}
//
// DocumentHandler methods
//
/**
* Start element.
*/
public void startElement(String uri, String local, String raw,
Attributes attrs) {
if (local.equals("hyphen-char")) {
String h = attrs.getValue("value");
if (h != null && h.length() == 1) {
hyphenChar = h.charAt(0);
}
} else if (local.equals("classes")) {
currElement = ELEM_CLASSES;
} else if (local.equals("patterns")) {
currElement = ELEM_PATTERNS;
} else if (local.equals("exceptions")) {
currElement = ELEM_EXCEPTIONS;
exception = new ArrayList();
} else if (local.equals("hyphen")) {
if (token.length() > 0) {
exception.add(token.toString());
}
exception.add(new Hyphen(attrs.getValue("pre"),
attrs.getValue("no"),
attrs.getValue("post")));
currElement = ELEM_HYPHEN;
}
token.setLength(0);
}
public void endElement(String uri, String local, String raw) {
if (token.length() > 0) {
String word = token.toString();
switch (currElement) {
case ELEM_CLASSES:
consumer.addClass(word);
break;
case ELEM_EXCEPTIONS:
exception.add(word);
exception = normalizeException(exception);
consumer.addException(getExceptionWord(exception),
(ArrayList)exception.clone());
break;
case ELEM_PATTERNS:
consumer.addPattern(getPattern(word),
getInterletterValues(word));
break;
case ELEM_HYPHEN:
// nothing to do
break;
}
if (currElement != ELEM_HYPHEN) {
token.setLength(0);
}
}
if (currElement == ELEM_HYPHEN) {
currElement = ELEM_EXCEPTIONS;
} else {
currElement = 0;
}
}
/**
* Characters.
*/
public void characters(char ch[], int start, int length) {
StringBuffer chars = new StringBuffer(length);
chars.append(ch, start, length);
String word = readToken(chars);
while (word != null) {
// System.out.println("\"" + word + "\"");
switch (currElement) {
case ELEM_CLASSES:
consumer.addClass(word);
break;
case ELEM_EXCEPTIONS:
exception.add(word);
exception = normalizeException(exception);
consumer.addException(getExceptionWord(exception),
(ArrayList)exception.clone());
exception.clear();
break;
case ELEM_PATTERNS:
consumer.addPattern(getPattern(word),
getInterletterValues(word));
break;
}
word = readToken(chars);
}
}
//
// ErrorHandler methods
//
/**
* Warning.
*/
public void warning(SAXParseException ex) {
errMsg = "[Warning] " + getLocationString(ex) + ": "
+ ex.getMessage();
}
/**
* Error.
*/
public void error(SAXParseException ex) {
errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage();
}
/**
* Fatal error.
*/
public void fatalError(SAXParseException ex) throws SAXException {
errMsg = "[Fatal Error] " + getLocationString(ex) + ": "
+ ex.getMessage();
throw ex;
}
/**
* Returns a string of the location.
*/
private String getLocationString(SAXParseException ex) {
StringBuffer str = new StringBuffer();
String systemId = ex.getSystemId();
if (systemId != null) {
int index = systemId.lastIndexOf('/');
if (index != -1) {
systemId = systemId.substring(index + 1);
}
str.append(systemId);
}
str.append(':');
str.append(ex.getLineNumber());
str.append(':');
str.append(ex.getColumnNumber());
return str.toString();
} // getLocationString(SAXParseException):String
// PatternConsumer implementation for testing purposes
public void addClass(String c) {
System.out.println("class: " + c);
}
public void addException(String w, ArrayList e) {
System.out.println("exception: " + w + " : " + e.toString());
}
public void addPattern(String p, String v) {
System.out.println("pattern: " + p + " : " + v);
}
public static void main(String[] args) throws Exception {
if (args.length > 0) {
PatternParser pp = new PatternParser();
pp.setConsumer(pp);
pp.parse(args[0]);
}
}
}
1.1 xml-fop/src/java/org/apache/fop/hyphenation/TernaryTree.java
Index: TernaryTree.java
===================================================================
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: TernaryTree.java,v 1.1 2004/09/06 18:03:12 gmazza Exp $ */
package org.apache.fop.hyphenation;
import java.util.Enumeration;
import java.util.Stack;
import java.io.Serializable;
/**
* <h2>Ternary Search Tree.</h2>
*
* <p>A ternary search tree is a hibrid between a binary tree and
* a digital search tree (trie). Keys are limited to strings.
* A data value of type char is stored in each leaf node.
* It can be used as an index (or pointer) to the data.
* Branches that only contain one key are compressed to one node
* by storing a pointer to the trailer substring of the key.
* This class is intended to serve as base class or helper class
* to implement Dictionary collections or the like. Ternary trees
* have some nice properties as the following: the tree can be
* traversed in sorted order, partial matches (wildcard) can be
* implemented, retrieval of all keys within a given distance
* from the target, etc. The storage requirements are higher than
* a binary tree but a lot less than a trie. Performance is
* comparable with a hash table, sometimes it outperforms a hash
* function (most of the time can determine a miss faster than a hash).</p>
*
* <p>The main purpose of this java port is to serve as a base for
* implementing TeX's hyphenation algorithm (see The TeXBook,
* appendix H). Each language requires from 5000 to 15000 hyphenation
* patterns which will be keys in this tree. The strings patterns
* are usually small (from 2 to 5 characters), but each char in the
* tree is stored in a node. Thus memory usage is the main concern.
* We will sacrify 'elegance' to keep memory requirenments to the
* minimum. Using java's char type as pointer (yes, I know pointer
* it is a forbidden word in java) we can keep the size of the node
* to be just 8 bytes (3 pointers and the data char). This gives
* room for about 65000 nodes. In my tests the english patterns
* took 7694 nodes and the german patterns 10055 nodes,
* so I think we are safe.</p>
*
* <p>All said, this is a map with strings as keys and char as value.
* Pretty limited!. It can be extended to a general map by
* using the string representation of an object and using the
* char value as an index to an array that contains the object
* values.</p>
*
* @author cav@uniscope.co.jp
*/
public class TernaryTree implements Cloneable, Serializable {
/**
* We use 4 arrays to represent a node. I guess I should have created
* a proper node class, but somehow Knuth's pascal code made me forget
* we now have a portable language with virtual memory management and
* automatic garbage collection! And now is kind of late, furthermore,
* if it ain't broken, don't fix it.
*/
/**
* Pointer to low branch and to rest of the key when it is
* stored directly in this node, we don't have unions in java!
*/
protected char[] lo;
/**
* Pointer to high branch.
*/
protected char[] hi;
/**
* Pointer to equal branch and to data when this node is a string terminator.
*/
protected char[] eq;
/**
* <P>The character stored in this node: splitchar.
* Two special values are reserved:</P>
* <ul><li>0x0000 as string terminator</li>
* <li>0xFFFF to indicate that the branch starting at
* this node is compressed</li></ul>
* <p>This shouldn't be a problem if we give the usual semantics to
* strings since 0xFFFF is garanteed not to be an Unicode character.</p>
*/
protected char[] sc;
/**
* This vector holds the trailing of the keys when the branch is compressed.
*/
protected CharVector kv;
protected char root;
protected char freenode;
protected int length; // number of items in tree
protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
TernaryTree() {
init();
}
protected void init() {
root = 0;
freenode = 1;
length = 0;
lo = new char[BLOCK_SIZE];
hi = new char[BLOCK_SIZE];
eq = new char[BLOCK_SIZE];
sc = new char[BLOCK_SIZE];
kv = new CharVector();
}
/**
* Branches are initially compressed, needing
* one node per key plus the size of the string
* key. They are decompressed as needed when
* another key with same prefix
* is inserted. This saves a lot of space,
* specially for long keys.
*/
public void insert(String key, char val) {
// make sure we have enough room in the arrays
int len = key.length()
+ 1; // maximum number of nodes that may be generated
if (freenode + len > eq.length) {
redimNodeArrays(eq.length + BLOCK_SIZE);
}
char strkey[] = new char[len--];
key.getChars(0, len, strkey, 0);
strkey[len] = 0;
root = insert(root, strkey, 0, val);
}
public void insert(char[] key, int start, char val) {
int len = strlen(key) + 1;
if (freenode + len > eq.length) {
redimNodeArrays(eq.length + BLOCK_SIZE);
}
root = insert(root, key, start, val);
}
/**
* The actual insertion function, recursive version.
*/
private char insert(char p, char[] key, int start, char val) {
int len = strlen(key, start);
if (p == 0) {
// this means there is no branch, this node will start a new branch.
// Instead of doing that, we store the key somewhere else and create
// only one node with a pointer to the key
p = freenode++;
eq[p] = val; // holds data
length++;
hi[p] = 0;
if (len > 0) {
sc[p] = 0xFFFF; // indicates branch is compressed
lo[p] = (char)kv.alloc(len
+ 1); // use 'lo' to hold pointer to key
strcpy(kv.getArray(), lo[p], key, start);
} else {
sc[p] = 0;
lo[p] = 0;
}
return p;
}
if (sc[p] == 0xFFFF) {
// branch is compressed: need to decompress
// this will generate garbage in the external key array
// but we can do some garbage collection later
char pp = freenode++;
lo[pp] = lo[p]; // previous pointer to key
eq[pp] = eq[p]; // previous pointer to data
lo[p] = 0;
if (len > 0) {
sc[p] = kv.get(lo[pp]);
eq[p] = pp;
lo[pp]++;
if (kv.get(lo[pp]) == 0) {
// key completly decompressed leaving garbage in key array
lo[pp] = 0;
sc[pp] = 0;
hi[pp] = 0;
} else {
// we only got first char of key, rest is still there
sc[pp] = 0xFFFF;
}
} else {
// In this case we can save a node by swapping the new node
// with the compressed node
sc[pp] = 0xFFFF;
hi[p] = pp;
sc[p] = 0;
eq[p] = val;
length++;
return p;
}
}
char s = key[start];
if (s < sc[p]) {
lo[p] = insert(lo[p], key, start, val);
} else if (s == sc[p]) {
if (s != 0) {
eq[p] = insert(eq[p], key, start + 1, val);
} else {
// key already in tree, overwrite data
eq[p] = val;
}
} else {
hi[p] = insert(hi[p], key, start, val);
}
return p;
}
/**
* Compares 2 null terminated char arrays
*/
public static int strcmp(char[] a, int startA, char[] b, int startB) {
for (; a[startA] == b[startB]; startA++, startB++) {
if (a[startA] == 0) {
return 0;
}
}
return a[startA] - b[startB];
}
/**
* Compares a string with null terminated char array
*/
public static int strcmp(String str, char[] a, int start) {
int i, d, len = str.length();
for (i = 0; i < len; i++) {
d = (int)str.charAt(i) - a[start + i];
if (d != 0) {
return d;
}
if (a[start + i] == 0) {
return d;
}
}
if (a[start + i] != 0) {
return (int)-a[start + i];
}
return 0;
}
public static void strcpy(char[] dst, int di, char[] src, int si) {
while (src[si] != 0) {
dst[di++] = src[si++];
}
dst[di] = 0;
}
public static int strlen(char[] a, int start) {
int len = 0;
for (int i = start; i < a.length && a[i] != 0; i++) {
len++;
}
return len;
}
public static int strlen(char[] a) {
return strlen(a, 0);
}
public int find(String key) {
int len = key.length();
char strkey[] = new char[len + 1];
key.getChars(0, len, strkey, 0);
strkey[len] = 0;
return find(strkey, 0);
}
public int find(char[] key, int start) {
int d;
char p = root;
int i = start;
char c;
while (p != 0) {
if (sc[p] == 0xFFFF) {
if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
return eq[p];
} else {
return -1;
}
}
c = key[i];
d = c - sc[p];
if (d == 0) {
if (c == 0) {
return eq[p];
}
i++;
p = eq[p];
} else if (d < 0) {
p = lo[p];
} else {
p = hi[p];
}
}
return -1;
}
public boolean knows(String key) {
return (find(key) >= 0);
}
// redimension the arrays
private void redimNodeArrays(int newsize) {
int len = newsize < lo.length ? newsize : lo.length;
char[] na = new char[newsize];
System.arraycopy(lo, 0, na, 0, len);
lo = na;
na = new char[newsize];
System.arraycopy(hi, 0, na, 0, len);
hi = na;
na = new char[newsize];
System.arraycopy(eq, 0, na, 0, len);
eq = na;
na = new char[newsize];
System.arraycopy(sc, 0, na, 0, len);
sc = na;
}
public int size() {
return length;
}
public Object clone() {
TernaryTree t = new TernaryTree();
t.lo = (char[])this.lo.clone();
t.hi = (char[])this.hi.clone();
t.eq = (char[])this.eq.clone();
t.sc = (char[])this.sc.clone();
t.kv = (CharVector)this.kv.clone();
t.root = this.root;
t.freenode = this.freenode;
t.length = this.length;
return t;
}
/**
* Recursively insert the median first and then the median of the
* lower and upper halves, and so on in order to get a balanced
* tree. The array of keys is assumed to be sorted in ascending
* order.
*/
protected void insertBalanced(String[] k, char[] v, int offset, int n) {
int m;
if (n < 1) {
return;
}
m = n >> 1;
insert(k[m + offset], v[m + offset]);
insertBalanced(k, v, offset, m);
insertBalanced(k, v, offset + m + 1, n - m - 1);
}
/**
* Balance the tree for best search performance
*/
public void balance() {
// System.out.print("Before root splitchar = "); System.out.println(sc[root]);
int i = 0, n = length;
String[] k = new String[n];
char[] v = new char[n];
Iterator iter = new Iterator();
while (iter.hasMoreElements()) {
v[i] = iter.getValue();
k[i++] = (String)iter.nextElement();
}
init();
insertBalanced(k, v, 0, n);
// With uniform letter distribution sc[root] should be around 'm'
// System.out.print("After root splitchar = "); System.out.println(sc[root]);
}
/**
* Each node stores a character (splitchar) which is part of
* some key(s). In a compressed branch (one that only contain
* a single string key) the trailer of the key which is not
* already in nodes is stored externally in the kv array.
* As items are inserted, key substrings decrease.
* Some substrings may completely disappear when the whole
* branch is totally decompressed.
* The tree is traversed to find the key substrings actually
* used. In addition, duplicate substrings are removed using
* a map (implemented with a TernaryTree!).
*
*/
public void trimToSize() {
// first balance the tree for best performance
balance();
// redimension the node arrays
redimNodeArrays(freenode);
// ok, compact kv array
CharVector kx = new CharVector();
kx.alloc(1);
TernaryTree map = new TernaryTree();
compact(kx, map, root);
kv = kx;
kv.trimToSize();
}
private void compact(CharVector kx, TernaryTree map, char p) {
int k;
if (p == 0) {
return;
}
if (sc[p] == 0xFFFF) {
k = map.find(kv.getArray(), lo[p]);
if (k < 0) {
k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
map.insert(kx.getArray(), k, (char)k);
}
lo[p] = (char)k;
} else {
compact(kx, map, lo[p]);
if (sc[p] != 0) {
compact(kx, map, eq[p]);
}
compact(kx, map, hi[p]);
}
}
public Enumeration keys() {
return new Iterator();
}
public class Iterator implements Enumeration {
/**
* current node index
*/
int cur;
/**
* current key
*/
String curkey;
private class Item implements Cloneable {
char parent;
char child;
public Item() {
parent = 0;
child = 0;
}
public Item(char p, char c) {
parent = p;
child = c;
}
public Object clone() {
return new Item(parent, child);
}
}
/**
* Node stack
*/
Stack ns;
/**
* key stack implemented with a StringBuffer
*/
StringBuffer ks;
public Iterator() {
cur = -1;
ns = new Stack();
ks = new StringBuffer();
rewind();
}
public void rewind() {
ns.removeAllElements();
ks.setLength(0);
cur = root;
run();
}
public Object nextElement() {
String res = new String(curkey);
cur = up();
run();
return res;
}
public char getValue() {
if (cur >= 0) {
return eq[cur];
}
return 0;
}
public boolean hasMoreElements() {
return (cur != -1);
}
/**
* traverse upwards
*/
private int up() {
Item i = new Item();
int res = 0;
if (ns.empty()) {
return -1;
}
if (cur != 0 && sc[cur] == 0) {
return lo[cur];
}
boolean climb = true;
while (climb) {
i = (Item)ns.pop();
i.child++;
switch (i.child) {
case 1:
if (sc[i.parent] != 0) {
res = eq[i.parent];
ns.push(i.clone());
ks.append(sc[i.parent]);
} else {
i.child++;
ns.push(i.clone());
res = hi[i.parent];
}
climb = false;
break;
case 2:
res = hi[i.parent];
ns.push(i.clone());
if (ks.length() > 0) {
ks.setLength(ks.length() - 1); // pop
}
climb = false;
break;
default:
if (ns.empty()) {
return -1;
}
climb = true;
break;
}
}
return res;
}
/**
* traverse the tree to find next key
*/
private int run() {
if (cur == -1) {
return -1;
}
boolean leaf = false;
while (true) {
// first go down on low branch until leaf or compressed branch
while (cur != 0) {
if (sc[cur] == 0xFFFF) {
leaf = true;
break;
}
ns.push(new Item((char)cur, '\u0000'));
if (sc[cur] == 0) {
leaf = true;
break;
}
cur = lo[cur];
}
if (leaf) {
break;
}
// nothing found, go up one node and try again
cur = up();
if (cur == -1) {
return -1;
}
}
// The current node should be a data node and
// the key should be in the key stack (at least partially)
StringBuffer buf = new StringBuffer(ks.toString());
if (sc[cur] == 0xFFFF) {
int p = lo[cur];
while (kv.get(p) != 0) {
buf.append(kv.get(p++));
}
}
curkey = buf.toString();
return 0;
}
}
public void printStats() {
System.out.println("Number of keys = " + Integer.toString(length));
System.out.println("Node count = " + Integer.toString(freenode));
// System.out.println("Array length = " + Integer.toString(eq.length));
System.out.println("Key Array length = "
+ Integer.toString(kv.length()));
/*
* for(int i=0; i<kv.length(); i++)
* if ( kv.get(i) != 0 )
* System.out.print(kv.get(i));
* else
* System.out.println("");
* System.out.println("Keys:");
* for(Enumeration enum = keys(); enum.hasMoreElements(); )
* System.out.println(enum.nextElement());
*/
}
public static void main(String[] args) throws Exception {
TernaryTree tt = new TernaryTree();
tt.insert("Carlos", 'C');
tt.insert("Car", 'r');
tt.insert("palos", 'l');
tt.insert("pa", 'p');
tt.trimToSize();
System.out.println((char)tt.find("Car"));
System.out.println((char)tt.find("Carlos"));
System.out.println((char)tt.find("alto"));
tt.printStats();
}
}
1.26 +2 -2 xml-fop/src/java/org/apache/fop/layoutmgr/LineLayoutManager.java
Index: LineLayoutManager.java
===================================================================
RCS file: /home/cvs/xml-fop/src/java/org/apache/fop/layoutmgr/LineLayoutManager.java,v
retrieving revision 1.25
retrieving revision 1.26
diff -u -r1.25 -r1.26
--- LineLayoutManager.java 5 Sep 2004 18:16:32 -0000 1.25
+++ LineLayoutManager.java 6 Sep 2004 18:03:12 -0000 1.26
@@ -24,8 +24,8 @@
import org.apache.fop.fo.Constants;
import org.apache.fop.fo.properties.CommonMarginBlock;
import org.apache.fop.fo.properties.CommonHyphenation;
-import org.apache.fop.layout.hyphenation.Hyphenation;
-import org.apache.fop.layout.hyphenation.Hyphenator;
+import org.apache.fop.hyphenation.Hyphenation;
+import org.apache.fop.hyphenation.Hyphenator;
import org.apache.fop.traits.BlockProps;
import org.apache.fop.area.LineArea;
import org.apache.fop.area.Resolveable;
1.3 +2 -2 xml-fop/src/java/org/apache/fop/tools/anttasks/SerializeHyphPattern.java
Index: SerializeHyphPattern.java
===================================================================
RCS file: /home/cvs/xml-fop/src/java/org/apache/fop/tools/anttasks/SerializeHyphPattern.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- SerializeHyphPattern.java 27 Feb 2004 17:56:04 -0000 1.2
+++ SerializeHyphPattern.java 6 Sep 2004 18:03:12 -0000 1.3
@@ -28,8 +28,8 @@
import org.apache.tools.ant.DirectoryScanner;
// FOP
-import org.apache.fop.layout.hyphenation.HyphenationTree;
-import org.apache.fop.layout.hyphenation.HyphenationException;
+import org.apache.fop.hyphenation.HyphenationTree;
+import org.apache.fop.hyphenation.HyphenationException;
/**
* SerializeHyphPattern
---------------------------------------------------------------------
To unsubscribe, e-mail: fop-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: fop-cvs-help@xml.apache.org