You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@directory.apache.org by el...@apache.org on 2006/12/26 09:37:25 UTC
svn commit: r490270 [4/4] - in
/directory/sandbox/elecharny/trunks/shared/ldap/src/main:
java/org/apache/directory/shared/ldap/schema/
java/org/apache/directory/shared/ldap/util/unicode/ resources/
resources/org/ resources/org/apache/ resources/org/apa...
Added: directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/InvalidCharacterException.java
URL: http://svn.apache.org/viewvc/directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/InvalidCharacterException.java?view=auto&rev=490270
==============================================================================
--- directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/InvalidCharacterException.java (added)
+++ directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/InvalidCharacterException.java Tue Dec 26 00:37:23 2006
@@ -0,0 +1,36 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License. For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.directory.shared.ldap.util.unicode;
+
+import java.io.IOException;
+
+/**
+ * Thrown when an invalid character (code point) is encountered while
+ * processing text. The offending value is reported, in hexadecimal, by
+ * {@link #getMessage()}.
+ */
+public class InvalidCharacterException
+    extends IOException {
+
+    private static final long serialVersionUID = -7150645484748059676L;
+
+    /** The offending code point, exactly as handed to the constructor. */
+    private final int input;
+
+    /**
+     * Creates an exception describing the given code point.
+     *
+     * @param input the code point that was rejected
+     */
+    public InvalidCharacterException(int input) {
+        this.input = input;
+    }
+
+    /**
+     * Builds the message from the stored code point.
+     *
+     * @return {@code "Invalid Character 0x"} followed by the lower-case
+     *         hexadecimal representation of the code point
+     */
+    @Override
+    public String getMessage() {
+        return "Invalid Character 0x" + Integer.toHexString(input);
+    }
+}
\ No newline at end of file
Added: directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/Normalizer.java
URL: http://svn.apache.org/viewvc/directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/Normalizer.java?view=auto&rev=490270
==============================================================================
--- directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/Normalizer.java (added)
+++ directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/Normalizer.java Tue Dec 26 00:37:23 2006
@@ -0,0 +1,175 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License. For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.directory.shared.ldap.util.unicode;
+
+import java.io.IOException;
+
+/**
+ * Performs Unicode Normalization (Forms D, C, KD and KC).
+ * <p>
+ * Normalization is done in two passes over the output buffer: a decomposition
+ * pass that also orders characters by canonical class, followed (for the
+ * composing forms only) by a pairwise recomposition pass. All character data
+ * is obtained from {@link UnicodeCharacterDatabase}.
+ */
+public final class Normalizer {
+
+    /** Features that distinguish the normalization forms from one another. */
+    public enum Mask {
+        NONE,
+        COMPATIBILITY,
+        COMPOSITION
+    }
+
+    /** The four normalization forms, each described by the features it enables. */
+    public enum Form {
+        D,
+        C(Mask.COMPOSITION),
+        KD(Mask.COMPATIBILITY),
+        KC(Mask.COMPATIBILITY, Mask.COMPOSITION);
+
+        // Explicit flags instead of a bit mask derived from Mask.ordinal():
+        // ordinals only work as bit flags by accident and would break as soon
+        // as a constant is added to, or reordered in, Mask.
+        private final boolean compatibility;
+        private final boolean composition;
+
+        Form(Mask... masks) {
+            boolean compat = false;
+            boolean comp = false;
+            for (Mask m : masks) {
+                if (m == Mask.COMPATIBILITY) {
+                    compat = true;
+                } else if (m == Mask.COMPOSITION) {
+                    comp = true;
+                }
+            }
+            this.compatibility = compat;
+            this.composition = comp;
+        }
+
+        /** @return true for the compatibility forms (KD and KC) */
+        public boolean isCompatibility() {
+            return compatibility;
+        }
+
+        /** @return true for the canonical forms (D and C) */
+        public boolean isCanonical() {
+            return !isCompatibility();
+        }
+
+        /** @return true for the forms that recompose after decomposing (C and KC) */
+        public boolean isComposition() {
+            return composition;
+        }
+    }
+
+    private Normalizer() {}
+
+    /**
+     * Normalize the string using NFKC.
+     *
+     * @param source the text to normalize
+     * @return a new builder holding the normalized text
+     * @throws IOException if the text cannot be read as code points or the
+     *         character database is unavailable
+     */
+    public static StringBuilder normalize(String source) throws IOException {
+        return normalize(source, Form.KC);
+    }
+
+    /**
+     * Normalize the string using the specified Form.
+     *
+     * @param source the text to normalize
+     * @param form the normalization form to apply
+     * @return a new builder holding the normalized text
+     * @throws IOException if the text cannot be read as code points or the
+     *         character database is unavailable
+     */
+    public static StringBuilder normalize(
+        String source,
+        Form form)
+            throws IOException {
+        return normalize(source, form, new StringBuilder());
+    }
+
+    /**
+     * Normalize the string into the given StringBuilder using the given Form.
+     * <p>
+     * The whole builder is processed, so anything it already contains takes
+     * part in the reordering and composition passes; pass an empty builder to
+     * normalize {@code source} on its own.
+     *
+     * @param source the text to normalize; an empty string leaves {@code buf} untouched
+     * @param form the normalization form to apply
+     * @param buf the builder receiving the result
+     * @return {@code buf}
+     * @throws IOException if the text cannot be read as code points, or if the
+     *         character database could not be loaded (previously this case
+     *         silently produced no output at all)
+     */
+    public static StringBuilder normalize(
+        String source,
+        Form form,
+        StringBuilder buf)
+            throws IOException {
+        if (source.length() == 0) {
+            return buf;
+        }
+        UnicodeCharacterDatabase ucd = UnicodeCharacterDatabase.getInstance();
+        if (ucd == null) {
+            // Returning the untouched buffer here would hand callers an empty
+            // "normalized" value for every input; report the failure instead.
+            throw new IOException("Unicode character database is not available");
+        }
+        decompose(ucd, source, form, buf);
+        compose(ucd, form, buf);
+        return buf;
+    }
+
+    /**
+     * Appends the full decomposition of {@code source} to {@code buf}, placing
+     * every resulting code point at the position chosen by
+     * {@link #findInsertionPoint}.
+     */
+    private static void decompose(
+        UnicodeCharacterDatabase ucd,
+        String source,
+        Form form,
+        StringBuilder buf)
+            throws IOException {
+        // Scratch buffer reused for the decomposition of each source code point.
+        StringBuffer internal = new StringBuffer();
+        CodepointIterator ci = CodepointIterator.forCharSequence(source);
+        boolean canonical = form.isCanonical();
+        while (ci.hasNext()) {
+            int c = ci.next();
+            internal.setLength(0);
+            ucd.decompose(c, canonical, internal);
+            CodepointIterator ii = CodepointIterator.forCharSequence(internal);
+            while (ii.hasNext()) {
+                int ch = ii.next();
+                int i = findInsertionPoint(ucd, buf, ch);
+                buf.insert(i, CharUtils.toString(ch));
+            }
+        }
+    }
+
+    /**
+     * Finds the index in {@code buf} at which {@code c} has to be inserted so
+     * that characters stay ordered by canonical class.
+     * <p>
+     * A character of class 0 always goes to the end. Otherwise the buffer is
+     * walked backwards past every character whose class is greater than that
+     * of {@code c}.
+     *
+     * @return the insertion index, between 0 and {@code buf.length()}
+     */
+    private static int findInsertionPoint(
+        UnicodeCharacterDatabase ucd,
+        StringBuilder buf, int c) {
+        int cc = ucd.getCanonicalClass(c);
+        int i = buf.length();
+        if (cc != 0) {
+            while (i > 0) {
+                // Assumes CharUtils.charAt yields the whole code point ending
+                // at i-1 (i.e. it looks back from a low surrogate) - TODO confirm.
+                int prev = CharUtils.charAt(buf, i - 1);
+                if (ucd.getCanonicalClass(prev) <= cc) {
+                    break;
+                }
+                // Step over the character just examined. Stepping by the size
+                // of c instead skipped characters (or ran past index 0) when c
+                // was a supplementary code point.
+                i -= CharUtils.size(prev);
+            }
+        }
+        return i;
+    }
+
+    /**
+     * Recomposes {@code buf} in place by combining each character with the
+     * most recent class-0 character ("starter") whenever the database knows a
+     * composition for the pair. Does nothing for non-composing forms or an
+     * empty buffer.
+     */
+    private static void compose(
+        UnicodeCharacterDatabase ucd,
+        Form form,
+        StringBuilder buf)
+            throws IOException {
+        if (!form.isComposition() || buf.length() == 0) {
+            return;
+        }
+        int starterPos = 0;
+        int starter = CharUtils.charAt(buf, starterPos);
+        // Write position: everything before it is already-composed output.
+        int compPos = CharUtils.size(starter);
+        int lastClass = ucd.getCanonicalClass(starter);
+        if (lastClass != 0) {
+            // The text starts with a non-starter: use a class that (assuming
+            // real classes are below 256) fails the "lastClass < cc" test, so
+            // the next character is not combined with it.
+            lastClass = 256;
+        }
+        int oldLen = buf.length();
+        int c;
+        for (int readPos = compPos; readPos < buf.length(); readPos += CharUtils.size(c)) {
+            c = CharUtils.charAt(buf, readPos);
+            int cc = ucd.getCanonicalClass(c);
+            int composite = ucd.getPairComposition(starter, c);
+            if (composite != '\uFFFF' && (lastClass < cc || lastClass == 0)) {
+                // c merges into the current starter; the write position stays put.
+                CharUtils.setChar(buf, starterPos, composite);
+                starter = composite;
+            } else {
+                if (cc == 0) {
+                    // c becomes the starter that later characters combine with.
+                    starterPos = compPos;
+                    starter = c;
+                }
+                lastClass = cc;
+                CharUtils.setChar(buf, compPos, c);
+                if (buf.length() != oldLen) {
+                    // setChar changed the buffer length; shift the read
+                    // position by the same amount.
+                    readPos += buf.length() - oldLen;
+                    oldLen = buf.length();
+                }
+                compPos += CharUtils.size(c);
+            }
+        }
+        buf.setLength(compPos);
+    }
+
+    /**
+     * Regenerates the serialized character database by delegating to
+     * {@code UnicodeCharacterDatabase.main}. Arguments, when given, are
+     * forwarded unchanged; without arguments the historical default output
+     * path is used.
+     */
+    public static void main(String... args) throws Exception {
+        if (args.length > 0) {
+            UnicodeCharacterDatabase.main(args);
+        } else {
+            UnicodeCharacterDatabase.main("src/org/apache/abdera/util/unicode/data/ucd.res");
+        }
+    }
+}
\ No newline at end of file
Added: directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/RestrictedCodepointIterator.java
URL: http://svn.apache.org/viewvc/directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/RestrictedCodepointIterator.java?view=auto&rev=490270
==============================================================================
--- directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/RestrictedCodepointIterator.java (added)
+++ directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/RestrictedCodepointIterator.java Tue Dec 26 00:37:23 2006
@@ -0,0 +1,120 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License. For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.directory.shared.ldap.util.unicode;
+
+import java.util.BitSet;
+
+/**
+ * A CodepointIterator implementation that checks output against a BitSet.
+ * If the iterator is set to "scanning only", the iterator will return -1
+ * upon encountering a codepoint not in the set, otherwise the iterator
+ * will throw an InvalidCharacterException
+ */
+public class RestrictedCodepointIterator
+ extends FilterCodepointIterator {
+
+ private BitSet bitset;
+ private boolean scanningOnly = false;
+ private boolean notset = false;
+
+ protected RestrictedCodepointIterator(
+ CodepointIterator internal,
+ BitSet bitset) {
+ this(internal,bitset,false);
+ }
+
+ protected RestrictedCodepointIterator(
+ CodepointIterator internal,
+ BitSet bitset,
+ boolean scanningOnly) {
+ this(internal, bitset, scanningOnly, false);
+ }
+
+ protected RestrictedCodepointIterator(
+ CodepointIterator internal,
+ BitSet bitset,
+ boolean scanningOnly,
+ boolean notset) {
+ super(internal);
+ this.bitset = bitset;
+ this.scanningOnly = scanningOnly;
+ this.notset = notset;
+ }
+
+ public boolean hasNext() {
+ boolean b = super.hasNext();
+ if (scanningOnly) {
+ try {
+ int cp = peek(position());
+ if (b && cp != -1 && check(cp)) return false;
+ } catch (InvalidCharacterException e) { return false; }
+ }
+ return b;
+ }
+
+ @Override
+ public int next() throws InvalidCharacterException {
+ int cp = super.next();
+ if (cp != -1 && check(cp)) {
+ if (scanningOnly) {
+ position(position()-1);
+ return -1;
+ }
+ else throw new InvalidCharacterException(cp);
+ }
+ return cp;
+ }
+
+ private boolean check(int cp) {
+ return (!notset) ? !bitset.get(cp) : bitset.get(cp);
+ }
+
+ @Override
+ public char[] nextChars() throws InvalidCharacterException {
+ char[] chars = super.nextChars();
+ if (chars != null && chars.length > 0) {
+ if (chars.length == 1 && check(chars[0])) {
+ if (scanningOnly) {
+ position(position()-1);
+ return null;
+ }
+ else throw new InvalidCharacterException(chars[0]);
+ } else if (chars.length == 2) {
+ int cp = CharUtils.toCodePoint(chars);
+ if (check(cp)) {
+ if (scanningOnly) {
+ position(position()-2);
+ return null;
+ }
+ else throw new InvalidCharacterException(cp);
+ }
+ }
+ }
+ return chars;
+ }
+
+ public static void main(String... args) throws Exception {
+
+ ChainableBitSet set = new ChainableBitSet().set2('a','b','c');
+ char[] c = {'a','b','c',CharUtils.getHighSurrogate(0x10000),CharUtils.getLowSurrogate(0x10000)};
+
+ CodepointIterator ci = CodepointIterator.forCharArray(c);
+ RestrictedCodepointIterator rci = new RestrictedCodepointIterator(ci,set,false,true);
+ while(rci.hasNext()) System.out.println(rci.next());
+ }
+}
Added: directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/UnicodeCharacterDatabase.java
URL: http://svn.apache.org/viewvc/directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/UnicodeCharacterDatabase.java?view=auto&rev=490270
==============================================================================
--- directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/UnicodeCharacterDatabase.java (added)
+++ directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/UnicodeCharacterDatabase.java Tue Dec 26 00:37:23 2006
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. The ASF licenses this file to You
+ * under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. For additional information regarding
+ * copyright in this work, please see the NOTICE file in the top level
+ * directory of this distribution.
+ */
+package org.apache.directory.shared.ldap.util.unicode;
+
+
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+
+/**
+ * An implementation of the Unicode Character Database modeled after the
+ * sample normalization demo available at:
+ *
+ * http://www.unicode.org/unicode/reports/tr15/Normalizer.html
+ *
+ * for now, this has been implemented and tested against Unicode 3.2.0. We
+ * need to test is against Unicode 4.0.
+ */
+final class UnicodeCharacterDatabase implements Serializable, Cloneable
+{
+
+ private static final long serialVersionUID = 1596950870716625345L;
+
+ private static final String UCD = "org/apache/directory/shared/ldap/util/unicode/data/ucd.res";
+
+ private final HashMap<Integer, Integer> cc = new HashMap<Integer, Integer>();
+ private final HashMap<Integer, String> decompose = new HashMap<Integer, String>();
+ private final HashMap<Integer, Integer> compose = new HashMap<Integer, Integer>();
+ private final BitSet compatibility = new BitSet();
+ private final BitSet excluded = new BitSet();
+
+ private static UnicodeCharacterDatabase ucd = null;
+
+
+ public synchronized static UnicodeCharacterDatabase getInstance()
+ {
+ if ( ucd == null )
+ {
+ try
+ {
+ ucd = load();
+ }
+ catch ( Exception e )
+ {
+ }
+ }
+ return ucd;
+ }
+
+
+ UnicodeCharacterDatabase()
+ {
+ }
+
+
+ public int getCanonicalClass( int c )
+ {
+ return ( cc.containsKey( c ) ) ? cc.get( c ) : 0;
+ }
+
+
+ public boolean isComposite( int f, int s )
+ {
+ return !( f < 0 || f > 0x10FFFF || s < 0 || s > 0x10FFFF );
+ }
+
+
+ public char getPairComposition( int f, int s )
+ {
+ if ( f < 0 || s > 0x10FFFF || s < 0 || s > 0x10FFFF )
+ return '\uFFFF';
+ Integer i = compose.get( ( f << 16 ) | s );
+ return ( i != null ) ? ( char ) i.intValue() : '\uFFFF';
+ }
+
+
+ public void decompose( int c, boolean canonical, StringBuffer buf )
+ {
+ String d = decompose.get( c );
+ if ( d != null && !( canonical && compatibility.get( c ) ) )
+ {
+ for ( int i = 0; i < d.length(); ++i )
+ {
+ decompose( d.charAt( i ), canonical, buf );
+ }
+ }
+ else
+ CharUtils.append( buf, c );
+ }
+
+
+ public Object clone() throws CloneNotSupportedException
+ {
+ return super.clone();
+ }
+
+
+ public static UnicodeCharacterDatabase load() throws IOException, ClassNotFoundException
+ {
+ ClassLoader cl = Thread.currentThread().getContextClassLoader();
+ InputStream is = cl.getResourceAsStream( UCD );
+ GZIPInputStream gzip = new GZIPInputStream( is );
+ ObjectInputStream ois = new ObjectInputStream( gzip );
+ UnicodeCharacterDatabase ucd = ( UnicodeCharacterDatabase ) ois.readObject();
+ ois.close();
+ gzip.close();
+ is.close();
+ return ucd;
+ }
+
+
+ private static void save( UnicodeCharacterDatabase ucd, String to ) throws IOException
+ {
+ FileOutputStream fos = new FileOutputStream( to );
+ GZIPOutputStream gzip = new GZIPOutputStream( fos );
+ ObjectOutputStream oos = new ObjectOutputStream( gzip );
+ oos.writeObject( ucd );
+ oos.close();
+ gzip.close();
+ fos.close();
+ }
+
+ private static String base;
+ private static String version;
+
+
+ /**
+ * Load the Unicode Character Database from the source files and save as
+ * a gzip compressed, serialized Java class.
+ */
+ public static void main( String... args ) throws Exception
+ {
+ if ( args.length == 0 )
+ {
+ usage();
+ }
+
+ base = ( args.length > 1 ) ? args[1] : UCD;
+ version = ( args.length > 2 ) ? args[2] : "3.2.0";
+ UnicodeCharacterDatabase ucd = UnicodeCharacterDatabase.getInstance();
+
+ if ( ucd == null )
+ {
+ ucd = new UnicodeCharacterDatabase();
+ Loader.load( ucd );
+ }
+
+ save( ucd, args[0] );
+ }
+
+
+ private static void usage()
+ {
+ System.out
+ .println( "Usage:\n java -cp $CLASSPATH com.ibm.usmall.UnicodeCharacterDatabase $filename $datapath" );
+ System.exit( 0 );
+ }
+
+ private static class Loader
+ {
+
+ private static final String EXCLUSIONS = "CompositionExclusions";
+ private static final String UNICODEDATA = "UnicodeData";
+
+
+ // private static final String EXCLUSIONS =
+ // "org/apache/abdera/util/unicode/data/CompositionExclusions-3.2.0.txt";
+ //
+ // private static final String UNICODEDATA =
+ // "org/apache/abdera/util/unicode/data/UnicodeData-3.2.0.txt";
+
+ static String filename( String target )
+ {
+ return base + ( !base.endsWith( "/" ) ? "/" : "" ) + target + "-" + version + ".txt";
+ }
+
+
+ static void load( UnicodeCharacterDatabase ucd ) throws IOException
+ {
+ exclusions( ucd );
+ decomposition( ucd );
+ }
+
+
+ static String stripcomments( String s )
+ {
+ int n = s.indexOf( '#' );
+ return ( n != -1 ) ? s.substring( 0, n ) : s;
+ }
+
+
+ static void exclusions( UnicodeCharacterDatabase ucd ) throws IOException
+ {
+ BufferedReader r = read( filename( EXCLUSIONS ) );
+ String line = null;
+ while ( ( line = r.readLine() ) != null )
+ {
+ line = stripcomments( line );
+ if ( line.length() == 0 )
+ continue;
+ int v = Integer.parseInt( line.trim(), 16 );
+ ucd.excluded.set( v );
+ }
+ r.close();
+ }
+
+
+ static String dehex( String t )
+ {
+ String[] ts = t.split( " " );
+ StringBuffer buf = new StringBuffer();
+ for ( String token : ts )
+ {
+ if ( token.charAt( 0 ) != '<' )
+ {
+ int n = Integer.parseInt( token.trim(), 16 );
+ buf.append( ( char ) n );
+ }
+ }
+ return buf.toString();
+ }
+
+
+ static void decomposition( UnicodeCharacterDatabase ucd ) throws IOException
+ {
+ BufferedReader r = read( filename( UNICODEDATA ) );
+ String line = null;
+ while ( ( line = r.readLine() ) != null )
+ {
+ line = stripcomments( line );
+ if ( line.length() == 0 )
+ continue;
+ String[] tokens = line.split( ";" );
+ int val = Integer.parseInt( tokens[0], 16 );
+ int cc = Integer.parseInt( tokens[3] );
+ ucd.cc.put( val, cc );
+ String decomp = tokens[5];
+ if ( decomp.length() != 0 )
+ {
+ if ( decomp.startsWith( "<" ) )
+ {
+ ucd.compatibility.set( val );
+ }
+ decomp = dehex( decomp );
+ ucd.decompose.put( val, decomp );
+ if ( !ucd.compatibility.get( val ) && !ucd.excluded.get( val ) )
+ {
+ char f = ( decomp.length() > 1 ) ? decomp.charAt( 0 ) : '\u0000';
+ char l = ( decomp.length() > 1 ) ? decomp.charAt( 1 ) : decomp.charAt( 0 );
+ ucd.compose.put( ( f << 16 ) | l, val );
+ }
+ }
+ }
+ hanguls( ucd );
+ r.close();
+ }
+
+
+ // Use the algorithm used in http://www.unicode.org/unicode/reports/tr15/NormalizerBuilder.java
+ static void hanguls( UnicodeCharacterDatabase ucd ) throws IOException
+ {
+ for ( int s = 0; s < 0x2BA4; ++s )
+ {
+ int t = s % 0x001C;
+ char f = ( t != 0 ) ? ( char ) ( 0xAC00 + s - t ) : ( char ) ( 0x1100 + s / 0x024C );
+ char e = ( t != 0 ) ? ( char ) ( 0x11A7 + t ) : ( char ) ( 0x1161 + ( s % 0x024C ) / 0x001C );
+ int pair = ( f << 16 ) | e;
+ int value = s + 0xAC00;
+ ucd.decompose.put( value, String.valueOf( f ) + e );
+ ucd.compose.put( pair, value );
+ }
+ }
+
+
+ static BufferedReader read( String f )
+ {
+ ClassLoader cl = Thread.currentThread().getContextClassLoader();
+ InputStream in = cl.getResourceAsStream( f );
+ InputStreamReader r = new InputStreamReader( in );
+ BufferedReader buf = new BufferedReader( r );
+ return buf;
+ }
+
+ }
+
+}