You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@directory.apache.org by el...@apache.org on 2006/12/26 09:37:25 UTC
svn commit: r490270 [4/4] - in /directory/sandbox/elecharny/trunks/shared/ldap/src/main: java/org/apache/directory/shared/ldap/schema/ java/org/apache/directory/shared/ldap/util/unicode/ resources/ resources/org/ resources/org/apache/ resources/org/apa...

Added: directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/InvalidCharacterException.java
URL: http://svn.apache.org/viewvc/directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/InvalidCharacterException.java?view=auto&rev=490270
==============================================================================
--- directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/InvalidCharacterException.java (added)
+++ directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/InvalidCharacterException.java Tue Dec 26 00:37:23 2006
@@ -0,0 +1,36 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.  For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.directory.shared.ldap.util.unicode;
+
+import java.io.IOException;
+
+/** I/O-level error that carries the codepoint which was rejected as invalid. */
+public class InvalidCharacterException extends IOException {
+  private static final long serialVersionUID = -7150645484748059676L;
+
+  /** The offending codepoint; getMessage() renders it in hexadecimal. */
+  private int input;
+
+  public InvalidCharacterException(int input) {
+    this.input = input;
+  }
+
+  @Override
+  public String getMessage() {
+    return "Invalid Character 0x".concat(Integer.toHexString(input));
+  }
+}
\ No newline at end of file

Added: directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/Normalizer.java
URL: http://svn.apache.org/viewvc/directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/Normalizer.java?view=auto&rev=490270
==============================================================================
--- directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/Normalizer.java (added)
+++ directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/Normalizer.java Tue Dec 26 00:37:23 2006
@@ -0,0 +1,175 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.  For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.directory.shared.ldap.util.unicode;
+
+import java.io.IOException;
+
+/**
+ * Performs Unicode Normalization (Form D,C,KD and KC)
+ */
+public final class Normalizer {
+
+  public enum Mask {   // flags that the Form constants below are built from
+    NONE,           // not passed to any Form constant
+    COMPATIBILITY,  // given to the compatibility forms KD and KC
+    COMPOSITION     // given to the composing forms C and KC
+  }
+  
+  /** The four Unicode normalization forms, each described by the Mask flags it enables. */
+  public enum Form {
+    D,
+    C(Mask.COMPOSITION),
+    KD(Mask.COMPATIBILITY),
+    KC(Mask.COMPATIBILITY,Mask.COMPOSITION);
+
+    // Flags are kept as booleans rather than OR-ing Mask.ordinal() values together:
+    // ordinals are positions, not bit masks, and would alias as soon as Mask grew.
+    private boolean compatibility = false;
+    private boolean composition = false;
+
+    Form(Mask... masks) {
+      for (Mask m : masks) {
+        if (m == Mask.COMPATIBILITY) compatibility = true;
+        else if (m == Mask.COMPOSITION) composition = true;
+      }
+    }
+
+    /** @return true for the compatibility forms (KD, KC) */
+    public boolean isCompatibility() { return compatibility; }
+    /** @return true for the canonical forms (D, C) */
+    public boolean isCanonical() { return !compatibility; }
+    /** @return true for the composing forms (C, KC) */
+    public boolean isComposition() { return composition; }
+  }
+  
+  private Normalizer() {}   // every member is static, so instantiation is blocked
+  
+  /**
+   * Normalize the string using NFKC, returning the result in a new StringBuilder
+   */
+  public static StringBuilder normalize(String source) throws IOException {
+    return normalize(source, Form.KC, new StringBuilder());
+  }
+  
+  /**
+   * Normalize the string using the specified Form.
+   * The result is accumulated in a freshly allocated StringBuilder.
+   */
+  public static StringBuilder normalize(String source, Form form)
+      throws IOException {
+    final StringBuilder target = new StringBuilder();
+    return normalize(source, form, target);
+  }
+  
+  /**
+   * Normalize the string into the given StringBuilder using the given Form.
+   * @return buf, to allow call chaining
+   * @throws IOException if the Unicode character database is not available
+   */
+  public static StringBuilder normalize(String source, Form form, StringBuilder buf)
+      throws IOException {
+    if (source.length() == 0) return buf;   // nothing to normalize
+    UnicodeCharacterDatabase ucd = UnicodeCharacterDatabase.getInstance();
+    // A missing database used to yield an empty result, silently discarding the input.
+    if (ucd == null) throw new IOException("Unicode character database is not available");
+    decompose(ucd, source, form, buf);
+    compose(ucd, form, buf);
+    return buf;
+  }
+  
+  /**
+   * Decomposes every codepoint of source and merges the pieces into buf, each at the
+   * index that findInsertionPoint selects for it.
+   */
+  private static void decompose(
+    UnicodeCharacterDatabase ucd, String source, Form form, StringBuilder buf)
+      throws IOException {
+    final StringBuffer scratch = new StringBuffer();   // reused for each source codepoint
+    final CodepointIterator sourcePoints = CodepointIterator.forCharSequence(source);
+    final boolean canonicalOnly = form.isCanonical();
+    while (sourcePoints.hasNext()) {
+      final int codepoint = sourcePoints.next();
+      scratch.setLength(0);
+      ucd.decompose(codepoint, canonicalOnly, scratch);
+      final CodepointIterator pieces = CodepointIterator.forCharSequence(scratch);
+      while (pieces.hasNext()) {
+        final int piece = pieces.next();
+        final int index = findInsertionPoint(ucd, buf, piece);
+        buf.insert(index, CharUtils.toString(piece));
+      }
+    }
+  }
+  
+  /** Returns the index in buf where c belongs, keeping combining marks ordered by canonical class. */
+  private static int findInsertionPoint(UnicodeCharacterDatabase ucd, StringBuilder buf, int c) {
+    int cc = ucd.getCanonicalClass(c);
+    int i = buf.length();
+    if (cc == 0) return i;   // class 0: always placed at the end
+    int ch;
+    // Step back by the width of the codepoint just passed (ch), not of the one being
+    // inserted (c); stepping by size(c) misaligns when BMP and supplementary marks mix.
+    for (; i > 0; i -= CharUtils.size(ch)) {
+      ch = CharUtils.charAt(buf, i-1);
+      if (ucd.getCanonicalClass(ch) <= cc) break;
+    }
+    return i;
+  }
+  
+  private static void compose(   // recombines the decomposed codepoints held in buf, in place
+    UnicodeCharacterDatabase ucd,
+    Form form, 
+    StringBuilder buf) 
+      throws IOException {
+    if (!form.isComposition()) return;   // non-composing forms are left as decomposed
+    int pos = 0;                          // index of the codepoint that compositions are folded into (lc)
+    int lc = CharUtils.charAt(buf, pos);  // NOTE(review): assumes buf is non-empty - confirm against callers
+    int cpos = CharUtils.size(lc);        // write cursor: end of the output produced so far
+    int lcc = ucd.getCanonicalClass(lc);
+    if (lcc != 0) lcc = 256;              // presumably above every real class so "lcc < cc" fails - TODO confirm
+    int len = buf.length();
+    int c;
+    for (int dpos = cpos; dpos < buf.length(); dpos += CharUtils.size(c)) {   // dpos: read cursor
+      c = CharUtils.charAt(buf,dpos);
+      int cc = ucd.getCanonicalClass(c);
+      int composite = ucd.getPairComposition(lc, c);   // '\uFFFF' means "no composition for this pair"
+      if (composite != '\uFFFF' && (lcc < cc || lcc == 0)) {
+        CharUtils.setChar(buf, pos, composite);   // fold c into the codepoint at pos
+        lc = composite;
+      } else {
+        if (cc == 0) {   // a class-0 codepoint becomes the new composition target
+          pos = cpos;
+          lc = c;
+        }
+        lcc = cc;
+        CharUtils.setChar(buf,cpos,c);   // keep c: copy it to the write cursor
+        if (buf.length() != len) {        // the write changed the buffer length: shift the read cursor too
+          dpos += buf.length() - len;
+          len = buf.length();
+        }
+        cpos += CharUtils.size(c);
+      }
+    }
+    buf.setLength(cpos);   // discard whatever lies beyond the write cursor
+  }
+  
+  public static void main(String... args) throws Exception {
+    // Delegates to UnicodeCharacterDatabase.main, which saves the database to the given file.
+    UnicodeCharacterDatabase.main("src/org/apache/abdera/util/unicode/data/ucd.res");
+    // NOTE(review): the path still names the Abdera source tree - confirm the intended location.
+  }
+}
\ No newline at end of file

Added: directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/RestrictedCodepointIterator.java
URL: http://svn.apache.org/viewvc/directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/RestrictedCodepointIterator.java?view=auto&rev=490270
==============================================================================
--- directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/RestrictedCodepointIterator.java (added)
+++ directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/RestrictedCodepointIterator.java Tue Dec 26 00:37:23 2006
@@ -0,0 +1,120 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.  For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.directory.shared.ldap.util.unicode;
+
+import java.util.BitSet;
+
+/**
+ * A CodepointIterator implementation that checks output against a BitSet.
+ * If the iterator is set to "scanning only", the iterator will return -1
+ * upon encountering a codepoint not in the set, otherwise the iterator 
+ * will throw an InvalidCharacterException
+ */
+public class RestrictedCodepointIterator 
+  extends FilterCodepointIterator {
+
+  private BitSet bitset;
+  private boolean scanningOnly = false;
+  private boolean notset = false;
+
+  /** Creates a throwing (non-scanning) iterator that rejects codepoints absent from bitset. */
+  protected RestrictedCodepointIterator(
+    CodepointIterator internal, 
+    BitSet bitset) {
+      this(internal, bitset, false, false);
+  }
+
+  /** As above, optionally in "scanning only" mode (stop quietly instead of throwing). */
+  protected RestrictedCodepointIterator(
+    CodepointIterator internal, 
+    BitSet bitset,
+    boolean scanningOnly) {
+      this(internal, bitset, scanningOnly, false);
+  }
+
+  /** @param notset when true the test is inverted: codepoints present in bitset are rejected */
+  protected RestrictedCodepointIterator(
+    CodepointIterator internal, BitSet bitset, boolean scanningOnly, boolean notset) {
+    super(internal);
+    this.notset = notset;
+    this.scanningOnly = scanningOnly;
+    this.bitset = bitset;
+  }
+
+  /** As the wrapped iterator, but in scanning-only mode a rejected upcoming codepoint ends iteration. */
+  public boolean hasNext() {
+    final boolean more = super.hasNext();
+    if (!scanningOnly) return more;
+    try {
+      final int upcoming = peek(position());
+      final boolean rejected = more && upcoming != -1 && check(upcoming);
+      return more && !rejected;
+    } catch (InvalidCharacterException e) { return false; }
+  }
+  
+  /** Returns the next codepoint; a rejected one throws, or in scanning-only mode yields -1. */
+  @Override
+  public int next() throws InvalidCharacterException {
+    int cp = super.next();
+    if (cp == -1 || !check(cp)) return cp;
+    if (!scanningOnly) throw new InvalidCharacterException(cp);
+    // Un-read the rejected codepoint by its full width, consistent with nextChars()
+    // (which steps back 2 for a surrogate pair); a fixed 1 stopped inside the pair.
+    // Assumes CharUtils.size(cp) is the char count of cp - TODO confirm.
+    position(position() - CharUtils.size(cp));
+    return -1;
+  }
+
+  private boolean check(int cp) {   // true means cp is rejected
+    return bitset.get(cp) == notset;
+  }
+  
+  /**
+   * Char-array counterpart of next(): returns the chars of the next codepoint.
+   * A rejected codepoint raises InvalidCharacterException, or, in scanning-only
+   * mode, is un-read and reported as null.
+   */
+  @Override
+  public char[] nextChars() throws InvalidCharacterException {
+    final char[] units = super.nextChars();
+    if (units == null) return units;
+
+    // One char is the codepoint itself; two chars are combined into one codepoint.
+    int codepoint;
+    switch (units.length) {
+      case 1:  codepoint = units[0]; break;
+      case 2:  codepoint = CharUtils.toCodePoint(units); break;
+      default: return units;   // any other length is passed through unchecked
+    }
+
+    if (!check(codepoint)) return units;
+    if (!scanningOnly) throw new InvalidCharacterException(codepoint);
+    position(position() - units.length);   // step back over the rejected char(s)
+    return null;
+  }
+ 
+  /** Ad-hoc manual check: runs a restricted iterator over 'a','b','c',U+10000 and prints what it yields. */
+  public static void main(String... args) throws Exception {
+    ChainableBitSet chars = new ChainableBitSet().set2('a','b','c');
+    char[] input = {'a','b','c',CharUtils.getHighSurrogate(0x10000),CharUtils.getLowSurrogate(0x10000)};
+    CodepointIterator source = CodepointIterator.forCharArray(input);
+    RestrictedCodepointIterator restricted =
+      new RestrictedCodepointIterator(source, chars, false, true);
+    while (restricted.hasNext()) System.out.println(restricted.next());
+  }
+}

Added: directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/UnicodeCharacterDatabase.java
URL: http://svn.apache.org/viewvc/directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/UnicodeCharacterDatabase.java?view=auto&rev=490270
==============================================================================
--- directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/UnicodeCharacterDatabase.java (added)
+++ directory/sandbox/elecharny/trunks/shared/ldap/src/main/unicode/UnicodeCharacterDatabase.java Tue Dec 26 00:37:23 2006
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  The ASF licenses this file to You
+ * under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.  For additional information regarding
+ * copyright in this work, please see the NOTICE file in the top level
+ * directory of this distribution.
+ */
+package org.apache.directory.shared.ldap.util.unicode;
+
+
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+
+/**
+ * An implementation of the Unicode Character Database modeled after the 
+ * sample normalization demo available at: 
+ * 
+ * http://www.unicode.org/unicode/reports/tr15/Normalizer.html
+ * 
+ * For now, this has been implemented and tested against Unicode 3.2.0.  It
+ * still needs to be tested against Unicode 4.0.
+ */
+final class UnicodeCharacterDatabase implements Serializable, Cloneable
+{
+
+    private static final long serialVersionUID = 1596950870716625345L;
+
+    private static final String UCD = "org/apache/directory/shared/ldap/util/unicode/data/ucd.res";
+
+    private final HashMap<Integer, Integer> cc = new HashMap<Integer, Integer>();
+    private final HashMap<Integer, String> decompose = new HashMap<Integer, String>();
+    private final HashMap<Integer, Integer> compose = new HashMap<Integer, Integer>();
+    private final BitSet compatibility = new BitSet();
+    private final BitSet excluded = new BitSet();
+
+    private static UnicodeCharacterDatabase ucd = null;
+
+
+    /** Returns the shared instance, loading it on first use; null when loading fails. */
+    public synchronized static UnicodeCharacterDatabase getInstance()
+    {
+        if ( ucd != null ) return ucd;
+        try
+        {
+            ucd = load();
+        }
+        catch ( Exception ignored )
+        {
+            // deliberately not propagated: callers check for a null result instead
+        }
+        return ucd;
+    }
+
+
+    UnicodeCharacterDatabase()   // creates an empty database; main() fills one via Loader.load
+    {
+    }
+
+
+    /** Returns the canonical combining class recorded for c, or 0 when c has no entry. */
+    public int getCanonicalClass( int c ) {
+        return cc.containsKey( c ) ? cc.get( c ).intValue() : 0;
+    }
+
+
+    /** True when f and s both lie in 0..0x10FFFF; the composition table is not consulted. */
+    public boolean isComposite( int f, int s ) {
+        return f >= 0 && f <= 0x10FFFF && s >= 0 && s <= 0x10FFFF;
+    }
+
+
+    /** Returns the composition recorded for the pair (f, s), or '\uFFFF' when there is none. */
+    public char getPairComposition( int f, int s )
+    {
+        // Both operands are range-checked; the upper bound of f was previously never tested.
+        if ( f < 0 || f > 0x10FFFF || s < 0 || s > 0x10FFFF ) return '\uFFFF';
+        Integer i = compose.get( ( f << 16 ) | s );
+        return ( i != null ) ? ( char ) i.intValue() : '\uFFFF';
+    }
+
+
+    /** Recursively appends the decomposition of c to buf (c itself when no applicable mapping exists). */
+    public void decompose( int c, boolean canonical, StringBuffer buf )
+    {
+        final String mapping = decompose.get( c );
+        // a compatibility mapping is ignored when only canonical decomposition is requested
+        if ( mapping == null || ( canonical && compatibility.get( c ) ) )
+        {
+            CharUtils.append( buf, c );
+            return;
+        }
+        for ( char unit : mapping.toCharArray() )
+            decompose( unit, canonical, buf );
+    }
+
+
+    /** Field-by-field copy made by Object.clone(); the maps and bit sets are shared, not copied. */
+    public Object clone() throws CloneNotSupportedException {
+        return super.clone();
+    }
+
+
+    /** Reads the gzip-compressed, serialized database from the classpath resource named by UCD. */
+    public static UnicodeCharacterDatabase load() throws IOException, ClassNotFoundException
+    {
+        InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream( UCD );
+        if ( is == null ) throw new IOException( "Resource not found: " + UCD );   // was a bare NPE
+        try
+        {
+            is = new GZIPInputStream( is );   // if this throws, 'is' still names the raw stream
+            return ( UnicodeCharacterDatabase ) new ObjectInputStream( is ).readObject();
+        }
+        finally { is.close(); }   // closes the outermost stream opened so far, on success and failure
+    }
+
+
+    private static void save( UnicodeCharacterDatabase ucd, String to ) throws IOException
+    {   // writes ucd as a gzip-compressed serialized object into the file named by 'to'
+        FileOutputStream fos = new FileOutputStream( to );
+        try {   // oos.close() flushes the object and gzip data through to the file
+            ObjectOutputStream oos = new ObjectOutputStream( new GZIPOutputStream( fos ) );
+            oos.writeObject( ucd );
+            oos.close();
+        }
+        finally { fos.close(); }   // releases the file handle when writing fails part-way
+    }
+
+    private static String base;
+    private static String version;
+
+
+    /**
+     * Saves the Unicode Character Database as a gzip compressed, serialized Java object.
+     * The database returned by getInstance() is used when available; otherwise it is
+     * rebuilt from the source files. Arguments: output file [, data path [, version]].
+     */
+    public static void main( String... args ) throws Exception
+    {
+        if ( args.length == 0 )
+            usage();
+        base = UCD;
+        if ( args.length > 1 )
+            base = args[1];
+        version = "3.2.0";
+        if ( args.length > 2 )
+            version = args[2];
+        UnicodeCharacterDatabase database = UnicodeCharacterDatabase.getInstance();
+        if ( database == null )
+        {
+            database = new UnicodeCharacterDatabase();
+            Loader.load( database );
+        }
+        save( database, args[0] );
+    }
+
+
+    /** Prints the command-line synopsis (naming this class, not a stale one) and exits the JVM. */
+    private static void usage()
+    {
+        System.out.println( "Usage:\n  java -cp $CLASSPATH " + UnicodeCharacterDatabase.class.getName() + " $filename [$datapath [$version]]" );
+        System.exit( 0 );
+    }
+
+    private static class Loader
+    {
+
+        private static final String EXCLUSIONS = "CompositionExclusions";
+        private static final String UNICODEDATA = "UnicodeData";
+
+
+        //    private static final String EXCLUSIONS = 
+        //      "org/apache/abdera/util/unicode/data/CompositionExclusions-3.2.0.txt";
+        //    
+        //    private static final String UNICODEDATA =
+        //      "org/apache/abdera/util/unicode/data/UnicodeData-3.2.0.txt";
+
+        /** Builds base + "/" + target + "-" + version + ".txt", adding the '/' only when base lacks it. */
+        static String filename( String target ) {
+            return base + ( base.endsWith( "/" ) ? "" : "/" ) + target + "-" + version + ".txt";
+        }
+
+
+        /** Fills ucd; exclusions are read first because decomposition() consults ucd.excluded. */
+        static void load( UnicodeCharacterDatabase ucd ) throws IOException {
+            exclusions( ucd );
+            decomposition( ucd );
+        }
+
+
+        /** Returns s cut off just before its first '#', or s unchanged when it contains none. */
+        static String stripcomments( String s ) {
+            final int hash = s.indexOf( '#' );
+            return ( hash == -1 ) ? s : s.substring( 0, hash );
+        }
+
+
+        /** Records every codepoint listed in the CompositionExclusions file in ucd.excluded. */
+        static void exclusions( UnicodeCharacterDatabase ucd ) throws IOException
+        {
+            BufferedReader r = read( filename( EXCLUSIONS ) );
+            try
+            {
+                for ( String line = r.readLine(); line != null; line = r.readLine() )
+                {
+                    line = stripcomments( line ).trim();   // trim before the emptiness test: blank lines are not numbers
+                    if ( line.length() != 0 ) ucd.excluded.set( Integer.parseInt( line, 16 ) );
+                }
+            }
+            finally { r.close(); }   // the reader is released even when a line fails to parse
+        }
+
+
+        /** Converts space-separated hexadecimal codepoints to a string, skipping "<tag>" tokens. */
+        static String dehex( String t )
+        {
+            StringBuffer buf = new StringBuffer();
+            for ( String token : t.split( " " ) )
+            {
+                // empty tokens come from doubled spaces; charAt(0) on them used to throw
+                if ( token.length() == 0 || token.charAt( 0 ) == '<' )
+                    continue;
+                // appendCodePoint emits a surrogate pair above U+FFFF, where a (char) cast truncated
+                buf.appendCodePoint( Integer.parseInt( token.trim(), 16 ) );
+            }
+            return buf.toString();
+        }
+
+
+        static void decomposition( UnicodeCharacterDatabase ucd ) throws IOException   // parses the UnicodeData file into ucd
+        {
+            BufferedReader r = read( filename( UNICODEDATA ) );
+            String line = null;
+            while ( ( line = r.readLine() ) != null )
+            {
+                line = stripcomments( line );
+                if ( line.length() == 0 )
+                    continue;
+                String[] tokens = line.split( ";" );          // one ';'-separated record per line
+                int val = Integer.parseInt( tokens[0], 16 );  // field 0: the codepoint, in hex
+                int cc = Integer.parseInt( tokens[3] );       // field 3: canonical class, in decimal
+                ucd.cc.put( val, cc );
+                String decomp = tokens[5];                    // field 5: decomposition mapping, may be empty
+                if ( decomp.length() != 0 )
+                {
+                    if ( decomp.startsWith( "<" ) )           // a leading "<tag>" marks a compatibility mapping
+                    {
+                        ucd.compatibility.set( val );
+                    }
+                    decomp = dehex( decomp );
+                    ucd.decompose.put( val, decomp );
+                    if ( !ucd.compatibility.get( val ) && !ucd.excluded.get( val ) )   // only these may be recomposed
+                    {
+                        char f = ( decomp.length() > 1 ) ? decomp.charAt( 0 ) : '\u0000';   // single-char mapping: first half is U+0000
+                        char l = ( decomp.length() > 1 ) ? decomp.charAt( 1 ) : decomp.charAt( 0 );
+                        ucd.compose.put( ( f << 16 ) | l, val );   // same key layout that getPairComposition looks up
+                    }
+                }
+            }
+            hanguls( ucd );   // adds the algorithmically generated Hangul entries
+            r.close();        // NOTE(review): not reached when parsing throws, so the reader can leak
+        }
+
+
+        // Use the algorithm used in http://www.unicode.org/unicode/reports/tr15/NormalizerBuilder.java
+        static void hanguls( UnicodeCharacterDatabase ucd ) throws IOException
+        {
+            for ( int index = 0; index < 0x2BA4; ++index )   // one generated entry per value 0xAC00 + index
+            {
+                final int trailing = index % 0x001C;
+                final boolean hasTrailing = trailing != 0;
+                // non-zero remainder: pair is (entry without the remainder, 0x11A7 + remainder); else a 0x1100/0x1161-based pair
+                char first = ( char ) ( hasTrailing ? 0xAC00 + index - trailing : 0x1100 + index / 0x024C );
+                char second = ( char ) ( hasTrailing ? 0x11A7 + trailing : 0x1161 + ( index % 0x024C ) / 0x001C );
+                ucd.decompose.put( 0xAC00 + index, String.valueOf( first ) + second );
+                ucd.compose.put( ( first << 16 ) | second, 0xAC00 + index );
+            }
+        }
+
+
+        /** Opens classpath resource f for line-by-line reading. @throws IOException when f is missing */
+        static BufferedReader read( String f ) throws IOException
+        {
+            InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream( f );
+            if ( in == null )   // a missing resource is reported as null; fail clearly instead of with an NPE
+                throw new IOException( "Resource not found: " + f );
+            return new BufferedReader( new InputStreamReader( in ) );
+        }
+
+    }
+
+}