You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by mh...@apache.org on 2013/09/24 20:33:11 UTC
[35/50] [abbrv] git commit: Some work on Analyzers library
Some work on Analyzers library
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/4cc8ff0e
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/4cc8ff0e
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/4cc8ff0e
Branch: refs/heads/branch_4x
Commit: 4cc8ff0e445b942ee9eb6e0b4db5be60745d047e
Parents: 401752b
Author: Paul Irwin <pa...@gmail.com>
Authored: Wed Aug 7 13:38:11 2013 -0400
Committer: Paul Irwin <pa...@gmail.com>
Committed: Wed Aug 7 13:38:11 2013 -0400
----------------------------------------------------------------------
src/contrib/Analyzers/Contrib.Analyzers.csproj | 7 +
.../Analyzers/Core/KeywordTokenizerFactory.cs | 15 +-
src/contrib/Analyzers/Core/LetterTokenizer.cs | 28 ++
.../Analyzers/Core/LetterTokenizerFactory.cs | 27 ++
src/contrib/Analyzers/Support/AbstractSet.cs | 120 +++++++
src/contrib/Analyzers/Util/AnalysisSPILoader.cs | 115 +++++++
src/contrib/Analyzers/Util/CharArrayMap.cs | 311 ++++++++++++++++---
src/contrib/Analyzers/Util/CharArraySet.cs | 122 +++++++-
src/contrib/Analyzers/Util/CharTokenizer.cs | 124 ++++++++
.../Analyzers/Util/StopwordAnalyzerBase.cs | 80 +++++
src/contrib/Analyzers/Util/TokenizerFactory.cs | 38 +++
src/contrib/Analyzers/Util/WordlistLoader.cs | 155 +++++++++
12 files changed, 1094 insertions(+), 48 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Contrib.Analyzers.csproj
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj
index 8b01198..8613c88 100644
--- a/src/contrib/Analyzers/Contrib.Analyzers.csproj
+++ b/src/contrib/Analyzers/Contrib.Analyzers.csproj
@@ -132,6 +132,8 @@
<Compile Include="Core\KeywordAnalyzer.cs" />
<Compile Include="Core\KeywordTokenizer.cs" />
<Compile Include="Core\KeywordTokenizerFactory.cs" />
+ <Compile Include="Core\LetterTokenizer.cs" />
+ <Compile Include="Core\LetterTokenizerFactory.cs" />
<Compile Include="Cz\CzechAnalyzer.cs" />
<Compile Include="De\GermanAnalyzer.cs" />
<Compile Include="De\GermanStemFilter.cs" />
@@ -199,15 +201,20 @@
<Compile Include="Sinks\DateRecognizerSinkFilter.cs" />
<Compile Include="Sinks\TokenRangeSinkFilter.cs" />
<Compile Include="Sinks\TokenTypeSinkFilter.cs" />
+ <Compile Include="Support\AbstractSet.cs" />
<Compile Include="Support\StringExtensions.cs" />
<Compile Include="Th\ThaiAnalyzer.cs" />
<Compile Include="Th\ThaiWordFilter.cs" />
<Compile Include="Util\AbstractAnalysisFactory.cs" />
+ <Compile Include="Util\AnalysisSPILoader.cs" />
<Compile Include="Util\CharacterUtils.cs" />
<Compile Include="Util\CharArrayMap.cs" />
<Compile Include="Util\CharArraySet.cs" />
+ <Compile Include="Util\CharTokenizer.cs" />
<Compile Include="Util\IResourceLoader.cs" />
+ <Compile Include="Util\StopwordAnalyzerBase.cs" />
<Compile Include="Util\TokenizerFactory.cs" />
+ <Compile Include="Util\WordlistLoader.cs" />
<Compile Include="WordlistLoader.cs" />
</ItemGroup>
<ItemGroup>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs b/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs
index 24f810c..ac85065 100644
--- a/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs
+++ b/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs
@@ -1,4 +1,5 @@
-using System;
+using Lucene.Net.Analysis.Util;
+using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
@@ -7,6 +8,18 @@ namespace Lucene.Net.Analysis.Core
{
public class KeywordTokenizerFactory : TokenizerFactory
{
+ public KeywordTokenizerFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+ public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
+ {
+ return new KeywordTokenizer(factory, input, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Core/LetterTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/LetterTokenizer.cs b/src/contrib/Analyzers/Core/LetterTokenizer.cs
new file mode 100644
index 0000000..669d8dc
--- /dev/null
+++ b/src/contrib/Analyzers/Core/LetterTokenizer.cs
@@ -0,0 +1,28 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Core
+{
+    /// <summary>
+    /// A <see cref="CharTokenizer"/> that emits maximal runs of adjacent letter
+    /// characters as tokens. Token membership is decided per Unicode code point
+    /// via <see cref="IsTokenChar(int)"/>.
+    /// </summary>
+    public class LetterTokenizer : CharTokenizer
+    {
+        /// <summary>Creates a LetterTokenizer reading from <paramref name="input"/>.</summary>
+        public LetterTokenizer(Version matchVersion, TextReader input)
+            : base(matchVersion, input)
+        {
+        }
+
+        /// <summary>Creates a LetterTokenizer using the given attribute factory.</summary>
+        public LetterTokenizer(Version matchVersion, AttributeFactory factory, TextReader input)
+            : base(matchVersion, factory, input)
+        {
+        }
+
+        /// <summary>
+        /// Returns true if the code point <paramref name="c"/> is a letter.
+        /// <paramref name="c"/> is a full Unicode code point: CharTokenizer decodes
+        /// input code-point aware, so values above the BMP (> 0xFFFF) can occur here.
+        /// </summary>
+        protected override bool IsTokenChar(int c)
+        {
+            if (c > char.MaxValue)
+            {
+                // Supplementary code point: the plain (char)c cast would truncate it
+                // to an unrelated BMP char and misclassify it. Check the surrogate
+                // pair form instead.
+                return char.IsLetter(char.ConvertFromUtf32(c), 0);
+            }
+            return char.IsLetter((char)c);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Core/LetterTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/LetterTokenizerFactory.cs b/src/contrib/Analyzers/Core/LetterTokenizerFactory.cs
new file mode 100644
index 0000000..c07a8b6
--- /dev/null
+++ b/src/contrib/Analyzers/Core/LetterTokenizerFactory.cs
@@ -0,0 +1,27 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Core
+{
+    /// <summary>
+    /// Factory for <see cref="LetterTokenizer"/>. Accepts no parameters beyond
+    /// those consumed by the <see cref="TokenizerFactory"/> base constructor
+    /// (any leftovers in <c>args</c> are rejected).
+    /// </summary>
+    public class LetterTokenizerFactory : TokenizerFactory
+    {
+        public LetterTokenizerFactory(IDictionary<String, String> args)
+            : base(args)
+        {
+            AssureMatchVersion();
+
+            if (args.Count > 0)
+            {
+                // List the leftover key/value pairs explicitly: concatenating the
+                // dictionary object itself would only print its type name, making
+                // the error useless for diagnosing a bad configuration.
+                throw new ArgumentException("Unknown parameters: " +
+                    string.Join(", ", args.Select(kvp => kvp.Key + "=" + kvp.Value).ToArray()));
+            }
+        }
+
+        /// <summary>Creates a new <see cref="LetterTokenizer"/> over <paramref name="input"/>.</summary>
+        public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
+        {
+            return new LetterTokenizer(luceneMatchVersion, factory, input);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Support/AbstractSet.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Support/AbstractSet.cs b/src/contrib/Analyzers/Support/AbstractSet.cs
new file mode 100644
index 0000000..f732d08
--- /dev/null
+++ b/src/contrib/Analyzers/Support/AbstractSet.cs
@@ -0,0 +1,120 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Support
+{
+    /// <summary>
+    /// Skeletal <see cref="ISet{T}"/> implementation modeled on Java's AbstractSet:
+    /// the primitive operations (Add, Clear, Contains, Remove, Count, GetEnumerator)
+    /// are supplied by subclasses, and the bulk set operations are built on top of
+    /// them. Declared <c>abstract</c>: the class contains abstract members, so the
+    /// original non-abstract declaration could not compile (compiler error CS0513).
+    /// </summary>
+    public abstract class AbstractSet<T> : ISet<T>
+    {
+        // Default implementation rejects the element (returns false, no-op).
+        // NOTE(review): Java's AbstractCollection.add throws
+        // UnsupportedOperationException instead - confirm callers rely on the
+        // silent no-op before changing this. Mutable subclasses override it.
+        public virtual bool Add(T item)
+        {
+            return false;
+        }
+
+        // Removes every element of 'other' from this set (in-place set difference).
+        public void ExceptWith(IEnumerable<T> other)
+        {
+            foreach (var item in other)
+            {
+                this.Remove(item);
+            }
+        }
+
+        // Retains only elements that are also present in 'other'. Iterates over a
+        // snapshot (ToList) so that Remove does not invalidate the enumeration.
+        public void IntersectWith(IEnumerable<T> other)
+        {
+            var set = new HashSet<T>(other);
+
+            foreach (var item in this.ToList())
+            {
+                if (!set.Contains(item))
+                    this.Remove(item);
+            }
+        }
+
+        // The subset/superset/overlap queries below are not needed by the port yet.
+        public bool IsProperSubsetOf(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool IsProperSupersetOf(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool IsSubsetOf(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool IsSupersetOf(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool Overlaps(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool SetEquals(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public void SymmetricExceptWith(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        // Adds every element of 'other' to this set (in-place union).
+        public void UnionWith(IEnumerable<T> other)
+        {
+            foreach (var item in other)
+            {
+                this.Add(item);
+            }
+        }
+
+        // Explicit ICollection<T>.Add forwards to the bool-returning Add above.
+        void ICollection<T>.Add(T item)
+        {
+            Add(item);
+        }
+
+        public abstract void Clear();
+
+        public abstract bool Contains(T item);
+
+        // Copies elements into 'array' starting at 'arrayIndex'; stops at whichever
+        // comes first: the end of the array or the end of the set.
+        public void CopyTo(T[] array, int arrayIndex)
+        {
+            var enumerator = GetEnumerator();
+
+            for (int i = arrayIndex; i < array.Length; i++)
+            {
+                if (!enumerator.MoveNext())
+                    break;
+
+                array[i] = enumerator.Current;
+            }
+        }
+
+        public abstract int Count { get; }
+
+        public bool IsReadOnly
+        {
+            get { return false; }
+        }
+
+        public abstract bool Remove(T item);
+
+        public abstract IEnumerator<T> GetEnumerator();
+
+        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
+        {
+            return GetEnumerator();
+        }
+
+        // Java-style convenience alias for UnionWith.
+        public void AddAll(IEnumerable<T> values)
+        {
+            this.UnionWith(values);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/AnalysisSPILoader.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/AnalysisSPILoader.cs b/src/contrib/Analyzers/Util/AnalysisSPILoader.cs
new file mode 100644
index 0000000..d06a57c
--- /dev/null
+++ b/src/contrib/Analyzers/Util/AnalysisSPILoader.cs
@@ -0,0 +1,115 @@
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading;
+
+namespace Lucene.Net.Analysis.Util
+{
+    /// <summary>
+    /// Discovers <see cref="AbstractAnalysisFactory"/> implementations via
+    /// <c>SPIClassIterator</c> and resolves them by a lowercased short name,
+    /// derived by stripping one of the allowed class-name suffixes.
+    /// </summary>
+    internal sealed class AnalysisSPILoader<S>
+        where S : AbstractAnalysisFactory
+    {
+        // name (lowercased, suffix stripped) -> factory type. Rebuilt and
+        // republished as a whole by Reload(); volatile so readers always see
+        // the most recently published map.
+        private volatile IDictionary<string, Type> services = new HashMap<string, Type>();
+        private readonly Type clazz;
+        private readonly string[] suffixes;
+        // Dedicated lock object: locking 'this' would expose the monitor to callers.
+        private readonly object reloadLock = new object();
+
+        public AnalysisSPILoader(Type clazz)
+            : this(clazz, new string[] { clazz.Name })
+        {
+        }
+
+        public AnalysisSPILoader(Type clazz, string[] suffixes)
+        {
+            this.clazz = clazz;
+            this.suffixes = suffixes;
+            // if clazz' classloader is not a parent of the given one, we scan clazz's classloader, too:
+            //final ClassLoader clazzClassloader = clazz.getClassLoader();
+            //if (clazzClassloader != null && !SPIClassIterator.isParentClassLoader(clazzClassloader, classloader)) {
+            //  reload(clazzClassloader);
+            //}
+            Reload();
+        }
+
+        /// <summary>
+        /// Re-scans for service implementations and publishes the merged map.
+        /// Existing registrations win over newly discovered duplicates.
+        /// </summary>
+        public void Reload()
+        {
+            lock (reloadLock)
+            {
+                HashMap<String, Type> services =
+                    new HashMap<String, Type>(this.services);
+                SPIClassIterator<S> loader = SPIClassIterator<S>.Get();
+                foreach (var service in loader)
+                {
+                    String clazzName = service.Name;
+                    String name = null;
+                    foreach (String suffix in suffixes)
+                    {
+                        if (clazzName.EndsWith(suffix))
+                        {
+                            name = clazzName.Substring(0, clazzName.Length - suffix.Length).ToLowerInvariant();
+                            break;
+                        }
+                    }
+                    if (name == null)
+                    {
+                        throw new InvalidOperationException("The class name " + service.FullName +
+                            " has wrong suffix, allowed are: " + Arrays.ToString(suffixes));
+                    }
+                    // only add the first one for each name, later services will be ignored
+                    // this allows to place services before others in classpath to make
+                    // them used instead of others
+                    //
+                    // TODO: Should we disallow duplicate names here?
+                    // Allowing it may get confusing on collisions, as different packages
+                    // could contain same factory class, which is a naming bug!
+                    // When changing this be careful to allow reload()!
+                    if (!services.ContainsKey(name))
+                    {
+                        services[name] = service;
+                    }
+                }
+                // Publish the rebuilt map. The original port never assigned it back
+                // (the Java line 'this.services = Collections.unmodifiableMap(services)'
+                // was left commented out), so Reload() had no effect and every
+                // LookupClass call failed on an empty map.
+                this.services = services;
+            }
+        }
+
+        /// <summary>Instantiates the factory registered under <paramref name="name"/>, passing <paramref name="args"/> to its constructor.</summary>
+        public S NewInstance(string name, IDictionary<string, string> args)
+        {
+            Type service = LookupClass(name);
+            try
+            {
+                //var ctor = service.GetConstructor(new[] { typeof(IDictionary<string, string>) });
+                return (S)Activator.CreateInstance(service, args);
+            }
+            catch (Exception e)
+            {
+                throw new ArgumentException("SPI class of type " + clazz.FullName + " with name '" + name + "' cannot be instantiated. " +
+                    "This is likely due to a misconfiguration of the class '" + service.FullName + "': ", e);
+            }
+        }
+
+        /// <summary>Resolves a registered factory type by (case-insensitive) name.</summary>
+        public Type LookupClass(String name)
+        {
+            Type service = services[name.ToLowerInvariant()];
+            if (service != null)
+            {
+                return service;
+            }
+            else
+            {
+                throw new ArgumentException("A SPI class of type " + clazz.FullName + " with name '" + name + "' does not exist. " +
+                    "You need to add the corresponding assembly supporting this SPI to your application. " +
+                    "The currently loaded assemblies support the following names: " + Arrays.ToString(AvailableServices));
+            }
+        }
+
+        /// <summary>The currently registered service names (lowercased, suffix stripped).</summary>
+        public ICollection<String> AvailableServices
+        {
+            get
+            {
+                return services.Keys;
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/CharArrayMap.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharArrayMap.cs b/src/contrib/Analyzers/Util/CharArrayMap.cs
index 7297855..e124451 100644
--- a/src/contrib/Analyzers/Util/CharArrayMap.cs
+++ b/src/contrib/Analyzers/Util/CharArrayMap.cs
@@ -1,4 +1,5 @@
-using Lucene.Net.Support;
+using Lucene.Net.Analysis.Support;
+using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Linq;
@@ -9,7 +10,7 @@ namespace Lucene.Net.Analysis.Util
public class CharArrayMap<V> : IDictionary<object, V>
{
// private only because missing generics
- private static readonly CharArrayMap<V> EMPTY_MAP = new EmptyCharArrayMap<Object>();
+ internal static readonly CharArrayMap<V> EMPTY_MAP = new CharArrayMap.EmptyCharArrayMap<V>();
private const int INIT_SIZE = 8;
private readonly CharacterUtils charUtils;
@@ -34,10 +35,13 @@ namespace Lucene.Net.Analysis.Util
public CharArrayMap(Lucene.Net.Util.Version matchVersion, IDictionary<object, V> c, bool ignoreCase)
: this(matchVersion, c.Count, ignoreCase)
{
- PutAll(c);
+ foreach (var kvp in c)
+ {
+ this[kvp.Key] = kvp.Value;
+ }
}
- private CharArrayMap(CharArrayMap<V> toCopy)
+ internal CharArrayMap(CharArrayMap<V> toCopy)
{
this.keys = toCopy.keys;
this.values = toCopy.values;
@@ -47,24 +51,24 @@ namespace Lucene.Net.Analysis.Util
this.matchVersion = toCopy.matchVersion;
}
- public void Clear()
+ public virtual void Clear()
{
count = 0;
Arrays.Fill(keys, null);
Arrays.Fill(values, default(V));
}
- public bool ContainsKey(char[] text, int off, int len)
+ public virtual bool ContainsKey(char[] text, int off, int len)
{
return keys[GetSlot(text, off, len)] != null;
}
- public bool ContainsKey(ICharSequence cs)
+ public virtual bool ContainsKey(ICharSequence cs)
{
return keys[GetSlot(cs)] != null;
}
- public bool ContainsKey(Object o)
+ public virtual bool ContainsKey(Object o)
{
if (o is char[])
{
@@ -74,34 +78,35 @@ namespace Lucene.Net.Analysis.Util
return ContainsKey(o.ToString());
}
- public V Get(char[] text, int off, int len)
+ public virtual V Get(char[] text, int off, int len)
{
return values[GetSlot(text, off, len)];
}
- public V Get(ICharSequence cs)
+ public virtual V Get(ICharSequence cs)
{
return values[GetSlot(cs)];
}
+ public virtual V Get(object o)
+ {
+ if (o is char[])
+ {
+ char[] text = (char[])o;
+ return Get(text, 0, text.Length);
+ }
+ return Get(o.ToString());
+ }
+
public V this[Object o]
{
get
{
- if (o is char[])
- {
- char[] text = (char[])o;
- return Get(text, 0, text.Length);
- }
- return this[o.ToString()];
+ return Get(o);
}
set
{
- if (o is char[])
- {
- Put((char[])o, value);
- }
- Put(o.ToString(), value);
+ Put(o, value);
}
}
@@ -141,17 +146,26 @@ namespace Lucene.Net.Analysis.Util
return pos;
}
- public V Put(ICharSequence text, V value)
+ public virtual V Put(object o, V value)
+ {
+ if (o is char[])
+ {
+ return Put((char[])o, value);
+ }
+ return Put(o.ToString(), value);
+ }
+
+ public virtual V Put(ICharSequence text, V value)
{
return Put(text.ToString(), value); // could be more efficient
}
- public V Put(string text, V value)
+ public virtual V Put(string text, V value)
{
return Put(text.ToCharArray(), value);
}
- public V Put(char[] text, V value)
+ public virtual V Put(char[] text, V value)
{
if (ignoreCase)
{
@@ -300,7 +314,7 @@ namespace Lucene.Net.Analysis.Util
return code;
}
- public void Remove(object key)
+ public virtual void Remove(object key)
{
throw new NotSupportedException();
}
@@ -313,7 +327,7 @@ namespace Lucene.Net.Analysis.Util
public override string ToString()
{
StringBuilder sb = new StringBuilder("{");
- foreach (KeyValuePair<Object, V> entry in EntrySet)
+ foreach (KeyValuePair<Object, V> entry in this.GetEntrySet())
{
if (sb.Length > 1) sb.Append(", ");
sb.Append(entry);
@@ -324,28 +338,25 @@ namespace Lucene.Net.Analysis.Util
private EntrySet entrySet = null;
private CharArraySet keySet = null;
- internal EntrySet CreateEntrySet()
+ internal virtual EntrySet CreateEntrySet()
{
- return new EntrySet(true);
+ return new EntrySet(this, true);
}
- public EntrySet EntrySet
+ public EntrySet GetEntrySet()
{
- get
+ if (entrySet == null)
{
- if (entrySet == null)
- {
- entrySet = CreateEntrySet();
- }
- return entrySet;
+ entrySet = CreateEntrySet();
}
+ return entrySet;
}
internal ISet<object> OriginalKeySet
{
get
{
- return Keys;
+ return Keys as ISet<object>;
}
}
@@ -399,7 +410,7 @@ namespace Lucene.Net.Analysis.Util
private int lastPos;
private readonly bool allowModify;
- private KeyValuePair<object, V> current; // .NET Port: need to store current as IEnumerator != Iterator
+ private MapEntry current; // .NET Port: need to store current as IEnumerator != Iterator
public EntryIterator(CharArrayMap<V> parent, bool allowModify)
{
@@ -423,7 +434,7 @@ namespace Lucene.Net.Analysis.Util
return true;
}
- current = new MapEntry(lastPos, allowModify);
+ current = new MapEntry(parent, lastPos, allowModify);
return false;
}
@@ -454,10 +465,10 @@ namespace Lucene.Net.Analysis.Util
parent.values[lastPos] = value;
return old;
}
-
+
public KeyValuePair<object, V> Current
{
- get { return current; }
+ get { return current.AsKeyValuePair(); }
}
public void Dispose()
@@ -475,6 +486,224 @@ namespace Lucene.Net.Analysis.Util
}
}
-
+ private sealed class MapEntry // : KeyValuePair<object, V> -- this doesn't work in .NET as KVP is a struct, so we wrap it instead
+ {
+ private readonly CharArrayMap<V> parent;
+ private readonly int pos;
+ private readonly bool allowModify;
+
+ public MapEntry(CharArrayMap<V> parent, int pos, bool allowModify)
+ {
+ this.parent = parent;
+ this.pos = pos;
+ this.allowModify = allowModify;
+ }
+
+ public object Key
+ {
+ get
+ {
+ // we must clone here, as putAll to another CharArrayMap
+ // with other case sensitivity flag would corrupt the keys
+ return parent.keys[pos].Clone();
+ }
+ }
+
+ public V Value
+ {
+ get
+ {
+ return parent.values[pos];
+ }
+ set
+ {
+ if (!allowModify)
+ throw new NotSupportedException();
+
+ parent.values[pos] = value;
+ }
+ }
+
+ public override string ToString()
+ {
+ return new StringBuilder().Append(parent.keys[pos]).Append('=')
+ .Append((parent.values[pos].Equals(parent)) ? "(this Map)" : parent.values[pos].ToString())
+ .ToString();
+ }
+
+ public KeyValuePair<object, V> AsKeyValuePair()
+ {
+ return new KeyValuePair<object, V>(Key, Value);
+ }
+ }
+
+ public sealed class EntrySet : AbstractSet<KeyValuePair<object, V>>
+ {
+ private readonly CharArrayMap<V> parent;
+ private readonly bool allowModify;
+
+ public EntrySet(CharArrayMap<V> parent, bool allowModify)
+ {
+ this.parent = parent;
+ this.allowModify = allowModify;
+ }
+
+ public override IEnumerator<KeyValuePair<object, V>> GetEnumerator()
+ {
+ return new EntryIterator(parent, allowModify);
+ }
+
+ public override bool Contains(KeyValuePair<object, V> e)
+ {
+ //if (!(o instanceof Map.Entry))
+ // return false;
+ //Map.Entry<Object,V> e = (Map.Entry<Object,V>)o;
+ Object key = e.Key;
+ Object val = e.Value;
+ Object v = parent[key];
+ return v == null ? val == null : v.Equals(val);
+ }
+
+ public override bool Remove(KeyValuePair<object, V> item)
+ {
+ throw new NotSupportedException();
+ }
+
+ public override int Count
+ {
+ get { return parent.count; }
+ }
+
+ public override void Clear()
+ {
+ if (!allowModify)
+ throw new NotSupportedException();
+ parent.Clear();
+ }
+ }
+ }
+
+ // .NET Port: non-generic static class to hold nested types and static methods
+ public static class CharArrayMap
+ {
+ public static CharArrayMap<V> UnmodifiableMap<V>(CharArrayMap<V> map)
+ {
+ if (map == null)
+ throw new NullReferenceException("Given map is null");
+ if (map == EmptyMap<V>() || map.Count == 0)
+ return EmptyMap<V>();
+ if (map is UnmodifiableCharArrayMap<V>)
+ return map;
+ return new UnmodifiableCharArrayMap<V>(map);
+ }
+
+ public static CharArrayMap<V> Copy<V>(Lucene.Net.Util.Version matchVersion, IDictionary<object, V> map)
+ {
+ if (map == CharArrayMap<V>.EMPTY_MAP)
+ return EmptyMap<V>();
+ if (map is CharArrayMap<V>)
+ {
+ CharArrayMap<V> m = (CharArrayMap<V>)map;
+ // use fast path instead of iterating all values
+ // this is even on very small sets ~10 times faster than iterating
+ char[][] keys = new char[m.keys.Length][];
+ Array.Copy(m.keys, 0, keys, 0, keys.Length);
+ V[] values = new V[m.values.Length];
+ Array.Copy(m.values, 0, values, 0, values.Length);
+ m = new CharArrayMap<V>(m);
+ m.keys = keys;
+ m.values = values;
+ return m;
+ }
+ return new CharArrayMap<V>(matchVersion, map, false);
+ }
+
+ public static CharArrayMap<V> EmptyMap<V>()
+ {
+ return CharArrayMap<V>.EMPTY_MAP;
+ }
+
+ internal class UnmodifiableCharArrayMap<V> : CharArrayMap<V>
+ {
+ public UnmodifiableCharArrayMap(CharArrayMap<V> map)
+ : base(map)
+ {
+ }
+
+ public override void Clear()
+ {
+ throw new NotSupportedException();
+ }
+
+ public override V Put(char[] text, V value)
+ {
+ throw new NotSupportedException();
+ }
+
+ public override V Put(ICharSequence text, V value)
+ {
+ throw new NotSupportedException();
+ }
+
+ public override V Put(string text, V value)
+ {
+ throw new NotSupportedException();
+ }
+
+ public override void Remove(object key)
+ {
+ throw new NotSupportedException();
+ }
+
+ internal override CharArrayMap<V>.EntrySet CreateEntrySet()
+ {
+ throw new NotSupportedException();
+ }
+ }
+
+ internal sealed class EmptyCharArrayMap<V> : UnmodifiableCharArrayMap<V>
+ {
+ public EmptyCharArrayMap()
+ : base(new CharArrayMap<V>(Lucene.Net.Util.Version.LUCENE_CURRENT, 0, false))
+ {
+ }
+
+ public override bool ContainsKey(char[] text, int off, int len)
+ {
+ if (text == null)
+ throw new NullReferenceException();
+ return false;
+ }
+
+ public override bool ContainsKey(ICharSequence cs)
+ {
+ if (cs == null)
+ throw new NullReferenceException();
+ return false;
+ }
+
+ public override bool ContainsKey(object o)
+ {
+ if (o == null)
+ throw new NullReferenceException();
+ return false;
+ }
+
+ public override V Get(char[] text, int off, int len)
+ {
+ if (text == null)
+ throw new NullReferenceException();
+ return default(V);
+ }
+
+ public override V Get(ICharSequence cs)
+ {
+ if (cs == null)
+ throw new NullReferenceException();
+ return default(V);
+ }
+
+
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/CharArraySet.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharArraySet.cs b/src/contrib/Analyzers/Util/CharArraySet.cs
index 6dcc486..522bcaa 100644
--- a/src/contrib/Analyzers/Util/CharArraySet.cs
+++ b/src/contrib/Analyzers/Util/CharArraySet.cs
@@ -1,15 +1,125 @@
-using System;
+using Lucene.Net.Analysis.Support;
+using Lucene.Net.Support;
+using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Lucene.Net.Analysis.Util
{
- public class CharArraySet : ISet<object>
+ public class CharArraySet : AbstractSet<object>
{
- public static readonly CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.<Object>emptyMap());
- private static readonly object PLACEHOLDER = new object();
-
- private readonly CharArrayMap<Object> map;
+ public static readonly CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.EmptyMap<object>());
+ private static readonly object PLACEHOLDER = new object();
+
+ private readonly CharArrayMap<object> map;
+
+ public CharArraySet(Lucene.Net.Util.Version matchVersion, int startSize, bool ignoreCase)
+ : this(new CharArrayMap<Object>(matchVersion, startSize, ignoreCase))
+ {
+ }
+
+ public CharArraySet(Lucene.Net.Util.Version matchVersion, ICollection<object> c, bool ignoreCase)
+ : this(matchVersion, c.Count, ignoreCase)
+ {
+ AddAll(c);
+ }
+
+ internal CharArraySet(CharArrayMap<Object> map)
+ {
+ this.map = map;
+ }
+
+ public override void Clear()
+ {
+ map.Clear();
+ }
+
+ public bool Contains(char[] text, int off, int len)
+ {
+ return map.ContainsKey(text, off, len);
+ }
+
+ public bool Contains(ICharSequence cs)
+ {
+ return map.ContainsKey(cs);
+ }
+
+ public override bool Contains(object o)
+ {
+ return map.ContainsKey(o);
+ }
+
+ public override bool Add(object o)
+ {
+ return map.Put(o, PLACEHOLDER) == null;
+ }
+
+ public bool Add(ICharSequence text)
+ {
+ return map.Put(text, PLACEHOLDER) == null;
+ }
+
+ public bool Add(string text)
+ {
+ return map.Put(text, PLACEHOLDER) == null;
+ }
+
+ public bool Add(char[] text)
+ {
+ return map.Put(text, PLACEHOLDER) == null;
+ }
+
+ public override int Count
+ {
+ get { return map.Count; }
+ }
+
+ public static CharArraySet UnmodifiableSet(CharArraySet set)
+ {
+ if (set == null)
+ throw new NullReferenceException("Given set is null");
+ if (set == EMPTY_SET)
+ return EMPTY_SET;
+ if (set.map is CharArrayMap.UnmodifiableCharArrayMap<object>)
+ return set;
+ return new CharArraySet(CharArrayMap.UnmodifiableMap(set.map));
+ }
+
+ public static CharArraySet Copy(Lucene.Net.Util.Version matchVersion, ICollection<object> set)
+ {
+ if (set == EMPTY_SET)
+ return EMPTY_SET;
+ if (set is CharArraySet)
+ {
+ CharArraySet source = (CharArraySet)set;
+ return new CharArraySet(CharArrayMap.Copy(source.map.matchVersion, source.map));
+ }
+ return new CharArraySet(matchVersion, set, false);
+ }
+
+ public override IEnumerator<object> GetEnumerator()
+ {
+ // use the AbstractSet#keySet()'s iterator (to not produce endless recursion)
+ return map.OriginalKeySet.GetEnumerator();
+ }
+
+ public override string ToString()
+ {
+ StringBuilder sb = new StringBuilder("[");
+ foreach (Object item in this)
+ {
+ if (sb.Length > 1) sb.Append(", ");
+ if (item is char[])
+ {
+ sb.Append((char[])item);
+ }
+ else
+ {
+ sb.Append(item);
+ }
+ }
+ return sb.Append(']').ToString();
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/CharTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharTokenizer.cs b/src/contrib/Analyzers/Util/CharTokenizer.cs
new file mode 100644
index 0000000..b0029fa
--- /dev/null
+++ b/src/contrib/Analyzers/Util/CharTokenizer.cs
@@ -0,0 +1,124 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Util
+{
+ /// <summary>
+ /// Abstract base for tokenizers that split text on a per-code-point predicate.
+ /// Subclasses implement IsTokenChar(int) to decide which code points belong
+ /// inside a token, and may override Normalize(int) to transform each accepted
+ /// code point. Input is decoded code-point aware via CharacterUtils
+ /// (CodePointAt / Character.CharCount below), so supplementary characters
+ /// above U+FFFF are handled as single logical characters.
+ /// </summary>
+ public abstract class CharTokenizer : Tokenizer
+ {
+ /// <summary>Creates a CharTokenizer over input, using the default attribute factory.</summary>
+ public CharTokenizer(Version matchVersion, TextReader input)
+ : base(input)
+ {
+ charUtils = CharacterUtils.GetInstance(matchVersion);
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ }
+
+ /// <summary>Creates a CharTokenizer over input, using the supplied attribute factory.</summary>
+ public CharTokenizer(Version matchVersion, AttributeFactory factory, TextReader input)
+ : base(factory, input)
+ {
+ charUtils = CharacterUtils.GetInstance(matchVersion);
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ }
+
+ // Scan state: 'offset' counts chars consumed from the reader before the
+ // current ioBuffer, 'bufferIndex' is the read position inside ioBuffer,
+ // 'dataLen' is the number of valid chars in ioBuffer, and 'finalOffset'
+ // is the corrected end-of-stream offset reported by End().
+ // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
+ private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
+ // A token is emitted as soon as it reaches MAX_WORD_LEN chars; the
+ // remaining token characters start the next token.
+ private const int MAX_WORD_LEN = 255;
+ private const int IO_BUFFER_SIZE = 4096;
+
+ private readonly ICharTermAttribute termAtt; // = addAttribute(CharTermAttribute.class);
+ private readonly IOffsetAttribute offsetAtt; // = addAttribute(OffsetAttribute.class);
+
+ private readonly CharacterUtils charUtils;
+ private readonly CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.NewCharacterBuffer(IO_BUFFER_SIZE);
+
+ // Returns true if the Unicode code point c belongs inside a token.
+ protected abstract bool IsTokenChar(int c);
+
+ // Transforms an accepted code point before it is buffered; identity by default.
+ protected virtual int Normalize(int c)
+ {
+ return c;
+ }
+
+ /// <summary>
+ /// Advances to the next maximal run of token characters, buffering the
+ /// normalized code points into the term attribute and recording corrected
+ /// start/end offsets. Returns false when the input is exhausted.
+ /// </summary>
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ int length = 0;
+ int start = -1; // this variable is always initialized
+ int end = -1;
+ char[] buffer = termAtt.Buffer;
+ while (true)
+ {
+ // Refill ioBuffer when the current chunk is consumed.
+ if (bufferIndex >= dataLen)
+ {
+ offset += dataLen;
+ if (!charUtils.Fill(ioBuffer, input))
+ { // read supplementary char aware with CharacterUtils
+ dataLen = 0; // so next offset += dataLen won't decrement offset
+ if (length > 0)
+ {
+ // End of input, but a token was in progress: emit it.
+ break;
+ }
+ else
+ {
+ // End of input with no pending token: record final offset.
+ finalOffset = CorrectOffset(offset);
+ return false;
+ }
+ }
+ dataLen = ioBuffer.Length;
+ bufferIndex = 0;
+ }
+ // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
+ int c = charUtils.CodePointAt(ioBuffer.Buffer, bufferIndex);
+ int charCount = Character.CharCount(c);
+ bufferIndex += charCount;
+
+ if (IsTokenChar(c))
+ { // if it's a token char
+ if (length == 0)
+ { // start of token
+ //assert start == -1;
+ start = offset + bufferIndex - charCount;
+ end = start;
+ }
+ else if (length >= buffer.Length - 1)
+ { // check if a supplementary could run out of bounds
+ buffer = termAtt.ResizeBuffer(2 + length); // make sure a supplementary fits in the buffer
+ }
+ end += charCount;
+ length += Character.ToChars(Normalize(c), buffer, length); // buffer it, normalized
+ if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
+ break;
+ }
+ else if (length > 0) // at non-Letter w/ chars
+ break; // return 'em
+ }
+
+ termAtt.SetLength(length);
+ //assert start != -1;
+ offsetAtt.SetOffset(CorrectOffset(start), finalOffset = CorrectOffset(end));
+ return true;
+
+ }
+
+ // Reports the corrected end-of-stream offset after the last token.
+ // NOTE(review): the Java original calls super.end() here - confirm whether
+ // this port's Tokenizer.End() needs invoking as well.
+ public override void End()
+ {
+ // set final offset
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ // Resets all scan state so the tokenizer can be reused on a new reader.
+ public override void Reset()
+ {
+ bufferIndex = 0;
+ offset = 0;
+ dataLen = 0;
+ finalOffset = 0;
+ ioBuffer.Reset(); // make sure to reset the IO buffer!!
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs b/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
new file mode 100644
index 0000000..f6e9194
--- /dev/null
+++ b/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
@@ -0,0 +1,80 @@
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Util
+{
/// <summary>
/// Base class for <see cref="Analyzer"/> implementations that use a stopword set.
/// </summary>
public abstract class StopwordAnalyzerBase : Analyzer
{
    // Never null: an empty set is substituted when no stopwords are supplied.
    protected readonly CharArraySet stopwords;

    // Lucene match version used when copying/normalizing the stopword set.
    protected readonly Version matchVersion;

    /// <summary>
    /// Gets the analyzer's stopword set, or an empty set if the analyzer has none.
    /// The returned set is unmodifiable.
    /// </summary>
    public CharArraySet StopwordSet
    {
        get
        {
            return stopwords;
        }
    }

    /// <summary>
    /// Creates a new instance initialized with the given stopword set.
    /// </summary>
    /// <param name="version">Lucene version for cross-version compatibility.</param>
    /// <param name="stopwords">The analyzer's stopwords; may be null, in which case
    /// an empty set is used.</param>
    protected StopwordAnalyzerBase(Version version, CharArraySet stopwords)
    {
        matchVersion = version;
        // Analyzers should use CharArraySet for stopwords; copy and wrap the
        // caller's set so it cannot be mutated after construction.
        this.stopwords = stopwords == null
            ? CharArraySet.EMPTY_SET
            : CharArraySet.UnmodifiableSet(CharArraySet.Copy(version, stopwords));
    }

    /// <summary>
    /// Creates a new instance with an empty stopword set.
    /// </summary>
    protected StopwordAnalyzerBase(Version version)
        : this(version, null)
    {
    }

    /// <summary>
    /// Creates a <see cref="CharArraySet"/> from an embedded resource of the
    /// assembly that declares <paramref name="aClass"/>. Lines starting with
    /// <paramref name="comment"/> are skipped.
    /// </summary>
    /// <exception cref="IOException">If the named resource does not exist.</exception>
    protected static CharArraySet LoadStopwordSet(bool ignoreCase, Type aClass, string resource, string comment)
    {
        TextReader reader = null;
        try
        {
            // GetManifestResourceStream returns null rather than throwing when
            // the resource is missing; fail with a clear message instead of a
            // NullReferenceException further down.
            Stream stream = aClass.Assembly.GetManifestResourceStream(resource);
            if (stream == null)
            {
                throw new IOException("Embedded resource not found: " + resource);
            }
            reader = IOUtils.GetDecodingReader(stream, IOUtils.CHARSET_UTF_8);
            // NOTE(review): LUCENE_31 is hard-coded (presumably to parse bundled
            // stopword resources with fixed semantics, mirroring the Java code) —
            // confirm against upstream before changing.
            return WordlistLoader.GetWordSet(reader, comment, new CharArraySet(Version.LUCENE_31, 16, ignoreCase));
        }
        finally
        {
            IOUtils.Close(reader);
        }
    }

    /// <summary>
    /// Creates a <see cref="CharArraySet"/> from a UTF-8 encoded stream, one word
    /// per line. The decoding reader is closed when loading completes.
    /// </summary>
    protected static CharArraySet LoadStopwordSet(Stream stopwords, Version matchVersion)
    {
        TextReader reader = null;
        try
        {
            reader = IOUtils.GetDecodingReader(stopwords, IOUtils.CHARSET_UTF_8);
            return WordlistLoader.GetWordSet(reader, matchVersion);
        }
        finally
        {
            IOUtils.Close(reader);
        }
    }

    /// <summary>
    /// Creates a <see cref="CharArraySet"/> from a reader, one word per line.
    /// The reader is closed when loading completes.
    /// </summary>
    protected static CharArraySet LoadStopwordSet(TextReader stopwords, Version matchVersion)
    {
        try
        {
            return WordlistLoader.GetWordSet(stopwords, matchVersion);
        }
        finally
        {
            IOUtils.Close(stopwords);
        }
    }

    public abstract override Analyzer.TokenStreamComponents CreateComponents(string fieldName, System.IO.TextReader reader);
}
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/TokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/TokenizerFactory.cs b/src/contrib/Analyzers/Util/TokenizerFactory.cs
index 2fb600f..ea6892d 100644
--- a/src/contrib/Analyzers/Util/TokenizerFactory.cs
+++ b/src/contrib/Analyzers/Util/TokenizerFactory.cs
@@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
+using System.IO;
using System.Linq;
using System.Text;
@@ -7,5 +8,42 @@ namespace Lucene.Net.Analysis.Util
{
public abstract class TokenizerFactory : AbstractAnalysisFactory
{
// Single shared SPI loader; the registry of tokenizer factories is static.
private static readonly AnalysisSPILoader<TokenizerFactory> factoryLoader =
    new AnalysisSPILoader<TokenizerFactory>(typeof(TokenizerFactory));

/// <summary>
/// Initializes the factory with the given key/value argument map.
/// </summary>
protected TokenizerFactory(IDictionary<string, string> args)
    : base(args)
{
}

/// <summary>
/// Looks up and instantiates the TokenizerFactory registered under
/// <paramref name="name"/>, passing it <paramref name="args"/>.
/// </summary>
public static TokenizerFactory ForName(String name, IDictionary<String, String> args)
{
    return factoryLoader.NewInstance(name, args);
}

/// <summary>
/// Looks up the implementation <see cref="Type"/> registered under
/// <paramref name="name"/>.
/// </summary>
public static Type LookupClass(String name)
{
    return factoryLoader.LookupClass(name);
}

/// <summary>
/// The names of all tokenizer factories currently known to the loader.
/// </summary>
public static ICollection<String> AvailableTokenizers
{
    get { return factoryLoader.AvailableServices; }
}

/// <summary>
/// Asks the SPI loader to rebuild its registry of tokenizer factories.
/// </summary>
public static void ReloadTokenizers()
{
    factoryLoader.Reload();
}

/// <summary>
/// Creates a Tokenizer over <paramref name="input"/> using the default
/// attribute factory.
/// </summary>
public Tokenizer Create(TextReader input)
{
    return Create(Lucene.Net.Util.AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input);
}

/// <summary>
/// Creates a Tokenizer over <paramref name="input"/> using the given
/// attribute factory.
/// </summary>
public abstract Tokenizer Create(Lucene.Net.Util.AttributeSource.AttributeFactory factory, TextReader input);
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/WordlistLoader.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/WordlistLoader.cs b/src/contrib/Analyzers/Util/WordlistLoader.cs
new file mode 100644
index 0000000..e78ea9b
--- /dev/null
+++ b/src/contrib/Analyzers/Util/WordlistLoader.cs
@@ -0,0 +1,155 @@
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Util
+{
/// <summary>
/// Loader for text files that represent lists of words (one word per line) or
/// simple tab-separated word/stem tables.
/// </summary>
public static class WordlistLoader
{
    private const int INITIAL_CAPACITY = 16;

    // Hoisted so the patterns are compiled once instead of on every call.
    private static readonly Regex WhitespacePattern = new Regex("\\s+", RegexOptions.Compiled);
    private static readonly Regex TabPattern = new Regex("\t", RegexOptions.Compiled);

    /// <summary>
    /// Reads every line from <paramref name="reader"/>, trims it, and adds it to
    /// <paramref name="result"/>. The reader is NOT closed; that is the caller's
    /// responsibility.
    /// </summary>
    /// <returns>The populated <paramref name="result"/> set.</returns>
    public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
    {
        string word;
        while ((word = reader.ReadLine()) != null)
        {
            result.Add(word.Trim());
        }
        return result;
    }

    /// <summary>
    /// Reads a word list into a new CharArraySet (case-sensitive).
    /// </summary>
    public static CharArraySet GetWordSet(TextReader reader, Lucene.Net.Util.Version matchVersion)
    {
        return GetWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
    }

    /// <summary>
    /// Reads a word list into a new CharArraySet, skipping lines that start with
    /// <paramref name="comment"/>.
    /// </summary>
    public static CharArraySet GetWordSet(TextReader reader, String comment, Lucene.Net.Util.Version matchVersion)
    {
        return GetWordSet(reader, comment, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
    }

    /// <summary>
    /// Reads every line from <paramref name="reader"/> into <paramref name="result"/>,
    /// skipping lines that start with <paramref name="comment"/>. The reader is NOT
    /// closed by this method.
    /// </summary>
    public static CharArraySet GetWordSet(TextReader reader, String comment, CharArraySet result)
    {
        string word;
        while ((word = reader.ReadLine()) != null)
        {
            // Ordinal comparison: the default StartsWith(string) overload is
            // culture-sensitive in .NET and could mis-detect the comment prefix.
            if (!word.StartsWith(comment, StringComparison.Ordinal))
            {
                result.Add(word.Trim());
            }
        }
        return result;
    }

    /// <summary>
    /// Reads stopwords in Snowball format: '|' starts a comment that runs to end of
    /// line, and a line may contain multiple whitespace-separated words. The reader
    /// is NOT closed by this method.
    /// </summary>
    public static CharArraySet GetSnowballWordSet(TextReader reader, CharArraySet result)
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            int comment = line.IndexOf('|');
            if (comment >= 0) line = line.Substring(0, comment);
            string[] words = WhitespacePattern.Split(line);
            for (int i = 0; i < words.Length; i++)
            {
                if (words[i].Length > 0) result.Add(words[i]);
            }
        }
        return result;
    }

    /// <summary>
    /// Reads a Snowball-format word list into a new CharArraySet (case-sensitive).
    /// </summary>
    public static CharArraySet GetSnowballWordSet(TextReader reader, Lucene.Net.Util.Version matchVersion)
    {
        return GetSnowballWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
    }

    /// <summary>
    /// Reads a tab-separated stem dictionary ("word\tstem" per line) into
    /// <paramref name="result"/>. The reader is NOT closed by this method.
    /// </summary>
    public static CharArrayMap<String> GetStemDict(TextReader reader, CharArrayMap<String> result)
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // Split into at most two fields so stems may themselves contain tabs.
            // NOTE(review): a line with no tab yields a single field and throws
            // IndexOutOfRangeException — presumably input is trusted; confirm.
            string[] wordstem = TabPattern.Split(line, 2);
            result.Put(wordstem[0], wordstem[1]);
        }
        return result;
    }

    /// <summary>
    /// Reads all non-blank, non-comment ('#'-prefixed) lines from
    /// <paramref name="stream"/> using <paramref name="charset"/>, trimming each
    /// and stripping a leading BOM from the first line. The stream's reader is
    /// closed on success; on failure it is closed while suppressing secondary
    /// exceptions so the original error propagates.
    /// </summary>
    public static IList<String> GetLines(Stream stream, Encoding charset)
    {
        TextReader input = null;
        List<string> lines;
        bool success = false;
        try
        {
            input = IOUtils.GetDecodingReader(stream, charset);

            lines = new List<string>();
            string word;
            while ((word = input.ReadLine()) != null)
            {
                // Skip an initial BOM marker on the very first line.
                if (lines.Count == 0 && word.Length > 0 && word[0] == '\uFEFF')
                    word = word.Substring(1);
                // Skip comments (ordinal check; see GetWordSet).
                if (word.StartsWith("#", StringComparison.Ordinal)) continue;
                word = word.Trim();
                // Skip blank lines.
                if (word.Length == 0) continue;
                lines.Add(word);
            }
            success = true;
            return lines;
        }
        finally
        {
            if (success)
            {
                IOUtils.Close(input);
            }
            else
            {
                IOUtils.CloseWhileHandlingException((IDisposable)input);
            }
        }
    }
}
+}