You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by mh...@apache.org on 2013/09/24 20:33:11 UTC
[35/50] [abbrv] git commit: Some work on Analyzers library
Some work on Analyzers library
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/4cc8ff0e
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/4cc8ff0e
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/4cc8ff0e
Branch: refs/heads/branch_4x
Commit: 4cc8ff0e445b942ee9eb6e0b4db5be60745d047e
Parents: 401752b
Author: Paul Irwin <pa...@gmail.com>
Authored: Wed Aug 7 13:38:11 2013 -0400
Committer: Paul Irwin <pa...@gmail.com>
Committed: Wed Aug 7 13:38:11 2013 -0400
----------------------------------------------------------------------
src/contrib/Analyzers/Contrib.Analyzers.csproj | 7 +
.../Analyzers/Core/KeywordTokenizerFactory.cs | 15 +-
src/contrib/Analyzers/Core/LetterTokenizer.cs | 28 ++
.../Analyzers/Core/LetterTokenizerFactory.cs | 27 ++
src/contrib/Analyzers/Support/AbstractSet.cs | 120 +++++++
src/contrib/Analyzers/Util/AnalysisSPILoader.cs | 115 +++++++
src/contrib/Analyzers/Util/CharArrayMap.cs | 311 ++++++++++++++++---
src/contrib/Analyzers/Util/CharArraySet.cs | 122 +++++++-
src/contrib/Analyzers/Util/CharTokenizer.cs | 124 ++++++++
.../Analyzers/Util/StopwordAnalyzerBase.cs | 80 +++++
src/contrib/Analyzers/Util/TokenizerFactory.cs | 38 +++
src/contrib/Analyzers/Util/WordlistLoader.cs | 155 +++++++++
12 files changed, 1094 insertions(+), 48 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Contrib.Analyzers.csproj
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj
index 8b01198..8613c88 100644
--- a/src/contrib/Analyzers/Contrib.Analyzers.csproj
+++ b/src/contrib/Analyzers/Contrib.Analyzers.csproj
@@ -132,6 +132,8 @@
<Compile Include="Core\KeywordAnalyzer.cs" />
<Compile Include="Core\KeywordTokenizer.cs" />
<Compile Include="Core\KeywordTokenizerFactory.cs" />
+ <Compile Include="Core\LetterTokenizer.cs" />
+ <Compile Include="Core\LetterTokenizerFactory.cs" />
<Compile Include="Cz\CzechAnalyzer.cs" />
<Compile Include="De\GermanAnalyzer.cs" />
<Compile Include="De\GermanStemFilter.cs" />
@@ -199,15 +201,20 @@
<Compile Include="Sinks\DateRecognizerSinkFilter.cs" />
<Compile Include="Sinks\TokenRangeSinkFilter.cs" />
<Compile Include="Sinks\TokenTypeSinkFilter.cs" />
+ <Compile Include="Support\AbstractSet.cs" />
<Compile Include="Support\StringExtensions.cs" />
<Compile Include="Th\ThaiAnalyzer.cs" />
<Compile Include="Th\ThaiWordFilter.cs" />
<Compile Include="Util\AbstractAnalysisFactory.cs" />
+ <Compile Include="Util\AnalysisSPILoader.cs" />
<Compile Include="Util\CharacterUtils.cs" />
<Compile Include="Util\CharArrayMap.cs" />
<Compile Include="Util\CharArraySet.cs" />
+ <Compile Include="Util\CharTokenizer.cs" />
<Compile Include="Util\IResourceLoader.cs" />
+ <Compile Include="Util\StopwordAnalyzerBase.cs" />
<Compile Include="Util\TokenizerFactory.cs" />
+ <Compile Include="Util\WordlistLoader.cs" />
<Compile Include="WordlistLoader.cs" />
</ItemGroup>
<ItemGroup>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs b/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs
index 24f810c..ac85065 100644
--- a/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs
+++ b/src/contrib/Analyzers/Core/KeywordTokenizerFactory.cs
@@ -1,4 +1,5 @@
-using System;
+using Lucene.Net.Analysis.Util;
+using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
@@ -7,6 +8,18 @@ namespace Lucene.Net.Analysis.Core
{
public class KeywordTokenizerFactory : TokenizerFactory
{
+ public KeywordTokenizerFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+ public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
+ {
+ return new KeywordTokenizer(factory, input, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Core/LetterTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/LetterTokenizer.cs b/src/contrib/Analyzers/Core/LetterTokenizer.cs
new file mode 100644
index 0000000..669d8dc
--- /dev/null
+++ b/src/contrib/Analyzers/Core/LetterTokenizer.cs
@@ -0,0 +1,28 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Core
+{
+    /// <summary>
+    /// A <see cref="CharTokenizer"/> that emits maximal runs of adjacent letter
+    /// characters as tokens. Token membership is decided per Unicode code point
+    /// via <see cref="IsTokenChar(int)"/>.
+    /// </summary>
+    public class LetterTokenizer : CharTokenizer
+    {
+        /// <summary>Creates a LetterTokenizer reading from <paramref name="input"/>.</summary>
+        public LetterTokenizer(Version matchVersion, TextReader input)
+            : base(matchVersion, input)
+        {
+        }
+
+        /// <summary>Creates a LetterTokenizer using the given attribute factory.</summary>
+        public LetterTokenizer(Version matchVersion, AttributeFactory factory, TextReader input)
+            : base(matchVersion, factory, input)
+        {
+        }
+
+        /// <summary>
+        /// Returns true if the code point <paramref name="c"/> is a letter.
+        /// <paramref name="c"/> is a full Unicode code point: CharTokenizer decodes
+        /// input code-point aware, so values above the BMP (> 0xFFFF) can occur here.
+        /// </summary>
+        protected override bool IsTokenChar(int c)
+        {
+            if (c > char.MaxValue)
+            {
+                // Supplementary code point: the plain (char)c cast would truncate it
+                // to an unrelated BMP char and misclassify it. Check the surrogate
+                // pair form instead.
+                return char.IsLetter(char.ConvertFromUtf32(c), 0);
+            }
+            return char.IsLetter((char)c);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Core/LetterTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/LetterTokenizerFactory.cs b/src/contrib/Analyzers/Core/LetterTokenizerFactory.cs
new file mode 100644
index 0000000..c07a8b6
--- /dev/null
+++ b/src/contrib/Analyzers/Core/LetterTokenizerFactory.cs
@@ -0,0 +1,27 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Core
+{
+    /// <summary>
+    /// Factory for <see cref="LetterTokenizer"/>. Accepts no parameters beyond
+    /// those consumed by the <see cref="TokenizerFactory"/> base constructor
+    /// (any leftovers in <c>args</c> are rejected).
+    /// </summary>
+    public class LetterTokenizerFactory : TokenizerFactory
+    {
+        public LetterTokenizerFactory(IDictionary<String, String> args)
+            : base(args)
+        {
+            AssureMatchVersion();
+
+            if (args.Count > 0)
+            {
+                // List the leftover key/value pairs explicitly: concatenating the
+                // dictionary object itself would only print its type name, making
+                // the error useless for diagnosing a bad configuration.
+                throw new ArgumentException("Unknown parameters: " +
+                    string.Join(", ", args.Select(kvp => kvp.Key + "=" + kvp.Value).ToArray()));
+            }
+        }
+
+        /// <summary>Creates a new <see cref="LetterTokenizer"/> over <paramref name="input"/>.</summary>
+        public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
+        {
+            return new LetterTokenizer(luceneMatchVersion, factory, input);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Support/AbstractSet.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Support/AbstractSet.cs b/src/contrib/Analyzers/Support/AbstractSet.cs
new file mode 100644
index 0000000..f732d08
--- /dev/null
+++ b/src/contrib/Analyzers/Support/AbstractSet.cs
@@ -0,0 +1,120 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Support
+{
+    /// <summary>
+    /// Skeletal <see cref="ISet{T}"/> implementation modeled on Java's AbstractSet:
+    /// the primitive operations (Add, Clear, Contains, Remove, Count, GetEnumerator)
+    /// are supplied by subclasses, and the bulk set operations are built on top of
+    /// them. Declared <c>abstract</c>: the class contains abstract members, so the
+    /// original non-abstract declaration could not compile (compiler error CS0513).
+    /// </summary>
+    public abstract class AbstractSet<T> : ISet<T>
+    {
+        // Default implementation rejects the element (returns false, no-op).
+        // NOTE(review): Java's AbstractCollection.add throws
+        // UnsupportedOperationException instead - confirm callers rely on the
+        // silent no-op before changing this. Mutable subclasses override it.
+        public virtual bool Add(T item)
+        {
+            return false;
+        }
+
+        // Removes every element of 'other' from this set (in-place set difference).
+        public void ExceptWith(IEnumerable<T> other)
+        {
+            foreach (var item in other)
+            {
+                this.Remove(item);
+            }
+        }
+
+        // Retains only elements that are also present in 'other'. Iterates over a
+        // snapshot (ToList) so that Remove does not invalidate the enumeration.
+        public void IntersectWith(IEnumerable<T> other)
+        {
+            var set = new HashSet<T>(other);
+
+            foreach (var item in this.ToList())
+            {
+                if (!set.Contains(item))
+                    this.Remove(item);
+            }
+        }
+
+        // The subset/superset/overlap queries below are not needed by the port yet.
+        public bool IsProperSubsetOf(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool IsProperSupersetOf(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool IsSubsetOf(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool IsSupersetOf(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool Overlaps(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public bool SetEquals(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        public void SymmetricExceptWith(IEnumerable<T> other)
+        {
+            throw new NotImplementedException();
+        }
+
+        // Adds every element of 'other' to this set (in-place union).
+        public void UnionWith(IEnumerable<T> other)
+        {
+            foreach (var item in other)
+            {
+                this.Add(item);
+            }
+        }
+
+        // Explicit ICollection<T>.Add forwards to the bool-returning Add above.
+        void ICollection<T>.Add(T item)
+        {
+            Add(item);
+        }
+
+        public abstract void Clear();
+
+        public abstract bool Contains(T item);
+
+        // Copies elements into 'array' starting at 'arrayIndex'; stops at whichever
+        // comes first: the end of the array or the end of the set.
+        public void CopyTo(T[] array, int arrayIndex)
+        {
+            var enumerator = GetEnumerator();
+
+            for (int i = arrayIndex; i < array.Length; i++)
+            {
+                if (!enumerator.MoveNext())
+                    break;
+
+                array[i] = enumerator.Current;
+            }
+        }
+
+        public abstract int Count { get; }
+
+        public bool IsReadOnly
+        {
+            get { return false; }
+        }
+
+        public abstract bool Remove(T item);
+
+        public abstract IEnumerator<T> GetEnumerator();
+
+        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
+        {
+            return GetEnumerator();
+        }
+
+        // Java-style convenience alias for UnionWith.
+        public void AddAll(IEnumerable<T> values)
+        {
+            this.UnionWith(values);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/AnalysisSPILoader.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/AnalysisSPILoader.cs b/src/contrib/Analyzers/Util/AnalysisSPILoader.cs
new file mode 100644
index 0000000..d06a57c
--- /dev/null
+++ b/src/contrib/Analyzers/Util/AnalysisSPILoader.cs
@@ -0,0 +1,115 @@
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading;
+
+namespace Lucene.Net.Analysis.Util
+{
+    /// <summary>
+    /// Discovers <see cref="AbstractAnalysisFactory"/> implementations via
+    /// <c>SPIClassIterator</c> and resolves them by a lowercased short name,
+    /// derived by stripping one of the allowed class-name suffixes.
+    /// </summary>
+    internal sealed class AnalysisSPILoader<S>
+        where S : AbstractAnalysisFactory
+    {
+        // name (lowercased, suffix stripped) -> factory type. Rebuilt and
+        // republished as a whole by Reload(); volatile so readers always see
+        // the most recently published map.
+        private volatile IDictionary<string, Type> services = new HashMap<string, Type>();
+        private readonly Type clazz;
+        private readonly string[] suffixes;
+        // Dedicated lock object: locking 'this' would expose the monitor to callers.
+        private readonly object reloadLock = new object();
+
+        public AnalysisSPILoader(Type clazz)
+            : this(clazz, new string[] { clazz.Name })
+        {
+        }
+
+        public AnalysisSPILoader(Type clazz, string[] suffixes)
+        {
+            this.clazz = clazz;
+            this.suffixes = suffixes;
+            // if clazz' classloader is not a parent of the given one, we scan clazz's classloader, too:
+            //final ClassLoader clazzClassloader = clazz.getClassLoader();
+            //if (clazzClassloader != null && !SPIClassIterator.isParentClassLoader(clazzClassloader, classloader)) {
+            //  reload(clazzClassloader);
+            //}
+            Reload();
+        }
+
+        /// <summary>
+        /// Re-scans for service implementations and publishes the merged map.
+        /// Existing registrations win over newly discovered duplicates.
+        /// </summary>
+        public void Reload()
+        {
+            lock (reloadLock)
+            {
+                HashMap<String, Type> services =
+                    new HashMap<String, Type>(this.services);
+                SPIClassIterator<S> loader = SPIClassIterator<S>.Get();
+                foreach (var service in loader)
+                {
+                    String clazzName = service.Name;
+                    String name = null;
+                    foreach (String suffix in suffixes)
+                    {
+                        if (clazzName.EndsWith(suffix))
+                        {
+                            name = clazzName.Substring(0, clazzName.Length - suffix.Length).ToLowerInvariant();
+                            break;
+                        }
+                    }
+                    if (name == null)
+                    {
+                        throw new InvalidOperationException("The class name " + service.FullName +
+                            " has wrong suffix, allowed are: " + Arrays.ToString(suffixes));
+                    }
+                    // only add the first one for each name, later services will be ignored
+                    // this allows to place services before others in classpath to make
+                    // them used instead of others
+                    //
+                    // TODO: Should we disallow duplicate names here?
+                    // Allowing it may get confusing on collisions, as different packages
+                    // could contain same factory class, which is a naming bug!
+                    // When changing this be careful to allow reload()!
+                    if (!services.ContainsKey(name))
+                    {
+                        services[name] = service;
+                    }
+                }
+                // Publish the rebuilt map. The original port never assigned it back
+                // (the Java line 'this.services = Collections.unmodifiableMap(services)'
+                // was left commented out), so Reload() had no effect and every
+                // LookupClass call failed on an empty map.
+                this.services = services;
+            }
+        }
+
+        /// <summary>Instantiates the factory registered under <paramref name="name"/>, passing <paramref name="args"/> to its constructor.</summary>
+        public S NewInstance(string name, IDictionary<string, string> args)
+        {
+            Type service = LookupClass(name);
+            try
+            {
+                //var ctor = service.GetConstructor(new[] { typeof(IDictionary<string, string>) });
+                return (S)Activator.CreateInstance(service, args);
+            }
+            catch (Exception e)
+            {
+                throw new ArgumentException("SPI class of type " + clazz.FullName + " with name '" + name + "' cannot be instantiated. " +
+                    "This is likely due to a misconfiguration of the class '" + service.FullName + "': ", e);
+            }
+        }
+
+        /// <summary>Resolves a registered factory type by (case-insensitive) name.</summary>
+        public Type LookupClass(String name)
+        {
+            Type service = services[name.ToLowerInvariant()];
+            if (service != null)
+            {
+                return service;
+            }
+            else
+            {
+                throw new ArgumentException("A SPI class of type " + clazz.FullName + " with name '" + name + "' does not exist. " +
+                    "You need to add the corresponding assembly supporting this SPI to your application. " +
+                    "The currently loaded assemblies support the following names: " + Arrays.ToString(AvailableServices));
+            }
+        }
+
+        /// <summary>The currently registered service names (lowercased, suffix stripped).</summary>
+        public ICollection<String> AvailableServices
+        {
+            get
+            {
+                return services.Keys;
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/CharArrayMap.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharArrayMap.cs b/src/contrib/Analyzers/Util/CharArrayMap.cs
index 7297855..e124451 100644
--- a/src/contrib/Analyzers/Util/CharArrayMap.cs
+++ b/src/contrib/Analyzers/Util/CharArrayMap.cs
@@ -1,4 +1,5 @@
-using Lucene.Net.Support;
+using Lucene.Net.Analysis.Support;
+using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Linq;
@@ -9,7 +10,7 @@ namespace Lucene.Net.Analysis.Util
public class CharArrayMap<V> : IDictionary<object, V>
{
// private only because missing generics
- private static readonly CharArrayMap<V> EMPTY_MAP = new EmptyCharArrayMap<Object>();
+ internal static readonly CharArrayMap<V> EMPTY_MAP = new CharArrayMap.EmptyCharArrayMap<V>();
private const int INIT_SIZE = 8;
private readonly CharacterUtils charUtils;
@@ -34,10 +35,13 @@ namespace Lucene.Net.Analysis.Util
public CharArrayMap(Lucene.Net.Util.Version matchVersion, IDictionary<object, V> c, bool ignoreCase)
: this(matchVersion, c.Count, ignoreCase)
{
- PutAll(c);
+ foreach (var kvp in c)
+ {
+ this[kvp.Key] = kvp.Value;
+ }
}
- private CharArrayMap(CharArrayMap<V> toCopy)
+ internal CharArrayMap(CharArrayMap<V> toCopy)
{
this.keys = toCopy.keys;
this.values = toCopy.values;
@@ -47,24 +51,24 @@ namespace Lucene.Net.Analysis.Util
this.matchVersion = toCopy.matchVersion;
}
- public void Clear()
+ public virtual void Clear()
{
count = 0;
Arrays.Fill(keys, null);
Arrays.Fill(values, default(V));
}
- public bool ContainsKey(char[] text, int off, int len)
+ public virtual bool ContainsKey(char[] text, int off, int len)
{
return keys[GetSlot(text, off, len)] != null;
}
- public bool ContainsKey(ICharSequence cs)
+ public virtual bool ContainsKey(ICharSequence cs)
{
return keys[GetSlot(cs)] != null;
}
- public bool ContainsKey(Object o)
+ public virtual bool ContainsKey(Object o)
{
if (o is char[])
{
@@ -74,34 +78,35 @@ namespace Lucene.Net.Analysis.Util
return ContainsKey(o.ToString());
}
- public V Get(char[] text, int off, int len)
+ public virtual V Get(char[] text, int off, int len)
{
return values[GetSlot(text, off, len)];
}
- public V Get(ICharSequence cs)
+ public virtual V Get(ICharSequence cs)
{
return values[GetSlot(cs)];
}
+ public virtual V Get(object o)
+ {
+ if (o is char[])
+ {
+ char[] text = (char[])o;
+ return Get(text, 0, text.Length);
+ }
+ return Get(o.ToString());
+ }
+
public V this[Object o]
{
get
{
- if (o is char[])
- {
- char[] text = (char[])o;
- return Get(text, 0, text.Length);
- }
- return this[o.ToString()];
+ return Get(o);
}
set
{
- if (o is char[])
- {
- Put((char[])o, value);
- }
- Put(o.ToString(), value);
+ Put(o, value);
}
}
@@ -141,17 +146,26 @@ namespace Lucene.Net.Analysis.Util
return pos;
}
- public V Put(ICharSequence text, V value)
+ public virtual V Put(object o, V value)
+ {
+ if (o is char[])
+ {
+ return Put((char[])o, value);
+ }
+ return Put(o.ToString(), value);
+ }
+
+ public virtual V Put(ICharSequence text, V value)
{
return Put(text.ToString(), value); // could be more efficient
}
- public V Put(string text, V value)
+ public virtual V Put(string text, V value)
{
return Put(text.ToCharArray(), value);
}
- public V Put(char[] text, V value)
+ public virtual V Put(char[] text, V value)
{
if (ignoreCase)
{
@@ -300,7 +314,7 @@ namespace Lucene.Net.Analysis.Util
return code;
}
- public void Remove(object key)
+ public virtual void Remove(object key)
{
throw new NotSupportedException();
}
@@ -313,7 +327,7 @@ namespace Lucene.Net.Analysis.Util
public override string ToString()
{
StringBuilder sb = new StringBuilder("{");
- foreach (KeyValuePair<Object, V> entry in EntrySet)
+ foreach (KeyValuePair<Object, V> entry in this.GetEntrySet())
{
if (sb.Length > 1) sb.Append(", ");
sb.Append(entry);
@@ -324,28 +338,25 @@ namespace Lucene.Net.Analysis.Util
private EntrySet entrySet = null;
private CharArraySet keySet = null;
- internal EntrySet CreateEntrySet()
+ internal virtual EntrySet CreateEntrySet()
{
- return new EntrySet(true);
+ return new EntrySet(this, true);
}
- public EntrySet EntrySet
+ public EntrySet GetEntrySet()
{
- get
+ if (entrySet == null)
{
- if (entrySet == null)
- {
- entrySet = CreateEntrySet();
- }
- return entrySet;
+ entrySet = CreateEntrySet();
}
+ return entrySet;
}
internal ISet<object> OriginalKeySet
{
get
{
- return Keys;
+ return Keys as ISet<object>;
}
}
@@ -399,7 +410,7 @@ namespace Lucene.Net.Analysis.Util
private int lastPos;
private readonly bool allowModify;
- private KeyValuePair<object, V> current; // .NET Port: need to store current as IEnumerator != Iterator
+ private MapEntry current; // .NET Port: need to store current as IEnumerator != Iterator
public EntryIterator(CharArrayMap<V> parent, bool allowModify)
{
@@ -423,7 +434,7 @@ namespace Lucene.Net.Analysis.Util
return true;
}
- current = new MapEntry(lastPos, allowModify);
+ current = new MapEntry(parent, lastPos, allowModify);
return false;
}
@@ -454,10 +465,10 @@ namespace Lucene.Net.Analysis.Util
parent.values[lastPos] = value;
return old;
}
-
+
public KeyValuePair<object, V> Current
{
- get { return current; }
+ get { return current.AsKeyValuePair(); }
}
public void Dispose()
@@ -475,6 +486,224 @@ namespace Lucene.Net.Analysis.Util
}
}
-
+ private sealed class MapEntry // : KeyValuePair<object, V> -- this doesn't work in .NET as KVP is a struct, so we wrap it instead
+ {
+ private readonly CharArrayMap<V> parent;
+ private readonly int pos;
+ private readonly bool allowModify;
+
+ public MapEntry(CharArrayMap<V> parent, int pos, bool allowModify)
+ {
+ this.parent = parent;
+ this.pos = pos;
+ this.allowModify = allowModify;
+ }
+
+ public object Key
+ {
+ get
+ {
+ // we must clone here, as putAll to another CharArrayMap
+ // with other case sensitivity flag would corrupt the keys
+ return parent.keys[pos].Clone();
+ }
+ }
+
+ public V Value
+ {
+ get
+ {
+ return parent.values[pos];
+ }
+ set
+ {
+ if (!allowModify)
+ throw new NotSupportedException();
+
+ parent.values[pos] = value;
+ }
+ }
+
+ public override string ToString()
+ {
+ return new StringBuilder().Append(parent.keys[pos]).Append('=')
+ .Append((parent.values[pos].Equals(parent)) ? "(this Map)" : parent.values[pos].ToString())
+ .ToString();
+ }
+
+ public KeyValuePair<object, V> AsKeyValuePair()
+ {
+ return new KeyValuePair<object, V>(Key, Value);
+ }
+ }
+
+ public sealed class EntrySet : AbstractSet<KeyValuePair<object, V>>
+ {
+ private readonly CharArrayMap<V> parent;
+ private readonly bool allowModify;
+
+ public EntrySet(CharArrayMap<V> parent, bool allowModify)
+ {
+ this.parent = parent;
+ this.allowModify = allowModify;
+ }
+
+ public override IEnumerator<KeyValuePair<object, V>> GetEnumerator()
+ {
+ return new EntryIterator(parent, allowModify);
+ }
+
+ public override bool Contains(KeyValuePair<object, V> e)
+ {
+ //if (!(o instanceof Map.Entry))
+ // return false;
+ //Map.Entry<Object,V> e = (Map.Entry<Object,V>)o;
+ Object key = e.Key;
+ Object val = e.Value;
+ Object v = parent[key];
+ return v == null ? val == null : v.Equals(val);
+ }
+
+ public override bool Remove(KeyValuePair<object, V> item)
+ {
+ throw new NotSupportedException();
+ }
+
+ public override int Count
+ {
+ get { return parent.count; }
+ }
+
+ public override void Clear()
+ {
+ if (!allowModify)
+ throw new NotSupportedException();
+ parent.Clear();
+ }
+ }
+ }
+
+ // .NET Port: non-generic static class to hold nested types and static methods
+ public static class CharArrayMap
+ {
+ public static CharArrayMap<V> UnmodifiableMap<V>(CharArrayMap<V> map)
+ {
+ if (map == null)
+ throw new NullReferenceException("Given map is null");
+ if (map == EmptyMap<V>() || map.Count == 0)
+ return EmptyMap<V>();
+ if (map is UnmodifiableCharArrayMap<V>)
+ return map;
+ return new UnmodifiableCharArrayMap<V>(map);
+ }
+
+ public static CharArrayMap<V> Copy<V>(Lucene.Net.Util.Version matchVersion, IDictionary<object, V> map)
+ {
+ if (map == CharArrayMap<V>.EMPTY_MAP)
+ return EmptyMap<V>();
+ if (map is CharArrayMap<V>)
+ {
+ CharArrayMap<V> m = (CharArrayMap<V>)map;
+ // use fast path instead of iterating all values
+ // this is even on very small sets ~10 times faster than iterating
+ char[][] keys = new char[m.keys.Length][];
+ Array.Copy(m.keys, 0, keys, 0, keys.Length);
+ V[] values = new V[m.values.Length];
+ Array.Copy(m.values, 0, values, 0, values.Length);
+ m = new CharArrayMap<V>(m);
+ m.keys = keys;
+ m.values = values;
+ return m;
+ }
+ return new CharArrayMap<V>(matchVersion, map, false);
+ }
+
+ public static CharArrayMap<V> EmptyMap<V>()
+ {
+ return CharArrayMap<V>.EMPTY_MAP;
+ }
+
+ internal class UnmodifiableCharArrayMap<V> : CharArrayMap<V>
+ {
+ public UnmodifiableCharArrayMap(CharArrayMap<V> map)
+ : base(map)
+ {
+ }
+
+ public override void Clear()
+ {
+ throw new NotSupportedException();
+ }
+
+ public override V Put(char[] text, V value)
+ {
+ throw new NotSupportedException();
+ }
+
+ public override V Put(ICharSequence text, V value)
+ {
+ throw new NotSupportedException();
+ }
+
+ public override V Put(string text, V value)
+ {
+ throw new NotSupportedException();
+ }
+
+ public override void Remove(object key)
+ {
+ throw new NotSupportedException();
+ }
+
+ internal override CharArrayMap<V>.EntrySet CreateEntrySet()
+ {
+ throw new NotSupportedException();
+ }
+ }
+
+ internal sealed class EmptyCharArrayMap<V> : UnmodifiableCharArrayMap<V>
+ {
+ public EmptyCharArrayMap()
+ : base(new CharArrayMap<V>(Lucene.Net.Util.Version.LUCENE_CURRENT, 0, false))
+ {
+ }
+
+ public override bool ContainsKey(char[] text, int off, int len)
+ {
+ if (text == null)
+ throw new NullReferenceException();
+ return false;
+ }
+
+ public override bool ContainsKey(ICharSequence cs)
+ {
+ if (cs == null)
+ throw new NullReferenceException();
+ return false;
+ }
+
+ public override bool ContainsKey(object o)
+ {
+ if (o == null)
+ throw new NullReferenceException();
+ return false;
+ }
+
+ public override V Get(char[] text, int off, int len)
+ {
+ if (text == null)
+ throw new NullReferenceException();
+ return default(V);
+ }
+
+ public override V Get(ICharSequence cs)
+ {
+ if (cs == null)
+ throw new NullReferenceException();
+ return default(V);
+ }
+
+
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/CharArraySet.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharArraySet.cs b/src/contrib/Analyzers/Util/CharArraySet.cs
index 6dcc486..522bcaa 100644
--- a/src/contrib/Analyzers/Util/CharArraySet.cs
+++ b/src/contrib/Analyzers/Util/CharArraySet.cs
@@ -1,15 +1,125 @@
-using System;
+using Lucene.Net.Analysis.Support;
+using Lucene.Net.Support;
+using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Lucene.Net.Analysis.Util
{
- public class CharArraySet : ISet<object>
+ public class CharArraySet : AbstractSet<object>
{
- public static readonly CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.<Object>emptyMap());
- private static readonly object PLACEHOLDER = new object();
-
- private readonly CharArrayMap<Object> map;
+ public static readonly CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.EmptyMap<object>());
+ private static readonly object PLACEHOLDER = new object();
+
+ private readonly CharArrayMap<object> map;
+
+ public CharArraySet(Lucene.Net.Util.Version matchVersion, int startSize, bool ignoreCase)
+ : this(new CharArrayMap<Object>(matchVersion, startSize, ignoreCase))
+ {
+ }
+
+ public CharArraySet(Lucene.Net.Util.Version matchVersion, ICollection<object> c, bool ignoreCase)
+ : this(matchVersion, c.Count, ignoreCase)
+ {
+ AddAll(c);
+ }
+
+ internal CharArraySet(CharArrayMap<Object> map)
+ {
+ this.map = map;
+ }
+
+ public override void Clear()
+ {
+ map.Clear();
+ }
+
+ public bool Contains(char[] text, int off, int len)
+ {
+ return map.ContainsKey(text, off, len);
+ }
+
+ public bool Contains(ICharSequence cs)
+ {
+ return map.ContainsKey(cs);
+ }
+
+ public override bool Contains(object o)
+ {
+ return map.ContainsKey(o);
+ }
+
+ public override bool Add(object o)
+ {
+ return map.Put(o, PLACEHOLDER) == null;
+ }
+
+ public bool Add(ICharSequence text)
+ {
+ return map.Put(text, PLACEHOLDER) == null;
+ }
+
+ public bool Add(string text)
+ {
+ return map.Put(text, PLACEHOLDER) == null;
+ }
+
+ public bool Add(char[] text)
+ {
+ return map.Put(text, PLACEHOLDER) == null;
+ }
+
+ public override int Count
+ {
+ get { return map.Count; }
+ }
+
+ public static CharArraySet UnmodifiableSet(CharArraySet set)
+ {
+ if (set == null)
+ throw new NullReferenceException("Given set is null");
+ if (set == EMPTY_SET)
+ return EMPTY_SET;
+ if (set.map is CharArrayMap.UnmodifiableCharArrayMap<object>)
+ return set;
+ return new CharArraySet(CharArrayMap.UnmodifiableMap(set.map));
+ }
+
+ public static CharArraySet Copy(Lucene.Net.Util.Version matchVersion, ICollection<object> set)
+ {
+ if (set == EMPTY_SET)
+ return EMPTY_SET;
+ if (set is CharArraySet)
+ {
+ CharArraySet source = (CharArraySet)set;
+ return new CharArraySet(CharArrayMap.Copy(source.map.matchVersion, source.map));
+ }
+ return new CharArraySet(matchVersion, set, false);
+ }
+
+ public override IEnumerator<object> GetEnumerator()
+ {
+ // use the AbstractSet#keySet()'s iterator (to not produce endless recursion)
+ return map.OriginalKeySet.GetEnumerator();
+ }
+
+ public override string ToString()
+ {
+ StringBuilder sb = new StringBuilder("[");
+ foreach (Object item in this)
+ {
+ if (sb.Length > 1) sb.Append(", ");
+ if (item is char[])
+ {
+ sb.Append((char[])item);
+ }
+ else
+ {
+ sb.Append(item);
+ }
+ }
+ return sb.Append(']').ToString();
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/CharTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharTokenizer.cs b/src/contrib/Analyzers/Util/CharTokenizer.cs
new file mode 100644
index 0000000..b0029fa
--- /dev/null
+++ b/src/contrib/Analyzers/Util/CharTokenizer.cs
@@ -0,0 +1,124 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Util
+{
+ /// <summary>
+ /// Abstract base for tokenizers that split text on a per-code-point predicate.
+ /// Subclasses implement IsTokenChar(int) to decide which code points belong
+ /// inside a token, and may override Normalize(int) to transform each accepted
+ /// code point. Input is decoded code-point aware via CharacterUtils
+ /// (CodePointAt / Character.CharCount below), so supplementary characters
+ /// above U+FFFF are handled as single logical characters.
+ /// </summary>
+ public abstract class CharTokenizer : Tokenizer
+ {
+ /// <summary>Creates a CharTokenizer over input, using the default attribute factory.</summary>
+ public CharTokenizer(Version matchVersion, TextReader input)
+ : base(input)
+ {
+ charUtils = CharacterUtils.GetInstance(matchVersion);
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ }
+
+ /// <summary>Creates a CharTokenizer over input, using the supplied attribute factory.</summary>
+ public CharTokenizer(Version matchVersion, AttributeFactory factory, TextReader input)
+ : base(factory, input)
+ {
+ charUtils = CharacterUtils.GetInstance(matchVersion);
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ }
+
+ // Scan state: 'offset' counts chars consumed from the reader before the
+ // current ioBuffer, 'bufferIndex' is the read position inside ioBuffer,
+ // 'dataLen' is the number of valid chars in ioBuffer, and 'finalOffset'
+ // is the corrected end-of-stream offset reported by End().
+ // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
+ private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
+ // A token is emitted as soon as it reaches MAX_WORD_LEN chars; the
+ // remaining token characters start the next token.
+ private const int MAX_WORD_LEN = 255;
+ private const int IO_BUFFER_SIZE = 4096;
+
+ private readonly ICharTermAttribute termAtt; // = addAttribute(CharTermAttribute.class);
+ private readonly IOffsetAttribute offsetAtt; // = addAttribute(OffsetAttribute.class);
+
+ private readonly CharacterUtils charUtils;
+ private readonly CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.NewCharacterBuffer(IO_BUFFER_SIZE);
+
+ // Returns true if the Unicode code point c belongs inside a token.
+ protected abstract bool IsTokenChar(int c);
+
+ // Transforms an accepted code point before it is buffered; identity by default.
+ protected virtual int Normalize(int c)
+ {
+ return c;
+ }
+
+ /// <summary>
+ /// Advances to the next maximal run of token characters, buffering the
+ /// normalized code points into the term attribute and recording corrected
+ /// start/end offsets. Returns false when the input is exhausted.
+ /// </summary>
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ int length = 0;
+ int start = -1; // this variable is always initialized
+ int end = -1;
+ char[] buffer = termAtt.Buffer;
+ while (true)
+ {
+ // Refill ioBuffer when the current chunk is consumed.
+ if (bufferIndex >= dataLen)
+ {
+ offset += dataLen;
+ if (!charUtils.Fill(ioBuffer, input))
+ { // read supplementary char aware with CharacterUtils
+ dataLen = 0; // so next offset += dataLen won't decrement offset
+ if (length > 0)
+ {
+ // End of input, but a token was in progress: emit it.
+ break;
+ }
+ else
+ {
+ // End of input with no pending token: record final offset.
+ finalOffset = CorrectOffset(offset);
+ return false;
+ }
+ }
+ dataLen = ioBuffer.Length;
+ bufferIndex = 0;
+ }
+ // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
+ int c = charUtils.CodePointAt(ioBuffer.Buffer, bufferIndex);
+ int charCount = Character.CharCount(c);
+ bufferIndex += charCount;
+
+ if (IsTokenChar(c))
+ { // if it's a token char
+ if (length == 0)
+ { // start of token
+ //assert start == -1;
+ start = offset + bufferIndex - charCount;
+ end = start;
+ }
+ else if (length >= buffer.Length - 1)
+ { // check if a supplementary could run out of bounds
+ buffer = termAtt.ResizeBuffer(2 + length); // make sure a supplementary fits in the buffer
+ }
+ end += charCount;
+ length += Character.ToChars(Normalize(c), buffer, length); // buffer it, normalized
+ if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
+ break;
+ }
+ else if (length > 0) // at non-Letter w/ chars
+ break; // return 'em
+ }
+
+ termAtt.SetLength(length);
+ //assert start != -1;
+ offsetAtt.SetOffset(CorrectOffset(start), finalOffset = CorrectOffset(end));
+ return true;
+
+ }
+
+ // Reports the corrected end-of-stream offset after the last token.
+ // NOTE(review): the Java original calls super.end() here - confirm whether
+ // this port's Tokenizer.End() needs invoking as well.
+ public override void End()
+ {
+ // set final offset
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ // Resets all scan state so the tokenizer can be reused on a new reader.
+ public override void Reset()
+ {
+ bufferIndex = 0;
+ offset = 0;
+ dataLen = 0;
+ finalOffset = 0;
+ ioBuffer.Reset(); // make sure to reset the IO buffer!!
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs b/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
new file mode 100644
index 0000000..f6e9194
--- /dev/null
+++ b/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
@@ -0,0 +1,80 @@
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Util
+{
/// <summary>
/// Base class for <see cref="Analyzer"/> implementations that use a stopword set.
/// </summary>
public abstract class StopwordAnalyzerBase : Analyzer
{
    // Never null: an empty set is substituted when no stopwords are supplied.
    protected readonly CharArraySet stopwords;

    // Lucene match version used when copying/normalizing the stopword set.
    protected readonly Version matchVersion;

    /// <summary>
    /// Gets the analyzer's stopword set, or an empty set if the analyzer has none.
    /// The returned set is unmodifiable.
    /// </summary>
    public CharArraySet StopwordSet
    {
        get
        {
            return stopwords;
        }
    }

    /// <summary>
    /// Creates a new instance initialized with the given stopword set.
    /// </summary>
    /// <param name="version">Lucene version for cross-version compatibility.</param>
    /// <param name="stopwords">The analyzer's stopwords; may be null, in which case
    /// an empty set is used.</param>
    protected StopwordAnalyzerBase(Version version, CharArraySet stopwords)
    {
        matchVersion = version;
        // Analyzers should use CharArraySet for stopwords; copy and wrap the
        // caller's set so it cannot be mutated after construction.
        this.stopwords = stopwords == null
            ? CharArraySet.EMPTY_SET
            : CharArraySet.UnmodifiableSet(CharArraySet.Copy(version, stopwords));
    }

    /// <summary>
    /// Creates a new instance with an empty stopword set.
    /// </summary>
    protected StopwordAnalyzerBase(Version version)
        : this(version, null)
    {
    }

    /// <summary>
    /// Creates a <see cref="CharArraySet"/> from an embedded resource of the
    /// assembly that declares <paramref name="aClass"/>. Lines starting with
    /// <paramref name="comment"/> are skipped.
    /// </summary>
    /// <exception cref="IOException">If the named resource does not exist.</exception>
    protected static CharArraySet LoadStopwordSet(bool ignoreCase, Type aClass, string resource, string comment)
    {
        TextReader reader = null;
        try
        {
            // GetManifestResourceStream returns null rather than throwing when
            // the resource is missing; fail with a clear message instead of a
            // NullReferenceException further down.
            Stream stream = aClass.Assembly.GetManifestResourceStream(resource);
            if (stream == null)
            {
                throw new IOException("Embedded resource not found: " + resource);
            }
            reader = IOUtils.GetDecodingReader(stream, IOUtils.CHARSET_UTF_8);
            // NOTE(review): LUCENE_31 is hard-coded (presumably to parse bundled
            // stopword resources with fixed semantics, mirroring the Java code) —
            // confirm against upstream before changing.
            return WordlistLoader.GetWordSet(reader, comment, new CharArraySet(Version.LUCENE_31, 16, ignoreCase));
        }
        finally
        {
            IOUtils.Close(reader);
        }
    }

    /// <summary>
    /// Creates a <see cref="CharArraySet"/> from a UTF-8 encoded stream, one word
    /// per line. The decoding reader is closed when loading completes.
    /// </summary>
    protected static CharArraySet LoadStopwordSet(Stream stopwords, Version matchVersion)
    {
        TextReader reader = null;
        try
        {
            reader = IOUtils.GetDecodingReader(stopwords, IOUtils.CHARSET_UTF_8);
            return WordlistLoader.GetWordSet(reader, matchVersion);
        }
        finally
        {
            IOUtils.Close(reader);
        }
    }

    /// <summary>
    /// Creates a <see cref="CharArraySet"/> from a reader, one word per line.
    /// The reader is closed when loading completes.
    /// </summary>
    protected static CharArraySet LoadStopwordSet(TextReader stopwords, Version matchVersion)
    {
        try
        {
            return WordlistLoader.GetWordSet(stopwords, matchVersion);
        }
        finally
        {
            IOUtils.Close(stopwords);
        }
    }

    public abstract override Analyzer.TokenStreamComponents CreateComponents(string fieldName, System.IO.TextReader reader);
}
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/TokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/TokenizerFactory.cs b/src/contrib/Analyzers/Util/TokenizerFactory.cs
index 2fb600f..ea6892d 100644
--- a/src/contrib/Analyzers/Util/TokenizerFactory.cs
+++ b/src/contrib/Analyzers/Util/TokenizerFactory.cs
@@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
+using System.IO;
using System.Linq;
using System.Text;
@@ -7,5 +8,42 @@ namespace Lucene.Net.Analysis.Util
{
public abstract class TokenizerFactory : AbstractAnalysisFactory
{
// Single shared SPI loader; the registry of tokenizer factories is static.
private static readonly AnalysisSPILoader<TokenizerFactory> factoryLoader =
    new AnalysisSPILoader<TokenizerFactory>(typeof(TokenizerFactory));

/// <summary>
/// Initializes the factory with the given key/value argument map.
/// </summary>
protected TokenizerFactory(IDictionary<string, string> args)
    : base(args)
{
}

/// <summary>
/// Looks up and instantiates the TokenizerFactory registered under
/// <paramref name="name"/>, passing it <paramref name="args"/>.
/// </summary>
public static TokenizerFactory ForName(String name, IDictionary<String, String> args)
{
    return factoryLoader.NewInstance(name, args);
}

/// <summary>
/// Looks up the implementation <see cref="Type"/> registered under
/// <paramref name="name"/>.
/// </summary>
public static Type LookupClass(String name)
{
    return factoryLoader.LookupClass(name);
}

/// <summary>
/// The names of all tokenizer factories currently known to the loader.
/// </summary>
public static ICollection<String> AvailableTokenizers
{
    get { return factoryLoader.AvailableServices; }
}

/// <summary>
/// Asks the SPI loader to rebuild its registry of tokenizer factories.
/// </summary>
public static void ReloadTokenizers()
{
    factoryLoader.Reload();
}

/// <summary>
/// Creates a Tokenizer over <paramref name="input"/> using the default
/// attribute factory.
/// </summary>
public Tokenizer Create(TextReader input)
{
    return Create(Lucene.Net.Util.AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input);
}

/// <summary>
/// Creates a Tokenizer over <paramref name="input"/> using the given
/// attribute factory.
/// </summary>
public abstract Tokenizer Create(Lucene.Net.Util.AttributeSource.AttributeFactory factory, TextReader input);
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4cc8ff0e/src/contrib/Analyzers/Util/WordlistLoader.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/WordlistLoader.cs b/src/contrib/Analyzers/Util/WordlistLoader.cs
new file mode 100644
index 0000000..e78ea9b
--- /dev/null
+++ b/src/contrib/Analyzers/Util/WordlistLoader.cs
@@ -0,0 +1,155 @@
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Util
+{
/// <summary>
/// Loader for text files that represent lists of words (one word per line) or
/// simple tab-separated word/stem tables.
/// </summary>
public static class WordlistLoader
{
    private const int INITIAL_CAPACITY = 16;

    // Hoisted so the patterns are compiled once instead of on every call.
    private static readonly Regex WhitespacePattern = new Regex("\\s+", RegexOptions.Compiled);
    private static readonly Regex TabPattern = new Regex("\t", RegexOptions.Compiled);

    /// <summary>
    /// Reads every line from <paramref name="reader"/>, trims it, and adds it to
    /// <paramref name="result"/>. The reader is NOT closed; that is the caller's
    /// responsibility.
    /// </summary>
    /// <returns>The populated <paramref name="result"/> set.</returns>
    public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
    {
        string word;
        while ((word = reader.ReadLine()) != null)
        {
            result.Add(word.Trim());
        }
        return result;
    }

    /// <summary>
    /// Reads a word list into a new CharArraySet (case-sensitive).
    /// </summary>
    public static CharArraySet GetWordSet(TextReader reader, Lucene.Net.Util.Version matchVersion)
    {
        return GetWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
    }

    /// <summary>
    /// Reads a word list into a new CharArraySet, skipping lines that start with
    /// <paramref name="comment"/>.
    /// </summary>
    public static CharArraySet GetWordSet(TextReader reader, String comment, Lucene.Net.Util.Version matchVersion)
    {
        return GetWordSet(reader, comment, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
    }

    /// <summary>
    /// Reads every line from <paramref name="reader"/> into <paramref name="result"/>,
    /// skipping lines that start with <paramref name="comment"/>. The reader is NOT
    /// closed by this method.
    /// </summary>
    public static CharArraySet GetWordSet(TextReader reader, String comment, CharArraySet result)
    {
        string word;
        while ((word = reader.ReadLine()) != null)
        {
            // Ordinal comparison: the default StartsWith(string) overload is
            // culture-sensitive in .NET and could mis-detect the comment prefix.
            if (!word.StartsWith(comment, StringComparison.Ordinal))
            {
                result.Add(word.Trim());
            }
        }
        return result;
    }

    /// <summary>
    /// Reads stopwords in Snowball format: '|' starts a comment that runs to end of
    /// line, and a line may contain multiple whitespace-separated words. The reader
    /// is NOT closed by this method.
    /// </summary>
    public static CharArraySet GetSnowballWordSet(TextReader reader, CharArraySet result)
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            int comment = line.IndexOf('|');
            if (comment >= 0) line = line.Substring(0, comment);
            string[] words = WhitespacePattern.Split(line);
            for (int i = 0; i < words.Length; i++)
            {
                if (words[i].Length > 0) result.Add(words[i]);
            }
        }
        return result;
    }

    /// <summary>
    /// Reads a Snowball-format word list into a new CharArraySet (case-sensitive).
    /// </summary>
    public static CharArraySet GetSnowballWordSet(TextReader reader, Lucene.Net.Util.Version matchVersion)
    {
        return GetSnowballWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
    }

    /// <summary>
    /// Reads a tab-separated stem dictionary ("word\tstem" per line) into
    /// <paramref name="result"/>. The reader is NOT closed by this method.
    /// </summary>
    public static CharArrayMap<String> GetStemDict(TextReader reader, CharArrayMap<String> result)
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // Split into at most two fields so stems may themselves contain tabs.
            // NOTE(review): a line with no tab yields a single field and throws
            // IndexOutOfRangeException — presumably input is trusted; confirm.
            string[] wordstem = TabPattern.Split(line, 2);
            result.Put(wordstem[0], wordstem[1]);
        }
        return result;
    }

    /// <summary>
    /// Reads all non-blank, non-comment ('#'-prefixed) lines from
    /// <paramref name="stream"/> using <paramref name="charset"/>, trimming each
    /// and stripping a leading BOM from the first line. The stream's reader is
    /// closed on success; on failure it is closed while suppressing secondary
    /// exceptions so the original error propagates.
    /// </summary>
    public static IList<String> GetLines(Stream stream, Encoding charset)
    {
        TextReader input = null;
        List<string> lines;
        bool success = false;
        try
        {
            input = IOUtils.GetDecodingReader(stream, charset);

            lines = new List<string>();
            string word;
            while ((word = input.ReadLine()) != null)
            {
                // Skip an initial BOM marker on the very first line.
                if (lines.Count == 0 && word.Length > 0 && word[0] == '\uFEFF')
                    word = word.Substring(1);
                // Skip comments (ordinal check; see GetWordSet).
                if (word.StartsWith("#", StringComparison.Ordinal)) continue;
                word = word.Trim();
                // Skip blank lines.
                if (word.Length == 0) continue;
                lines.Add(word);
            }
            success = true;
            return lines;
        }
        finally
        {
            if (success)
            {
                IOUtils.Close(input);
            }
            else
            {
                IOUtils.CloseWhileHandlingException((IDisposable)input);
            }
        }
    }
}
+}