You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2011/11/21 05:44:59 UTC
[Lucene.Net] svn commit: r1204353 [7/9] - in
/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src:
contrib/Analyzers/ contrib/Analyzers/AR/ contrib/Analyzers/BR/
contrib/Analyzers/CJK/ contrib/Analyzers/Cn/ contrib/Analyzers/Compound/
contrib/Analyzers/Compoun...
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TypeAsPayloadTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TypeAsPayloadTokenFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TypeAsPayloadTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TypeAsPayloadTokenFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,44 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+    /// <summary>
+    /// Token filter that stores each token's type string as the token's payload.
+    /// The type is converted to bytes with <see cref="System.Text.Encoding.UTF8"/>.
+    /// When the type is null or empty the payload attribute is not written.
+    /// </summary>
+    public class TypeAsPayloadTokenFilter : TokenFilter
+    {
+        private readonly PayloadAttribute payloadAtt;
+        private readonly TypeAttribute typeAtt;
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> and registers the payload and type attributes
+        /// that <see cref="IncrementToken"/> works with.
+        /// </summary>
+        /// <param name="input">The token stream to decorate.</param>
+        public TypeAsPayloadTokenFilter(TokenStream input)
+            : base(input)
+        {
+            payloadAtt = AddAttribute<PayloadAttribute>();
+            typeAtt = AddAttribute<TypeAttribute>();
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream by one token and, if that token has a non-empty
+        /// type, sets the payload to the UTF-8 bytes of the type.
+        /// </summary>
+        /// <returns>true if a token was produced; false once the wrapped stream is exhausted.</returns>
+        public sealed override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+            {
+                return false;
+            }
+
+            String tokenType = typeAtt.Type();
+            if (!String.IsNullOrEmpty(tokenType))
+            {
+                payloadAtt.SetPayload(new Payload(Encoding.UTF8.GetBytes(tokenType)));
+            }
+            return true;
+        }
+    }
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/PositionFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/PositionFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/PositionFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/PositionFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,76 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analysis.Position
+{
+    /// <summary>
+    /// Sets the position increment of every token to a fixed value, except for the first
+    /// token returned, which keeps its original position increment.
+    /// The fixed value defaults to zero.
+    /// </summary>
+    public sealed class PositionFilter : TokenFilter
+    {
+        /// <summary>Position increment assigned to all but the first token; zero unless overridden.</summary>
+        private readonly int positionIncrement = 0;
+
+        /// <summary>
+        /// Becomes true once the first token has been passed through untouched;
+        /// cleared again by <see cref="Reset"/>.
+        /// </summary>
+        private bool firstTokenPositioned = false;
+
+        private readonly PositionIncrementAttribute posIncrAtt;
+
+        /// <summary>
+        /// Constructs a PositionFilter that assigns a position increment of zero to
+        /// all but the first token from the given input stream.
+        /// </summary>
+        /// <param name="input">The input stream.</param>
+        public PositionFilter(TokenStream input)
+            : base(input)
+        {
+            posIncrAtt = AddAttribute<PositionIncrementAttribute>();
+        }
+
+        /// <summary>
+        /// Constructs a PositionFilter that assigns the given position increment to
+        /// all but the first token from the given input stream.
+        /// </summary>
+        /// <param name="input">The input stream.</param>
+        /// <param name="positionIncrement">
+        /// Position increment to assign to all but the first token from the input stream.
+        /// </param>
+        public PositionFilter(TokenStream input, int positionIncrement)
+            : this(input)
+        {
+            this.positionIncrement = positionIncrement;
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream. The first token after construction or a reset is
+        /// left as is; every later token gets the configured position increment.
+        /// </summary>
+        /// <returns>true if a token was produced; false once the wrapped stream is exhausted.</returns>
+        public sealed override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+            {
+                return false;
+            }
+
+            if (!firstTokenPositioned)
+            {
+                // First token: keep its own increment and only remember that we saw it.
+                firstTokenPositioned = true;
+            }
+            else
+            {
+                posIncrAtt.SetPositionIncrement(positionIncrement);
+            }
+            return true;
+        }
+
+        /// <summary>
+        /// Resets the base filter and forgets that a first token was already seen, so the
+        /// next token returned keeps its original increment again.
+        /// </summary>
+        public override void Reset()
+        {
+            base.Reset();
+            firstTokenPositioned = false;
+        }
+    }
+}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,279 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Index;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Query
+{
+    /// <summary>
+    /// An <see cref="Analyzer"/> used primarily at query time to wrap another analyzer and provide
+    /// a layer of protection which prevents very common words from being passed into queries.
+    /// <para>
+    /// For very large indexes the cost of reading TermDocs for a very common word can be high.
+    /// This analyzer was created after experience with a 38 million doc index which had a term in
+    /// around 50% of docs and was causing TermQueries for this term to take 2 seconds.
+    /// </para>
+    /// <para>
+    /// Use the various AddStopWords methods in this class to automate the identification and
+    /// addition of stop words found in an already existing index.
+    /// </para>
+    /// </summary>
+    public class QueryAutoStopWordAnalyzer : Analyzer
+    {
+        private readonly Analyzer _delegate;
+
+        /// <summary>Stop words collected so far, keyed by field name.</summary>
+        private readonly HashMap<String, HashSet<String>> stopWordsPerField = new HashMap<String, HashSet<String>>();
+
+        /// <summary>
+        /// The default maximum percentage (40%) of index documents which can contain a term,
+        /// after which the term is considered to be a stop word.
+        /// </summary>
+        public const float defaultMaxDocFreqPercent = 0.4f;
+
+        private readonly Version matchVersion;
+
+        /// <summary>
+        /// Initializes this analyzer with the Analyzer object that actually produces the tokens.
+        /// </summary>
+        /// <param name="matchVersion">
+        /// Version passed to <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> when a
+        /// stop filter is created.
+        /// </param>
+        /// <param name="_delegate">
+        /// The <see cref="Analyzer"/> that is used to produce the token stream which needs filtering.
+        /// </param>
+        public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer _delegate)
+        {
+            this._delegate = _delegate;
+            SetOverridesTokenStreamMethod(typeof(QueryAutoStopWordAnalyzer));
+            this.matchVersion = matchVersion;
+        }
+
+        /// <summary>
+        /// Automatically adds stop words for all indexed fields with terms exceeding
+        /// <see cref="defaultMaxDocFreqPercent"/>.
+        /// </summary>
+        /// <param name="reader">
+        /// The <see cref="IndexReader"/> which will be consulted to identify potential stop words
+        /// that exceed the required document frequency.
+        /// </param>
+        /// <returns>The number of stop words identified.</returns>
+        public int AddStopWords(IndexReader reader)
+        {
+            return AddStopWords(reader, defaultMaxDocFreqPercent);
+        }
+
+        /// <summary>
+        /// Automatically adds stop words for all indexed fields with terms whose document
+        /// frequency exceeds <paramref name="maxDocFreq"/>.
+        /// </summary>
+        /// <param name="reader">
+        /// The <see cref="IndexReader"/> which will be consulted to identify potential stop words
+        /// that exceed the required document frequency.
+        /// </param>
+        /// <param name="maxDocFreq">
+        /// The maximum number of index documents which can contain a term, after which the term
+        /// is considered to be a stop word.
+        /// </param>
+        /// <returns>The number of stop words identified.</returns>
+        public int AddStopWords(IndexReader reader, int maxDocFreq)
+        {
+            int numStopWords = 0;
+            ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
+            foreach (String fieldName in fieldNames)
+            {
+                numStopWords += AddStopWords(reader, fieldName, maxDocFreq);
+            }
+            return numStopWords;
+        }
+
+        /// <summary>
+        /// Automatically adds stop words for all indexed fields with terms exceeding
+        /// <paramref name="maxPercentDocs"/>.
+        /// </summary>
+        /// <param name="reader">
+        /// The <see cref="IndexReader"/> which will be consulted to identify potential stop words
+        /// that exceed the required document frequency.
+        /// </param>
+        /// <param name="maxPercentDocs">
+        /// The maximum percentage (between 0.0 and 1.0) of index documents which contain a term,
+        /// after which the word is considered to be a stop word.
+        /// </param>
+        /// <returns>The number of stop words identified.</returns>
+        public int AddStopWords(IndexReader reader, float maxPercentDocs)
+        {
+            int numStopWords = 0;
+            ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
+            foreach (String fieldName in fieldNames)
+            {
+                numStopWords += AddStopWords(reader, fieldName, maxPercentDocs);
+            }
+            return numStopWords;
+        }
+
+        /// <summary>
+        /// Automatically adds stop words for the given field with terms exceeding
+        /// <paramref name="maxPercentDocs"/>. The percentage is converted to an absolute
+        /// document count using <c>reader.NumDocs()</c>, truncated to an integer.
+        /// </summary>
+        /// <param name="reader">
+        /// The <see cref="IndexReader"/> which will be consulted to identify potential stop words
+        /// that exceed the required document frequency.
+        /// </param>
+        /// <param name="fieldName">The field for which stop words will be added.</param>
+        /// <param name="maxPercentDocs">
+        /// The maximum percentage (between 0.0 and 1.0) of index documents which contain a term,
+        /// after which the word is considered to be a stop word.
+        /// </param>
+        /// <returns>The number of stop words identified.</returns>
+        public int AddStopWords(IndexReader reader, String fieldName, float maxPercentDocs)
+        {
+            return AddStopWords(reader, fieldName, (int) (reader.NumDocs() * maxPercentDocs));
+        }
+
+        /// <summary>
+        /// Automatically adds stop words for the given field with terms whose document frequency
+        /// exceeds <paramref name="maxDocFreq"/>. Calling this again for the same field replaces
+        /// the previously collected stop words for that field.
+        /// </summary>
+        /// <param name="reader">
+        /// The <see cref="IndexReader"/> which will be consulted to identify potential stop words
+        /// that exceed the required document frequency.
+        /// </param>
+        /// <param name="fieldName">The field for which stop words will be added.</param>
+        /// <param name="maxDocFreq">
+        /// The maximum number of index documents which can contain a term, after which the term
+        /// is considered to be a stop word.
+        /// </param>
+        /// <returns>The number of stop words identified.</returns>
+        public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq)
+        {
+            HashSet<String> stopWords = new HashSet<String>();
+            String internedFieldName = StringHelper.Intern(fieldName);
+            TermEnum te = reader.Terms(new Term(fieldName));
+            try
+            {
+                for (Term term = te.Term(); term != null; term = te.Term())
+                {
+                    if (term.Field() != internedFieldName)
+                    {
+                        // The enumeration has moved on to another field.
+                        break;
+                    }
+                    if (te.DocFreq() > maxDocFreq)
+                    {
+                        stopWords.Add(term.Text());
+                    }
+                    if (!te.Next())
+                    {
+                        break;
+                    }
+                }
+            }
+            finally
+            {
+                // Release the enumeration even when reading the index fails part-way.
+                te.Close();
+            }
+
+            // Assign through the indexer so that a repeated call for the same field replaces
+            // the earlier set; Add is presumed to throw on a duplicate key (HashMap is not visible here).
+            stopWordsPerField[fieldName] = stopWords;
+
+            /* if the stopwords for a field are changed,
+             * then saved streams for that field are erased.
+             */
+            IDictionary<String, SavedStreams> streamMap = (IDictionary<String, SavedStreams>) GetPreviousTokenStream();
+            if (streamMap != null)
+            {
+                streamMap.Remove(fieldName);
+            }
+
+            return stopWords.Count;
+        }
+
+        /// <summary>
+        /// Produces the delegate's token stream for the field, wrapped in a <c>StopFilter</c>
+        /// when stop words have been collected for that field. The delegate's reusable stream
+        /// is tried first; if that throws an <see cref="IOException"/> the delegate's
+        /// non-reusable stream is used instead.
+        /// </summary>
+        /// <param name="fieldName">The field being analyzed.</param>
+        /// <param name="reader">The reader handed on to the delegate analyzer.</param>
+        /// <returns>The (possibly stop-filtered) token stream.</returns>
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream result;
+            try
+            {
+                result = _delegate.ReusableTokenStream(fieldName, reader);
+            }
+            catch (IOException)
+            {
+                result = _delegate.TokenStream(fieldName, reader);
+            }
+            return WrapWithStopFilter(fieldName, result);
+        }
+
+        /// <summary>Per-field pair of streams remembered between ReusableTokenStream calls.</summary>
+        private class SavedStreams
+        {
+            /* the underlying stream */
+            protected internal TokenStream Wrapped;
+
+            /*
+             * when there are no stopwords for the field, refers to wrapped.
+             * if there are stopwords, it is a StopFilter around wrapped.
+             */
+            protected internal TokenStream WithStopFilter;
+        };
+
+        /// <summary>
+        /// Returns a token stream for the field, reusing the previously built stop filter when
+        /// the delegate analyzer hands back the same underlying stream as last time.
+        /// </summary>
+        /// <param name="fieldName">The field being analyzed.</param>
+        /// <param name="reader">The reader handed on to the delegate analyzer.</param>
+        /// <returns>The (possibly stop-filtered) token stream.</returns>
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            if (overridesTokenStreamMethod)
+            {
+                // LUCENE-1678: force fallback to tokenStream() if we
+                // have been subclassed and that subclass overrides
+                // tokenStream but not reusableTokenStream
+                return TokenStream(fieldName, reader);
+            }
+
+            /* map of SavedStreams for each field */
+            IDictionary<String, SavedStreams> streamMap = (IDictionary<String, SavedStreams>) GetPreviousTokenStream();
+            if (streamMap == null)
+            {
+                streamMap = new HashMap<String, SavedStreams>();
+                SetPreviousTokenStream(streamMap);
+            }
+
+            // The null check below relies on the map's indexer yielding null for an unknown field.
+            SavedStreams streams = streamMap[fieldName];
+            if (streams == null)
+            {
+                /* an entry for this field does not exist, create one */
+                streams = new SavedStreams();
+                streamMap.Add(fieldName, streams);
+                streams.Wrapped = _delegate.ReusableTokenStream(fieldName, reader);
+                streams.WithStopFilter = WrapWithStopFilter(fieldName, streams.Wrapped);
+            }
+            else
+            {
+                /*
+                 * an entry for this field exists, verify the wrapped stream has not
+                 * changed. if it has not, reuse it, otherwise wrap the new stream.
+                 */
+                TokenStream result = _delegate.ReusableTokenStream(fieldName, reader);
+                if (result == streams.Wrapped)
+                {
+                    /* the wrapped analyzer reused the stream */
+                    streams.WithStopFilter.Reset();
+                }
+                else
+                {
+                    /* the wrapped analyzer did not; wrap the new stream */
+                    streams.Wrapped = result;
+                    streams.WithStopFilter = WrapWithStopFilter(fieldName, result);
+                }
+            }
+
+            return streams.WithStopFilter;
+        }
+
+        /// <summary>
+        /// Returns <paramref name="stream"/> wrapped in a StopFilter when stop words are known
+        /// for the field, or <paramref name="stream"/> itself when there are none.
+        /// </summary>
+        private TokenStream WrapWithStopFilter(String fieldName, TokenStream stream)
+        {
+            HashSet<String> stopWords = stopWordsPerField[fieldName];
+            if (stopWords == null)
+            {
+                return stream;
+            }
+            return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                  stream, stopWords);
+        }
+
+        /// <summary>
+        /// Provides information on which stop words have been identified for a field.
+        /// </summary>
+        /// <param name="fieldName">
+        /// The field for which stop words identified in AddStopWords calls will be returned.
+        /// </param>
+        /// <returns>
+        /// The stop words identified for the field; an empty array when none are recorded.
+        /// </returns>
+        public String[] GetStopWords(String fieldName)
+        {
+            HashSet<String> stopWords = stopWordsPerField[fieldName];
+            if (stopWords == null)
+            {
+                return new String[0];
+            }
+            return stopWords.ToArray();
+        }
+
+        /// <summary>
+        /// Provides information on which stop words have been identified for all fields.
+        /// </summary>
+        /// <returns>The stop words (as terms).</returns>
+        public Term[] GetStopWords()
+        {
+            List<Term> allStopWords = new List<Term>();
+            foreach (var fieldName in stopWordsPerField.Keys)
+            {
+                HashSet<String> stopWords = stopWordsPerField[fieldName];
+                foreach (var text in stopWords)
+                {
+                    allStopWords.Add(new Term(fieldName, text));
+                }
+            }
+            return allStopWords.ToArray();
+        }
+    }
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/ReverseStringFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/ReverseStringFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/ReverseStringFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/ReverseStringFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,128 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analysis.Reverse
+{
+    /// <summary>
+    /// Reverses the text of each token, for example "country" becomes "yrtnuoc".
+    /// <para>
+    /// If a marker is supplied, then tokens will also be prepended by that character.
+    /// For example, with a marker of \u0001, "country" becomes "\u0001yrtnuoc".
+    /// This is useful when implementing efficient leading wildcards search.
+    /// </para>
+    /// </summary>
+    public sealed class ReverseStringFilter : TokenFilter
+    {
+        private readonly TermAttribute termAtt;
+        private readonly char marker;
+
+        /// <summary>Sentinel value meaning "do not add a marker".</summary>
+        private const char NOMARKER = '\uFFFF';
+
+        /// <summary>Example marker character: U+0001 (START OF HEADING)</summary>
+        public const char START_OF_HEADING_MARKER = '\u0001';
+
+        /// <summary>Example marker character: U+001F (INFORMATION SEPARATOR ONE)</summary>
+        public const char INFORMATION_SEPARATOR_MARKER = '\u001F';
+
+        /// <summary>Example marker character: U+EC00 (PRIVATE USE AREA: EC00)</summary>
+        public const char PUA_EC00_MARKER = '\uEC00';
+
+        /// <summary>Example marker character: U+200F (RIGHT-TO-LEFT MARK)</summary>
+        public const char RTL_DIRECTION_MARKER = '\u200F';
+
+        /// <summary>
+        /// Creates a new ReverseStringFilter that reverses all tokens in the supplied
+        /// <see cref="TokenStream"/>. The reversed tokens will not be marked.
+        /// </summary>
+        /// <param name="_in"><see cref="TokenStream"/> to filter.</param>
+        public ReverseStringFilter(TokenStream _in)
+            : this(_in, NOMARKER)
+        {
+        }
+
+        /// <summary>
+        /// Creates a new ReverseStringFilter that reverses and marks all tokens in the supplied
+        /// <see cref="TokenStream"/>. The reversed tokens will be prepended (marked) by the
+        /// <paramref name="marker"/> character.
+        /// </summary>
+        /// <param name="_in"><see cref="TokenStream"/> to filter.</param>
+        /// <param name="marker">A character used to mark reversed tokens.</param>
+        public ReverseStringFilter(TokenStream _in, char marker)
+            : base(_in)
+        {
+            this.marker = marker;
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream and reverses the term text in place. When a marker is
+        /// configured it is appended before reversing, so it ends up as the first character.
+        /// </summary>
+        /// <returns>true if a token was produced; false once the wrapped stream is exhausted.</returns>
+        public override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+            {
+                return false;
+            }
+
+            int length = termAtt.TermLength();
+            if (marker != NOMARKER)
+            {
+                // Grow by one and put the marker last; the reversal moves it to the front.
+                length++;
+                termAtt.ResizeTermBuffer(length);
+                termAtt.TermBuffer()[length - 1] = marker;
+            }
+            Reverse(termAtt.TermBuffer(), length);
+            termAtt.SetTermLength(length);
+            return true;
+        }
+
+        /// <summary>Returns a new string containing the characters of <paramref name="input"/> in reverse order.</summary>
+        /// <param name="input">The string to reverse.</param>
+        /// <returns>The reversed string.</returns>
+        public static String Reverse(String input)
+        {
+            char[] chars = input.ToCharArray();
+            Reverse(chars);
+            return new String(chars);
+        }
+
+        /// <summary>Reverses the whole of <paramref name="buffer"/> in place.</summary>
+        /// <param name="buffer">The characters to reverse.</param>
+        public static void Reverse(char[] buffer)
+        {
+            Reverse(buffer, buffer.Length);
+        }
+
+        /// <summary>Reverses the first <paramref name="len"/> characters of <paramref name="buffer"/> in place.</summary>
+        /// <param name="buffer">The characters to reverse.</param>
+        /// <param name="len">Number of characters, counted from index 0, to reverse.</param>
+        public static void Reverse(char[] buffer, int len)
+        {
+            Reverse(buffer, 0, len);
+        }
+
+        /// <summary>
+        /// Reverses <paramref name="len"/> characters of <paramref name="buffer"/> in place,
+        /// beginning at index <paramref name="start"/>. Does nothing when
+        /// <paramref name="len"/> is 1 or less.
+        /// </summary>
+        /// <param name="buffer">The characters to reverse.</param>
+        /// <param name="start">Index of the first character of the range.</param>
+        /// <param name="len">Number of characters in the range.</param>
+        public static void Reverse(char[] buffer, int start, int len)
+        {
+            if (len <= 1)
+            {
+                return;
+            }
+
+            // Walk inwards from both ends of the range, swapping as we go.
+            int left = start;
+            int right = start + len - 1;
+            while (left < right)
+            {
+                char swapped = buffer[left];
+                buffer[left] = buffer[right];
+                buffer[right] = swapped;
+                left++;
+                right--;
+            }
+        }
+    }
+}
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
@@ -20,253 +20,153 @@
*/
using System;
+using System.Collections.Generic;
+using System.Linq;
using System.Text;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis;
+using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.Ru
{
- /// <summary>
- /// Analyzer for Russian language. Supports an external list of stopwords (words that
- /// will not be indexed at all).
- /// A default set of stopwords is used unless an alternative list is specified.
- /// </summary>
- public sealed class RussianAnalyzer : Analyzer
- {
- // letters
- private static char A = (char)0;
- private static char B = (char)1;
- private static char V = (char)2;
- private static char G = (char)3;
- private static char D = (char)4;
- private static char E = (char)5;
- private static char ZH = (char)6;
- private static char Z = (char)7;
- private static char I = (char)8;
- private static char I_ = (char)9;
- private static char K = (char)10;
- private static char L = (char)11;
- private static char M = (char)12;
- private static char N = (char)13;
- private static char O = (char)14;
- private static char P = (char)15;
- private static char R = (char)16;
- private static char S = (char)17;
- private static char T = (char)18;
- private static char U = (char)19;
- //private static char F = (char)20;
- private static char X = (char)21;
- //private static char TS = (char)22;
- private static char CH = (char)23;
- private static char SH = (char)24;
- private static char SHCH = (char)25;
- //private static char HARD = (char)26;
- private static char Y = (char)27;
- private static char SOFT = (char)28;
- private static char AE = (char)29;
- private static char IU = (char)30;
- private static char IA = (char)31;
-
- /// <summary>
- /// List of typical Russian stopwords.
- /// </summary>
- private static char[][] RUSSIAN_STOP_WORDS = {
- new char[] {A},
- new char[] {B, E, Z},
- new char[] {B, O, L, E, E},
- new char[] {B, Y},
- new char[] {B, Y, L},
- new char[] {B, Y, L, A},
- new char[] {B, Y, L, I},
- new char[] {B, Y, L, O},
- new char[] {B, Y, T, SOFT},
- new char[] {V},
- new char[] {V, A, M},
- new char[] {V, A, S},
- new char[] {V, E, S, SOFT},
- new char[] {V, O},
- new char[] {V, O, T},
- new char[] {V, S, E},
- new char[] {V, S, E, G, O},
- new char[] {V, S, E, X},
- new char[] {V, Y},
- new char[] {G, D, E},
- new char[] {D, A},
- new char[] {D, A, ZH, E},
- new char[] {D, L, IA},
- new char[] {D, O},
- new char[] {E, G, O},
- new char[] {E, E},
- new char[] {E, I_,},
- new char[] {E, IU},
- new char[] {E, S, L, I},
- new char[] {E, S, T, SOFT},
- new char[] {E, SHCH, E},
- new char[] {ZH, E},
- new char[] {Z, A},
- new char[] {Z, D, E, S, SOFT},
- new char[] {I},
- new char[] {I, Z},
- new char[] {I, L, I},
- new char[] {I, M},
- new char[] {I, X},
- new char[] {K},
- new char[] {K, A, K},
- new char[] {K, O},
- new char[] {K, O, G, D, A},
- new char[] {K, T, O},
- new char[] {L, I},
- new char[] {L, I, B, O},
- new char[] {M, N, E},
- new char[] {M, O, ZH, E, T},
- new char[] {M, Y},
- new char[] {N, A},
- new char[] {N, A, D, O},
- new char[] {N, A, SH},
- new char[] {N, E},
- new char[] {N, E, G, O},
- new char[] {N, E, E},
- new char[] {N, E, T},
- new char[] {N, I},
- new char[] {N, I, X},
- new char[] {N, O},
- new char[] {N, U},
- new char[] {O},
- new char[] {O, B},
- new char[] {O, D, N, A, K, O},
- new char[] {O, N},
- new char[] {O, N, A},
- new char[] {O, N, I},
- new char[] {O, N, O},
- new char[] {O, T},
- new char[] {O, CH, E, N, SOFT},
- new char[] {P, O},
- new char[] {P, O, D},
- new char[] {P, R, I},
- new char[] {S},
- new char[] {S, O},
- new char[] {T, A, K},
- new char[] {T, A, K, ZH, E},
- new char[] {T, A, K, O, I_},
- new char[] {T, A, M},
- new char[] {T, E},
- new char[] {T, E, M},
- new char[] {T, O},
- new char[] {T, O, G, O},
- new char[] {T, O, ZH, E},
- new char[] {T, O, I_},
- new char[] {T, O, L, SOFT, K, O},
- new char[] {T, O, M},
- new char[] {T, Y},
- new char[] {U},
- new char[] {U, ZH, E},
- new char[] {X, O, T, IA},
- new char[] {CH, E, G, O},
- new char[] {CH, E, I_},
- new char[] {CH, E, M},
- new char[] {CH, T, O},
- new char[] {CH, T, O, B, Y},
- new char[] {CH, SOFT, E},
- new char[] {CH, SOFT, IA},
- new char[] {AE, T, A},
- new char[] {AE, T, I},
- new char[] {AE, T, O},
- new char[] {IA}
- };
-
- /// <summary>
- /// Contains the stopwords used with the StopFilter.
- /// </summary>
- private Hashtable stoptable = new Hashtable();
-
- /// <summary>
- /// Charset for Russian letters.
- /// Represents encoding for 32 lowercase Russian letters.
- /// Predefined charsets can be taken from RussianCharSets class
- /// </summary>
- private char[] charset;
-
- /// <summary>
- /// Builds an analyzer.
- /// </summary>
- public RussianAnalyzer()
- {
- this.charset = RussianCharsets.UnicodeRussian;
- stoptable = StopFilter.MakeStopSet(MakeStopWords(RussianCharsets.UnicodeRussian));
- }
-
- /// <summary>
- /// Builds an analyzer.
- /// </summary>
- /// <param name="charset"></param>
- public RussianAnalyzer(char[] charset)
- {
- this.charset = charset;
- stoptable = StopFilter.MakeStopSet(MakeStopWords(charset));
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="charset"></param>
- /// <param name="stopwords"></param>
- public RussianAnalyzer(char[] charset, String[] stopwords)
- {
- this.charset = charset;
- stoptable = StopFilter.MakeStopSet(stopwords);
- }
-
- /// <summary>
- /// Takes russian stop words and translates them to a String array, using
- /// the given charset
- /// </summary>
- /// <param name="charset"></param>
- /// <returns></returns>
- private static String[] MakeStopWords(char[] charset)
- {
- String[] res = new String[RUSSIAN_STOP_WORDS.Length];
- for (int i = 0; i < res.Length; i++)
- {
- char[] theStopWord = RUSSIAN_STOP_WORDS[i];
- // translate the word,using the charset
- StringBuilder theWord = new StringBuilder();
- for (int j = 0; j < theStopWord.Length; j++)
- {
- theWord.Append(charset[theStopWord[j]]);
- }
- res[i] = theWord.ToString();
- }
- return res;
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="charset"></param>
- /// <param name="stopwords"></param>
- public RussianAnalyzer(char[] charset, Hashtable stopwords)
- {
- this.charset = charset;
- stoptable = stopwords;
- }
-
- /// <summary>
- /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
- /// </summary>
- /// <param name="fieldName"></param>
- /// <param name="reader"></param>
- /// <returns>
- /// A TokenStream build from a RussianLetterTokenizer filtered with
- /// RussianLowerCaseFilter, StopFilter, and RussianStemFilter
- /// </returns>
- public override TokenStream TokenStream(String fieldName, TextReader reader)
- {
- TokenStream result = new RussianLetterTokenizer(reader, charset);
- result = new RussianLowerCaseFilter(result, charset);
- result = new StopFilter(result, stoptable);
- result = new RussianStemFilter(result, charset);
- return result;
- }
- }
+    /// <summary>
+    /// Analyzer for Russian language. Supports an external list of stopwords (words that
+    /// will not be indexed at all).
+    /// A default set of stopwords is used unless an alternative list is specified.
+    /// </summary>
+    public sealed class RussianAnalyzer : Analyzer
+    {
+        /// <summary>
+        /// List of typical Russian stopwords.
+        /// </summary>
+        private static readonly String[] RUSSIAN_STOP_WORDS = {
+            "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
+            "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
+            "да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
+            "еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
+            "ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо",
+            "наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об",
+            "однако", "он", "она", "они", "оно", "от", "очень", "по", "под", "при",
+            "с", "со", "так", "также", "такой", "там", "те", "тем", "то", "того",
+            "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей",
+            "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
+        };
+
+        /// <summary>Holds the default stop set, built from <see cref="RUSSIAN_STOP_WORDS"/>.</summary>
+        private static class DefaultSetHolder
+        {
+            internal static readonly ISet<string> DEFAULT_STOP_SET = CharArraySet.UnmodifiableSet(new CharArraySet(RUSSIAN_STOP_WORDS, false));
+        }
+
+        /// <summary>
+        /// Contains the stopwords used with the StopFilter.
+        /// </summary>
+        private readonly ISet<string> stopSet;
+
+        private readonly Version matchVersion;
+
+        /// <summary>
+        /// Builds an analyzer that uses the default Russian stop words.
+        /// </summary>
+        /// <param name="matchVersion">Lucene compatibility version.</param>
+        public RussianAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words.
+        /// Deprecated: use the constructor taking a set of stop words instead.
+        /// </summary>
+        /// <param name="matchVersion">Lucene compatibility version.</param>
+        /// <param name="stopwords">The stop words.</param>
+        public RussianAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words. The set is copied and the copy is
+        /// wrapped as unmodifiable.
+        /// </summary>
+        /// <param name="matchVersion">Lucene compatibility version.</param>
+        /// <param name="stopwords">A stopword set.</param>
+        public RussianAnalyzer(Version matchVersion, ISet<string> stopwords)
+        {
+            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            this.matchVersion = matchVersion;
+        }
+
+        /// <summary>
+        /// Builds an analyzer using the keys of the given dictionary as stop words.
+        /// Deprecated: use the constructor taking a set of stop words instead.
+        /// </summary>
+        /// <param name="matchVersion">Lucene compatibility version.</param>
+        /// <param name="stopwords">Dictionary whose keys are the stop words.</param>
+        public RussianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+            : this(matchVersion, stopwords.Keys.ToArray())
+        {
+        }
+
+        /// <summary>
+        /// Creates a <see cref="TokenStream"/> which tokenizes all the text in the provided
+        /// <see cref="TextReader"/>.
+        /// </summary>
+        /// <returns>
+        /// A <see cref="TokenStream"/> built from a <see cref="RussianLetterTokenizer"/> filtered
+        /// with LowerCaseFilter, StopFilter and RussianStemFilter, in that order.
+        /// </returns>
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream result = new RussianLetterTokenizer(reader);
+            result = new LowerCaseFilter(result);
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                    result, stopSet);
+            result = new RussianStemFilter(result);
+            return result;
+        }
+
+        /// <summary>Tokenizer and fully wrapped stream kept for reuse between calls.</summary>
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /// <summary>
+        /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text
+        /// in the provided <see cref="TextReader"/>. On the first call the filter chain is built
+        /// and saved; later calls only reset the saved tokenizer onto the new reader.
+        /// </summary>
+        /// <returns>
+        /// A <see cref="TokenStream"/> built from a <see cref="RussianLetterTokenizer"/> filtered
+        /// with LowerCaseFilter, StopFilter and RussianStemFilter, in that order.
+        /// </returns>
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new RussianLetterTokenizer(reader);
+                streams.result = new LowerCaseFilter(streams.source);
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.result, stopSet);
+                streams.result = new RussianStemFilter(streams.result);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
+    }
}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs Mon Nov 21 04:44:55 2011
@@ -22,42 +22,41 @@
using System;
using System.IO;
using Lucene.Net.Analysis;
+using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Ru
{
- /// <summary>
- /// A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
- /// in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method,
- /// which doesn't know how to detect letters in encodings like CP1252 and KOI8
- /// (well-known problems with 0xD7 and 0xF7 chars)
- /// </summary>
- public class RussianLetterTokenizer : CharTokenizer
- {
- /// <summary>
- /// Construct a new LetterTokenizer.
- /// </summary>
- private char[] charset;
+ /// <summary>
+ /// A RussianLetterTokenizer is a <see cref="Tokenizer"/> that behaves like <see cref="LetterTokenizer"/>
+ /// but additionally treats the basic Latin digits 0-9 as token characters.
+ /// </summary>
+ public class RussianLetterTokenizer : CharTokenizer
+ {
+ /// <summary>
+ /// Creates a tokenizer by forwarding the reader to the <see cref="CharTokenizer"/> base constructor.
+ /// </summary>
+ /// <param name="_in">Reader passed through to the base class.</param>
+ public RussianLetterTokenizer(TextReader _in) : base(_in)
+ {
+     // No state of its own; everything is handled by the base class.
+ }
- public RussianLetterTokenizer(TextReader _in, char[] charset) : base(_in)
- {
- this.charset = charset;
- }
+ /// <summary>
+ /// Creates a tokenizer by forwarding the attribute source and the reader to the
+ /// <see cref="CharTokenizer"/> base constructor.
+ /// </summary>
+ /// <param name="source">Attribute source passed through to the base class.</param>
+ /// <param name="_in">Reader passed through to the base class.</param>
+ public RussianLetterTokenizer(AttributeSource source, TextReader _in) : base(source, _in)
+ {
+     // No state of its own; everything is handled by the base class.
+ }
- /// <summary>
- /// Collects only characters which satisfy Char.IsLetter(char).
- /// </summary>
- /// <param name="c"></param>
- /// <returns></returns>
- protected override bool IsTokenChar(char c)
- {
- if (Char.IsLetter(c))
- return true;
- for (int i = 0; i < charset.Length; i++)
- {
- if (c == charset[i])
- return true;
- }
- return false;
- }
- }
+ /// <summary>
+ /// Creates a tokenizer by forwarding the attribute factory and the reader to the
+ /// <see cref="CharTokenizer"/> base constructor.
+ /// </summary>
+ /// <param name="factory">Attribute factory passed through to the base class.</param>
+ /// <param name="__in">Reader passed through to the base class.</param>
+ public RussianLetterTokenizer(AttributeSource.AttributeFactory factory, TextReader __in) : base(factory, __in)
+ {
+     // No state of its own; everything is handled by the base class.
+ }
+
+ /// <summary>
+ /// Decides whether a character belongs to a token: accepts every character for which
+ /// <see cref="char.IsLetter(char)"/> is true, plus the digits '0' through '9'.
+ /// </summary>
+ /// <param name="c">The character to classify.</param>
+ /// <returns><c>true</c> if <paramref name="c"/> is a letter or one of '0'-'9'; otherwise <c>false</c>.</returns>
+ protected override bool IsTokenChar(char c)
+ {
+     bool isBasicDigit = c >= '0' && c <= '9';
+     return char.IsLetter(c) || isBasicDigit;
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs Mon Nov 21 04:44:55 2011
@@ -21,41 +21,40 @@
using System;
using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
namespace Lucene.Net.Analysis.Ru
{
- /// <summary>
- /// Normalizes token text to lower case, analyzing given ("russian") charset.
- /// </summary>
- public sealed class RussianLowerCaseFilter : TokenFilter
- {
- char[] charset;
-
- public RussianLowerCaseFilter(TokenStream _in, char[] charset) : base(_in)
- {
- this.charset = charset;
- }
-
- public override Token Next()
- {
- Token t = input.Next();
-
- if (t == null)
- return null;
-
- String txt = t.TermText();
-
- char[] chArray = txt.ToCharArray();
- for (int i = 0; i < chArray.Length; i++)
- {
- chArray[i] = RussianCharsets.ToLowerCase(chArray[i], charset);
- }
-
- String newTxt = new String(chArray);
- // create new token
- Token newToken = new Token(newTxt, t.StartOffset(), t.EndOffset());
-
- return newToken;
- }
- }
+ /// <summary>
+ /// Normalizes token text to lower case.
+ /// </summary>
+ /// <remarks>
+ /// Characters are lower-cased with <see cref="char.ToLowerInvariant(char)"/> so that the
+ /// result does not depend on the current thread culture (for example, under a Turkish
+ /// culture the culture-sensitive <c>char.ToLower</c> maps 'I' to a dotless i instead of 'i').
+ /// </remarks>
+ [Obsolete("Use LowerCaseFilter instead, which has the same functionality. This filter will be removed in Lucene 4.0")]
+ public sealed class RussianLowerCaseFilter : TokenFilter
+ {
+     // Term attribute shared with the wrapped stream; its buffer is modified in place.
+     private TermAttribute termAtt;
+
+     /// <summary>
+     /// Wraps the given stream and registers the term attribute this filter rewrites.
+     /// </summary>
+     /// <param name="_in">The token stream whose terms are lower-cased.</param>
+     public RussianLowerCaseFilter(TokenStream _in)
+         : base(_in)
+     {
+         termAtt = AddAttribute<TermAttribute>();
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream and lower-cases the current term in place.
+     /// </summary>
+     /// <returns><c>true</c> if the wrapped stream produced a token; <c>false</c> otherwise.</returns>
+     public sealed override bool IncrementToken()
+     {
+         if (!input.IncrementToken())
+         {
+             return false;
+         }
+
+         // Only the first TermLength() characters of the buffer belong to the current term.
+         char[] buffer = termAtt.TermBuffer();
+         int length = termAtt.TermLength();
+         for (int i = 0; i < length; i++)
+         {
+             // Invariant lower-casing keeps indexing and querying consistent across machine locales.
+             buffer[i] = char.ToLowerInvariant(buffer[i]);
+         }
+         return true;
+     }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs Mon Nov 21 04:44:55 2011
@@ -21,59 +21,65 @@
using System;
using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
namespace Lucene.Net.Analysis.Ru
{
- /// <summary>
- /// A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
- /// The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter,
- /// because RussianStemFilter only works with lowercase part of any "russian" charset.
- /// </summary>
- public sealed class RussianStemFilter : TokenFilter
- {
- /// <summary>
- /// The actual token in the input stream.
- /// </summary>
- private Token token = null;
- private RussianStemmer stemmer = null;
-
- public RussianStemFilter(TokenStream _in, char[] charset) : base(_in)
- {
- stemmer = new RussianStemmer(charset);
- }
-
- /// <summary>
- ///
- /// </summary>
- /// <returns>Returns the next token in the stream, or null at EOS</returns>
- public override Token Next()
- {
- if ((token = input.Next()) == null)
- {
- return null;
- }
- else
- {
- String s = stemmer.Stem(token.TermText());
- if (!s.Equals(token.TermText()))
- {
- return new Token(s, token.StartOffset(), token.EndOffset(),
- token.Type());
- }
- return token;
- }
- }
-
- /// <summary>
- /// Set a alternative/custom RussianStemmer for this filter.
- /// </summary>
- /// <param name="stemmer"></param>
- public void SetStemmer(RussianStemmer stemmer)
- {
- if (stemmer != null)
- {
- this.stemmer = stemmer;
- }
- }
- }
+ /// <summary>
+ /// A <see cref="TokenFilter"/> that stems Russian words.
+ /// <para>
+ /// The implementation was inspired by GermanStemFilter.
+ /// The input should be filtered by <see cref="LowerCaseFilter"/> before passing it to RussianStemFilter,
+ /// because RussianStemFilter only works with lowercase characters.
+ /// </para>
+ /// </summary>
+ /// <remarks>
+ /// The Java original also has a SetStemmer(RussianStemmer) method for plugging in an
+ /// alternative stemmer. It was left out of this port because it did not compile as-is.
+ /// </remarks>
+ public sealed class RussianStemFilter : TokenFilter
+ {
+     // Stemmer applied to every term that passes through the filter.
+     private RussianStemmer stemmer = null;
+
+     // Term attribute of the stream; read for the current term and overwritten with the stem.
+     private TermAttribute termAtt;
+
+     /// <summary>
+     /// Wraps the given stream, creating a default <see cref="RussianStemmer"/> and
+     /// registering the term attribute this filter rewrites.
+     /// </summary>
+     /// <param name="_in">The token stream whose terms are stemmed.</param>
+     public RussianStemFilter(TokenStream _in)
+         : base(_in)
+     {
+         stemmer = new RussianStemmer();
+         termAtt = AddAttribute<TermAttribute>();
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream and, when the stemmer returns a non-null value that
+     /// differs from the current term, replaces the term text with that stem.
+     /// </summary>
+     /// <returns><c>true</c> if the wrapped stream produced a token; <c>false</c> otherwise.</returns>
+     public sealed override bool IncrementToken()
+     {
+         if (!input.IncrementToken())
+         {
+             return false;
+         }
+
+         String original = termAtt.Term();
+         String stemmed = stemmer.Stem(original);
+         bool changed = stemmed != null && !stemmed.Equals(original);
+         if (changed)
+         {
+             termAtt.SetTermBuffer(stemmed);
+         }
+         return true;
+     }
+ }
}
\ No newline at end of file