Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/08 00:12:37 UTC
[33/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Br/BrazilianStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Br/BrazilianStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Br/BrazilianStemmer.cs
new file mode 100644
index 0000000..d7c385d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Br/BrazilianStemmer.cs
@@ -0,0 +1,1395 @@
+using System.Globalization;
+
+namespace org.apache.lucene.analysis.br
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A stemmer for Brazilian Portuguese words.
+ /// </summary>
+ public class BrazilianStemmer
+ {
+ private static readonly CultureInfo locale = new CultureInfo("pt-BR");
+
+ /// <summary>
+ /// Changed term
+ /// </summary>
+ private string TERM;
+ private string CT;
+ private string R1;
+ private string R2;
+ private string RV;
+
+
+ public BrazilianStemmer()
+ {
+ }
+
+ /// <summary>
+ /// Stems the given term to a unique <tt>discriminator</tt>.
+ /// </summary>
+ /// <param name="term"> The term that should be stemmed. </param>
+ /// <returns> Discriminator for <tt>term</tt> </returns>
+ protected internal virtual string stem(string term)
+ {
+ bool altered = false; // altered the term
+
+ // creates CT
+ createCT(term);
+
+ if (!isIndexable(CT))
+ {
+ return null;
+ }
+ if (!isStemmable(CT))
+ {
+ return CT;
+ }
+
+ R1 = getR1(CT);
+ R2 = getR1(R1);
+ RV = getRV(CT);
+ TERM = term + ";" + CT;
+
+ altered = step1();
+ if (!altered)
+ {
+ altered = step2();
+ }
+
+ if (altered)
+ {
+ step3();
+ }
+ else
+ {
+ step4();
+ }
+
+ step5();
+
+ return CT;
+ }
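+
+ // Illustrative sketch (not in the original Java source): stem() is
+ // protected internal, so a same-assembly caller or subclass would use it
+ // one term at a time, e.g.
+ //
+ //   var stemmer = new BrazilianStemmer();
+ //   string s = stemmer.stem("felicidades"); // yields "felic"; returns null if the term is not indexable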
+
+ /// <summary>
+ /// Checks whether a term can be stemmed correctly.
+ /// </summary>
+ /// <returns> true if, and only if, the given term consists only of letters. </returns>
+ private bool isStemmable(string term)
+ {
+ for (int c = 0; c < term.Length; c++)
+ {
+ // Discard terms that contain non-letter characters.
+ if (!char.IsLetter(term[c]))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /// <summary>
+ /// Checks whether a term can be indexed.
+ /// </summary>
+ /// <returns> true if it can be indexed </returns>
+ private bool isIndexable(string term)
+ {
+ return (term.Length < 30) && (term.Length > 2);
+ }
+
+ /// <summary>
+ /// See if a character is one of 'a','e','i','o','u'
+ /// </summary>
+ /// <returns> true if is vowel </returns>
+ private bool isVowel(char value)
+ {
+ return (value == 'a') || (value == 'e') || (value == 'i') || (value == 'o') || (value == 'u');
+ }
+
+ /// <summary>
+ /// Gets R1
+ ///
+ /// R1 - is the region after the first non-vowel following a vowel,
+ /// or is the null region at the end of the word if there is
+ /// no such non-vowel.
+ /// </summary>
+ /// <returns> null or a string representing R1 </returns>
+ private string getR1(string value)
+ {
+ int i;
+ int j;
+
+ // be-safe !!!
+ if (value == null)
+ {
+ return null;
+ }
+
+ // find 1st vowel
+ i = value.Length - 1;
+ for (j = 0 ; j < i ; j++)
+ {
+ if (isVowel(value[j]))
+ {
+ break;
+ }
+ }
+
+ if (!(j < i))
+ {
+ return null;
+ }
+
+ // find 1st non-vowel
+ for (; j < i ; j++)
+ {
+ if (!(isVowel(value[j])))
+ {
+ break;
+ }
+ }
+
+ if (!(j < i))
+ {
+ return null;
+ }
+
+ return value.Substring(j + 1);
+ }
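+
+ // Worked example of the definition above: for "beleza" the first vowel is
+ // 'e' (index 1) and the first non-vowel after it is 'l' (index 2), so
+ // getR1("beleza") == "eza"; applying getR1 again gives R2 == "a".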
+
+ /// <summary>
+ /// Gets RV
+ ///
+ /// RV - IF the second letter is a consonant, RV is the region after
+ /// the next following vowel,
+ ///
+ /// OR if the first two letters are vowels, RV is the region
+ /// after the next consonant,
+ ///
+ /// AND otherwise (consonant-vowel case) RV is the region after
+ /// the third letter.
+ ///
+ /// BUT RV is the end of the word if these positions cannot be
+ /// found.
+ /// </summary>
+ /// <returns> null or a string representing RV </returns>
+ private string getRV(string value)
+ {
+ int i;
+ int j;
+
+ // be-safe !!!
+ if (value == null)
+ {
+ return null;
+ }
+
+ i = value.Length - 1;
+
+ // RV - IF the second letter is a consonant, RV is the region after
+ // the next following vowel,
+ if ((i > 0) && !isVowel(value[1]))
+ {
+ // find 1st vowel
+ for (j = 2 ; j < i ; j++)
+ {
+ if (isVowel(value[j]))
+ {
+ break;
+ }
+ }
+
+ if (j < i)
+ {
+ return value.Substring(j + 1);
+ }
+ }
+
+
+ // RV - OR if the first two letters are vowels, RV is the region
+ // after the next consonant,
+ if ((i > 1) && isVowel(value[0]) && isVowel(value[1]))
+ {
+ // find 1st consonant
+ for (j = 2 ; j < i ; j++)
+ {
+ if (!isVowel(value[j]))
+ {
+ break;
+ }
+ }
+
+ if (j < i)
+ {
+ return value.Substring(j + 1);
+ }
+ }
+
+ // RV - AND otherwise (consonant-vowel case) RV is the region after
+ // the third letter.
+ if (i > 2)
+ {
+ return value.Substring(3);
+ }
+
+ return null;
+ }
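+
+ // Worked examples of the three cases above: getRV("prazer") == "zer"
+ // (second letter is a consonant, region after the next vowel at index 2),
+ // getRV("aonde") == "de" (two leading vowels, region after the next
+ // consonant), and getRV("beleza") == "eza" (consonant-vowel case, region
+ // after the third letter).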
+
+ /// <summary>
+ /// 1) Turn to lowercase
+ /// 2) Remove accents
+ /// 3) ã -> a ; õ -> o
+ /// 4) ç -> c
+ /// </summary>
+ /// <returns> null or a string transformed </returns>
+ private string changeTerm(string value)
+ {
+ int j;
+ string r = "";
+
+ // be-safe !!!
+ if (value == null)
+ {
+ return null;
+ }
+
+ value = value.ToLower(locale);
+ for (j = 0 ; j < value.Length ; j++)
+ {
+ if ((value[j] == 'á') || (value[j] == 'â') || (value[j] == 'ã'))
+ {
+ r = r + "a";
+ continue;
+ }
+ if ((value[j] == 'é') || (value[j] == 'ê'))
+ {
+ r = r + "e";
+ continue;
+ }
+ if (value[j] == 'í')
+ {
+ r = r + "i";
+ continue;
+ }
+ if ((value[j] == 'ó') || (value[j] == 'ô') || (value[j] == 'õ'))
+ {
+ r = r + "o";
+ continue;
+ }
+ if ((value[j] == 'ú') || (value[j] == 'ü'))
+ {
+ r = r + "u";
+ continue;
+ }
+ if (value[j] == 'ç')
+ {
+ r = r + "c";
+ continue;
+ }
+ if (value[j] == 'ñ')
+ {
+ r = r + "n";
+ continue;
+ }
+
+ r = r + value[j];
+ }
+
+ return r;
+ }
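+
+ // Illustrative example: changeTerm("Ação") lowercases to "ação" and folds
+ // the accented characters, yielding "acao".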
+
+ /// <summary>
+ /// Check if a string ends with a suffix
+ /// </summary>
+ /// <returns> true if the string ends with the specified suffix </returns>
+ private bool suffix(string value, string ending)
+ {
+
+ // be-safe !!!
+ if ((value == null) || (ending == null))
+ {
+ return false;
+ }
+
+ if (ending.Length > value.Length)
+ {
+ return false;
+ }
+
+ return value.Substring(value.Length - ending.Length).Equals(ending);
+ }
+
+ /// <summary>
+ /// Replace a string suffix by another
+ /// </summary>
+ /// <returns> the replaced String </returns>
+ private string replaceSuffix(string value, string toReplace, string changeTo)
+ {
+ string vvalue;
+
+ // be-safe !!!
+ if ((value == null) || (toReplace == null) || (changeTo == null))
+ {
+ return value;
+ }
+
+ vvalue = removeSuffix(value,toReplace);
+
+ if (value.Equals(vvalue))
+ {
+ return value;
+ }
+ else
+ {
+ return vvalue + changeTo;
+ }
+ }
+
+ /// <summary>
+ /// Remove a string suffix
+ /// </summary>
+ /// <returns> the String without the suffix </returns>
+ private string removeSuffix(string value, string toRemove)
+ {
+ // be-safe !!!
+ if ((value == null) || (toRemove == null) || !suffix(value,toRemove))
+ {
+ return value;
+ }
+
+ return value.Substring(0,value.Length - toRemove.Length);
+ }
+
+ /// <summary>
+ /// See if a suffix is preceded by a String
+ /// </summary>
+ /// <returns> true if the suffix is preceded </returns>
+ private bool suffixPreceded(string value, string ending, string preceded)
+ {
+ // be-safe !!!
+ if ((value == null) || (ending == null) || (preceded == null) || !suffix(value,ending))
+ {
+ return false;
+ }
+
+ return suffix(removeSuffix(value,ending),preceded);
+ }
+
+ /// <summary>
+ /// Creates CT (changed term), substituting 'ã' and 'õ' for 'a~' and 'o~'.
+ /// </summary>
+ private void createCT(string term)
+ {
+ CT = changeTerm(term);
+
+ if (CT.Length < 2)
+ {
+ return;
+ }
+
+ // if the first character is ... , remove it
+ if ((CT[0] == '"') || (CT[0] == '\'') || (CT[0] == '-') || (CT[0] == ',') || (CT[0] == ';') || (CT[0] == '.') || (CT[0] == '?') || (CT[0] == '!'))
+ {
+ CT = CT.Substring(1);
+ }
+
+ if (CT.Length < 2)
+ {
+ return;
+ }
+
+ // if the last character is ... , remove it
+ if ((CT[CT.Length - 1] == '-') || (CT[CT.Length - 1] == ',') || (CT[CT.Length - 1] == ';') || (CT[CT.Length - 1] == '.') || (CT[CT.Length - 1] == '?') || (CT[CT.Length - 1] == '!') || (CT[CT.Length - 1] == '\'') || (CT[CT.Length - 1] == '"'))
+ {
+ CT = CT.Substring(0,CT.Length - 1);
+ }
+ }
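+
+ // Illustrative example: createCT("'carro'") strips one leading and one
+ // trailing punctuation character, leaving CT == "carro".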
+
+
+ /// <summary>
+ /// Standard suffix removal.
+ /// Search for the longest among the following suffixes, and perform
+ /// the following actions:
+ /// </summary>
+ /// <returns> false if no ending was removed </returns>
+ private bool step1()
+ {
+ if (CT == null)
+ {
+ return false;
+ }
+
+ // suffix length = 7
+ if (suffix(CT,"uciones") && suffix(R2,"uciones"))
+ {
+ CT = replaceSuffix(CT,"uciones","u");
+ return true;
+ }
+
+ // suffix length = 6
+ if (CT.Length >= 6)
+ {
+ if (suffix(CT,"imentos") && suffix(R2,"imentos"))
+ {
+ CT = removeSuffix(CT,"imentos");
+ return true;
+ }
+ if (suffix(CT,"amentos") && suffix(R2,"amentos"))
+ {
+ CT = removeSuffix(CT,"amentos");
+ return true;
+ }
+ if (suffix(CT,"adores") && suffix(R2,"adores"))
+ {
+ CT = removeSuffix(CT,"adores");
+ return true;
+ }
+ if (suffix(CT,"adoras") && suffix(R2,"adoras"))
+ {
+ CT = removeSuffix(CT,"adoras");
+ return true;
+ }
+ if (suffix(CT,"logias") && suffix(R2,"logias"))
+ {
+ replaceSuffix(CT,"logias","log");
+ return true;
+ }
+ if (suffix(CT,"encias") && suffix(R2,"encias"))
+ {
+ CT = replaceSuffix(CT,"encias","ente");
+ return true;
+ }
+ if (suffix(CT,"amente") && suffix(R1,"amente"))
+ {
+ CT = removeSuffix(CT,"amente");
+ return true;
+ }
+ if (suffix(CT,"idades") && suffix(R2,"idades"))
+ {
+ CT = removeSuffix(CT,"idades");
+ return true;
+ }
+ }
+
+ // suffix length = 5
+ if (CT.Length >= 5)
+ {
+ if (suffix(CT,"acoes") && suffix(R2,"acoes"))
+ {
+ CT = removeSuffix(CT,"acoes");
+ return true;
+ }
+ if (suffix(CT,"imento") && suffix(R2,"imento"))
+ {
+ CT = removeSuffix(CT,"imento");
+ return true;
+ }
+ if (suffix(CT,"amento") && suffix(R2,"amento"))
+ {
+ CT = removeSuffix(CT,"amento");
+ return true;
+ }
+ if (suffix(CT,"adora") && suffix(R2,"adora"))
+ {
+ CT = removeSuffix(CT,"adora");
+ return true;
+ }
+ if (suffix(CT,"ismos") && suffix(R2,"ismos"))
+ {
+ CT = removeSuffix(CT,"ismos");
+ return true;
+ }
+ if (suffix(CT,"istas") && suffix(R2,"istas"))
+ {
+ CT = removeSuffix(CT,"istas");
+ return true;
+ }
+ if (suffix(CT,"logia") && suffix(R2,"logia"))
+ {
+ CT = replaceSuffix(CT,"logia","log");
+ return true;
+ }
+ if (suffix(CT,"ucion") && suffix(R2,"ucion"))
+ {
+ CT = replaceSuffix(CT,"ucion","u");
+ return true;
+ }
+ if (suffix(CT,"encia") && suffix(R2,"encia"))
+ {
+ CT = replaceSuffix(CT,"encia","ente");
+ return true;
+ }
+ if (suffix(CT,"mente") && suffix(R2,"mente"))
+ {
+ CT = removeSuffix(CT,"mente");
+ return true;
+ }
+ if (suffix(CT,"idade") && suffix(R2,"idade"))
+ {
+ CT = removeSuffix(CT,"idade");
+ return true;
+ }
+ }
+
+ // suffix length = 4
+ if (CT.Length >= 4)
+ {
+ if (suffix(CT,"acao") && suffix(R2,"acao"))
+ {
+ CT = removeSuffix(CT,"acao");
+ return true;
+ }
+ if (suffix(CT,"ezas") && suffix(R2,"ezas"))
+ {
+ CT = removeSuffix(CT,"ezas");
+ return true;
+ }
+ if (suffix(CT,"icos") && suffix(R2,"icos"))
+ {
+ CT = removeSuffix(CT,"icos");
+ return true;
+ }
+ if (suffix(CT,"icas") && suffix(R2,"icas"))
+ {
+ CT = removeSuffix(CT,"icas");
+ return true;
+ }
+ if (suffix(CT,"ismo") && suffix(R2,"ismo"))
+ {
+ CT = removeSuffix(CT,"ismo");
+ return true;
+ }
+ if (suffix(CT,"avel") && suffix(R2,"avel"))
+ {
+ CT = removeSuffix(CT,"avel");
+ return true;
+ }
+ if (suffix(CT,"ivel") && suffix(R2,"ivel"))
+ {
+ CT = removeSuffix(CT,"ivel");
+ return true;
+ }
+ if (suffix(CT,"ista") && suffix(R2,"ista"))
+ {
+ CT = removeSuffix(CT,"ista");
+ return true;
+ }
+ if (suffix(CT,"osos") && suffix(R2,"osos"))
+ {
+ CT = removeSuffix(CT,"osos");
+ return true;
+ }
+ if (suffix(CT,"osas") && suffix(R2,"osas"))
+ {
+ CT = removeSuffix(CT,"osas");
+ return true;
+ }
+ if (suffix(CT,"ador") && suffix(R2,"ador"))
+ {
+ CT = removeSuffix(CT,"ador");
+ return true;
+ }
+ if (suffix(CT,"ivas") && suffix(R2,"ivas"))
+ {
+ CT = removeSuffix(CT,"ivas");
+ return true;
+ }
+ if (suffix(CT,"ivos") && suffix(R2,"ivos"))
+ {
+ CT = removeSuffix(CT,"ivos");
+ return true;
+ }
+ if (suffix(CT,"iras") && suffix(RV,"iras") && suffixPreceded(CT,"iras","e"))
+ {
+ CT = replaceSuffix(CT,"iras","ir");
+ return true;
+ }
+ }
+
+ // suffix length = 3
+ if (CT.Length >= 3)
+ {
+ if (suffix(CT,"eza") && suffix(R2,"eza"))
+ {
+ CT = removeSuffix(CT,"eza");
+ return true;
+ }
+ if (suffix(CT,"ico") && suffix(R2,"ico"))
+ {
+ CT = removeSuffix(CT,"ico");
+ return true;
+ }
+ if (suffix(CT,"ica") && suffix(R2,"ica"))
+ {
+ CT = removeSuffix(CT,"ica");
+ return true;
+ }
+ if (suffix(CT,"oso") && suffix(R2,"oso"))
+ {
+ CT = removeSuffix(CT,"oso");
+ return true;
+ }
+ if (suffix(CT,"osa") && suffix(R2,"osa"))
+ {
+ CT = removeSuffix(CT,"osa");
+ return true;
+ }
+ if (suffix(CT,"iva") && suffix(R2,"iva"))
+ {
+ CT = removeSuffix(CT,"iva");
+ return true;
+ }
+ if (suffix(CT,"ivo") && suffix(R2,"ivo"))
+ {
+ CT = removeSuffix(CT,"ivo");
+ return true;
+ }
+ if (suffix(CT,"ira") && suffix(RV,"ira") && suffixPreceded(CT,"ira","e"))
+ {
+ CT = replaceSuffix(CT,"ira","ir");
+ return true;
+ }
+ }
+
+ // no ending was removed by step1
+ return false;
+ }
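+
+ // Worked example: for the term "felicidades", R1 == "icidades" and
+ // R2 == "idades", so the length-6 "idades" rule above fires and step1
+ // reduces CT to "felic".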
+
+
+ /// <summary>
+ /// Verb suffixes.
+ ///
+ /// Search for the longest among the following suffixes in RV,
+ /// and if found, delete.
+ /// </summary>
+ /// <returns> false if no ending was removed </returns>
+ private bool step2()
+ {
+ if (RV == null)
+ {
+ return false;
+ }
+
+ // suffix length = 7
+ if (RV.Length >= 7)
+ {
+ if (suffix(RV,"issemos"))
+ {
+ CT = removeSuffix(CT,"issemos");
+ return true;
+ }
+ if (suffix(RV,"essemos"))
+ {
+ CT = removeSuffix(CT,"essemos");
+ return true;
+ }
+ if (suffix(RV,"assemos"))
+ {
+ CT = removeSuffix(CT,"assemos");
+ return true;
+ }
+ if (suffix(RV,"ariamos"))
+ {
+ CT = removeSuffix(CT,"ariamos");
+ return true;
+ }
+ if (suffix(RV,"eriamos"))
+ {
+ CT = removeSuffix(CT,"eriamos");
+ return true;
+ }
+ if (suffix(RV,"iriamos"))
+ {
+ CT = removeSuffix(CT,"iriamos");
+ return true;
+ }
+ }
+
+ // suffix length = 6
+ if (RV.Length >= 6)
+ {
+ if (suffix(RV,"iremos"))
+ {
+ CT = removeSuffix(CT,"iremos");
+ return true;
+ }
+ if (suffix(RV,"eremos"))
+ {
+ CT = removeSuffix(CT,"eremos");
+ return true;
+ }
+ if (suffix(RV,"aremos"))
+ {
+ CT = removeSuffix(CT,"aremos");
+ return true;
+ }
+ if (suffix(RV,"avamos"))
+ {
+ CT = removeSuffix(CT,"avamos");
+ return true;
+ }
+ if (suffix(RV,"iramos"))
+ {
+ CT = removeSuffix(CT,"iramos");
+ return true;
+ }
+ if (suffix(RV,"eramos"))
+ {
+ CT = removeSuffix(CT,"eramos");
+ return true;
+ }
+ if (suffix(RV,"aramos"))
+ {
+ CT = removeSuffix(CT,"aramos");
+ return true;
+ }
+ if (suffix(RV,"asseis"))
+ {
+ CT = removeSuffix(CT,"asseis");
+ return true;
+ }
+ if (suffix(RV,"esseis"))
+ {
+ CT = removeSuffix(CT,"esseis");
+ return true;
+ }
+ if (suffix(RV,"isseis"))
+ {
+ CT = removeSuffix(CT,"isseis");
+ return true;
+ }
+ if (suffix(RV,"arieis"))
+ {
+ CT = removeSuffix(CT,"arieis");
+ return true;
+ }
+ if (suffix(RV,"erieis"))
+ {
+ CT = removeSuffix(CT,"erieis");
+ return true;
+ }
+ if (suffix(RV,"irieis"))
+ {
+ CT = removeSuffix(CT,"irieis");
+ return true;
+ }
+ }
+
+
+ // suffix length = 5
+ if (RV.Length >= 5)
+ {
+ if (suffix(RV,"irmos"))
+ {
+ CT = removeSuffix(CT,"irmos");
+ return true;
+ }
+ if (suffix(RV,"iamos"))
+ {
+ CT = removeSuffix(CT,"iamos");
+ return true;
+ }
+ if (suffix(RV,"armos"))
+ {
+ CT = removeSuffix(CT,"armos");
+ return true;
+ }
+ if (suffix(RV,"ermos"))
+ {
+ CT = removeSuffix(CT,"ermos");
+ return true;
+ }
+ if (suffix(RV,"areis"))
+ {
+ CT = removeSuffix(CT,"areis");
+ return true;
+ }
+ if (suffix(RV,"ereis"))
+ {
+ CT = removeSuffix(CT,"ereis");
+ return true;
+ }
+ if (suffix(RV,"ireis"))
+ {
+ CT = removeSuffix(CT,"ireis");
+ return true;
+ }
+ if (suffix(RV,"asses"))
+ {
+ CT = removeSuffix(CT,"asses");
+ return true;
+ }
+ if (suffix(RV,"esses"))
+ {
+ CT = removeSuffix(CT,"esses");
+ return true;
+ }
+ if (suffix(RV,"isses"))
+ {
+ CT = removeSuffix(CT,"isses");
+ return true;
+ }
+ if (suffix(RV,"astes"))
+ {
+ CT = removeSuffix(CT,"astes");
+ return true;
+ }
+ if (suffix(RV,"assem"))
+ {
+ CT = removeSuffix(CT,"assem");
+ return true;
+ }
+ if (suffix(RV,"essem"))
+ {
+ CT = removeSuffix(CT,"essem");
+ return true;
+ }
+ if (suffix(RV,"issem"))
+ {
+ CT = removeSuffix(CT,"issem");
+ return true;
+ }
+ if (suffix(RV,"ardes"))
+ {
+ CT = removeSuffix(CT,"ardes");
+ return true;
+ }
+ if (suffix(RV,"erdes"))
+ {
+ CT = removeSuffix(CT,"erdes");
+ return true;
+ }
+ if (suffix(RV,"irdes"))
+ {
+ CT = removeSuffix(CT,"irdes");
+ return true;
+ }
+ if (suffix(RV,"ariam"))
+ {
+ CT = removeSuffix(CT,"ariam");
+ return true;
+ }
+ if (suffix(RV,"eriam"))
+ {
+ CT = removeSuffix(CT,"eriam");
+ return true;
+ }
+ if (suffix(RV,"iriam"))
+ {
+ CT = removeSuffix(CT,"iriam");
+ return true;
+ }
+ if (suffix(RV,"arias"))
+ {
+ CT = removeSuffix(CT,"arias");
+ return true;
+ }
+ if (suffix(RV,"erias"))
+ {
+ CT = removeSuffix(CT,"erias");
+ return true;
+ }
+ if (suffix(RV,"irias"))
+ {
+ CT = removeSuffix(CT,"irias");
+ return true;
+ }
+ if (suffix(RV,"estes"))
+ {
+ CT = removeSuffix(CT,"estes");
+ return true;
+ }
+ if (suffix(RV,"istes"))
+ {
+ CT = removeSuffix(CT,"istes");
+ return true;
+ }
+ if (suffix(RV,"areis"))
+ {
+ CT = removeSuffix(CT,"areis");
+ return true;
+ }
+ if (suffix(RV,"aveis"))
+ {
+ CT = removeSuffix(CT,"aveis");
+ return true;
+ }
+ }
+
+ // suffix length = 4
+ if (RV.Length >= 4)
+ {
+ if (suffix(RV,"aria"))
+ {
+ CT = removeSuffix(CT,"aria");
+ return true;
+ }
+ if (suffix(RV,"eria"))
+ {
+ CT = removeSuffix(CT,"eria");
+ return true;
+ }
+ if (suffix(RV,"iria"))
+ {
+ CT = removeSuffix(CT,"iria");
+ return true;
+ }
+ if (suffix(RV,"asse"))
+ {
+ CT = removeSuffix(CT,"asse");
+ return true;
+ }
+ if (suffix(RV,"esse"))
+ {
+ CT = removeSuffix(CT,"esse");
+ return true;
+ }
+ if (suffix(RV,"isse"))
+ {
+ CT = removeSuffix(CT,"isse");
+ return true;
+ }
+ if (suffix(RV,"aste"))
+ {
+ CT = removeSuffix(CT,"aste");
+ return true;
+ }
+ if (suffix(RV,"este"))
+ {
+ CT = removeSuffix(CT,"este");
+ return true;
+ }
+ if (suffix(RV,"iste"))
+ {
+ CT = removeSuffix(CT,"iste");
+ return true;
+ }
+ if (suffix(RV,"arei"))
+ {
+ CT = removeSuffix(CT,"arei");
+ return true;
+ }
+ if (suffix(RV,"erei"))
+ {
+ CT = removeSuffix(CT,"erei");
+ return true;
+ }
+ if (suffix(RV,"irei"))
+ {
+ CT = removeSuffix(CT,"irei");
+ return true;
+ }
+ if (suffix(RV,"aram"))
+ {
+ CT = removeSuffix(CT,"aram");
+ return true;
+ }
+ if (suffix(RV,"eram"))
+ {
+ CT = removeSuffix(CT,"eram");
+ return true;
+ }
+ if (suffix(RV,"iram"))
+ {
+ CT = removeSuffix(CT,"iram");
+ return true;
+ }
+ if (suffix(RV,"avam"))
+ {
+ CT = removeSuffix(CT,"avam");
+ return true;
+ }
+ if (suffix(RV,"arem"))
+ {
+ CT = removeSuffix(CT,"arem");
+ return true;
+ }
+ if (suffix(RV,"erem"))
+ {
+ CT = removeSuffix(CT,"erem");
+ return true;
+ }
+ if (suffix(RV,"irem"))
+ {
+ CT = removeSuffix(CT,"irem");
+ return true;
+ }
+ if (suffix(RV,"ando"))
+ {
+ CT = removeSuffix(CT,"ando");
+ return true;
+ }
+ if (suffix(RV,"endo"))
+ {
+ CT = removeSuffix(CT,"endo");
+ return true;
+ }
+ if (suffix(RV,"indo"))
+ {
+ CT = removeSuffix(CT,"indo");
+ return true;
+ }
+ if (suffix(RV,"arao"))
+ {
+ CT = removeSuffix(CT,"arao");
+ return true;
+ }
+ if (suffix(RV,"erao"))
+ {
+ CT = removeSuffix(CT,"erao");
+ return true;
+ }
+ if (suffix(RV,"irao"))
+ {
+ CT = removeSuffix(CT,"irao");
+ return true;
+ }
+ if (suffix(RV,"adas"))
+ {
+ CT = removeSuffix(CT,"adas");
+ return true;
+ }
+ if (suffix(RV,"idas"))
+ {
+ CT = removeSuffix(CT,"idas");
+ return true;
+ }
+ if (suffix(RV,"aras"))
+ {
+ CT = removeSuffix(CT,"aras");
+ return true;
+ }
+ if (suffix(RV,"eras"))
+ {
+ CT = removeSuffix(CT,"eras");
+ return true;
+ }
+ if (suffix(RV,"iras"))
+ {
+ CT = removeSuffix(CT,"iras");
+ return true;
+ }
+ if (suffix(RV,"avas"))
+ {
+ CT = removeSuffix(CT,"avas");
+ return true;
+ }
+ if (suffix(RV,"ares"))
+ {
+ CT = removeSuffix(CT,"ares");
+ return true;
+ }
+ if (suffix(RV,"eres"))
+ {
+ CT = removeSuffix(CT,"eres");
+ return true;
+ }
+ if (suffix(RV,"ires"))
+ {
+ CT = removeSuffix(CT,"ires");
+ return true;
+ }
+ if (suffix(RV,"ados"))
+ {
+ CT = removeSuffix(CT,"ados");
+ return true;
+ }
+ if (suffix(RV,"idos"))
+ {
+ CT = removeSuffix(CT,"idos");
+ return true;
+ }
+ if (suffix(RV,"amos"))
+ {
+ CT = removeSuffix(CT,"amos");
+ return true;
+ }
+ if (suffix(RV,"emos"))
+ {
+ CT = removeSuffix(CT,"emos");
+ return true;
+ }
+ if (suffix(RV,"imos"))
+ {
+ CT = removeSuffix(CT,"imos");
+ return true;
+ }
+ if (suffix(RV,"iras"))
+ {
+ CT = removeSuffix(CT,"iras");
+ return true;
+ }
+ if (suffix(RV,"ieis"))
+ {
+ CT = removeSuffix(CT,"ieis");
+ return true;
+ }
+ }
+
+ // suffix length = 3
+ if (RV.Length >= 3)
+ {
+ if (suffix(RV,"ada"))
+ {
+ CT = removeSuffix(CT,"ada");
+ return true;
+ }
+ if (suffix(RV,"ida"))
+ {
+ CT = removeSuffix(CT,"ida");
+ return true;
+ }
+ if (suffix(RV,"ara"))
+ {
+ CT = removeSuffix(CT,"ara");
+ return true;
+ }
+ if (suffix(RV,"era"))
+ {
+ CT = removeSuffix(CT,"era");
+ return true;
+ }
+ if (suffix(RV,"ira"))
+ {
+ CT = removeSuffix(CT,"ava");
+ return true;
+ }
+ if (suffix(RV,"iam"))
+ {
+ CT = removeSuffix(CT,"iam");
+ return true;
+ }
+ if (suffix(RV,"ado"))
+ {
+ CT = removeSuffix(CT,"ado");
+ return true;
+ }
+ if (suffix(RV,"ido"))
+ {
+ CT = removeSuffix(CT,"ido");
+ return true;
+ }
+ if (suffix(RV,"ias"))
+ {
+ CT = removeSuffix(CT,"ias");
+ return true;
+ }
+ if (suffix(RV,"ais"))
+ {
+ CT = removeSuffix(CT,"ais");
+ return true;
+ }
+ if (suffix(RV,"eis"))
+ {
+ CT = removeSuffix(CT,"eis");
+ return true;
+ }
+ if (suffix(RV,"ira"))
+ {
+ CT = removeSuffix(CT,"ira");
+ return true;
+ }
+ if (suffix(RV,"ear"))
+ {
+ CT = removeSuffix(CT,"ear");
+ return true;
+ }
+ }
+
+ // suffix length = 2
+ if (RV.Length >= 2)
+ {
+ if (suffix(RV,"ia"))
+ {
+ CT = removeSuffix(CT,"ia");
+ return true;
+ }
+ if (suffix(RV,"ei"))
+ {
+ CT = removeSuffix(CT,"ei");
+ return true;
+ }
+ if (suffix(RV,"am"))
+ {
+ CT = removeSuffix(CT,"am");
+ return true;
+ }
+ if (suffix(RV,"em"))
+ {
+ CT = removeSuffix(CT,"em");
+ return true;
+ }
+ if (suffix(RV,"ar"))
+ {
+ CT = removeSuffix(CT,"ar");
+ return true;
+ }
+ if (suffix(RV,"er"))
+ {
+ CT = removeSuffix(CT,"er");
+ return true;
+ }
+ if (suffix(RV,"ir"))
+ {
+ CT = removeSuffix(CT,"ir");
+ return true;
+ }
+ if (suffix(RV,"as"))
+ {
+ CT = removeSuffix(CT,"as");
+ return true;
+ }
+ if (suffix(RV,"es"))
+ {
+ CT = removeSuffix(CT,"es");
+ return true;
+ }
+ if (suffix(RV,"is"))
+ {
+ CT = removeSuffix(CT,"is");
+ return true;
+ }
+ if (suffix(RV,"eu"))
+ {
+ CT = removeSuffix(CT,"eu");
+ return true;
+ }
+ if (suffix(RV,"iu"))
+ {
+ CT = removeSuffix(CT,"iu");
+ return true;
+ }
+ if (suffix(RV,"iu"))
+ {
+ CT = removeSuffix(CT,"iu");
+ return true;
+ }
+ if (suffix(RV,"ou"))
+ {
+ CT = removeSuffix(CT,"ou");
+ return true;
+ }
+ }
+
+ // no ending was removed by step2
+ return false;
+ }
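+
+ // Worked example: for CT == "cantaram", RV == "taram"; nothing matches in
+ // step1, and the length-4 "aram" branch here strips the verb ending,
+ // leaving CT == "cant".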
+
+ /// <summary>
+ /// Delete suffix 'i' if in RV and preceded by 'c'
+ ///
+ /// </summary>
+ private void step3()
+ {
+ if (RV == null)
+ {
+ return;
+ }
+
+ if (suffix(RV,"i") && suffixPreceded(RV,"i","c"))
+ {
+ CT = removeSuffix(CT,"i");
+ }
+
+ }
+
+ /// <summary>
+ /// Residual suffix
+ ///
+ /// If the word ends with one of the suffixes (os a i o á í ó)
+ /// in RV, delete it
+ ///
+ /// </summary>
+ private void step4()
+ {
+ if (RV == null)
+ {
+ return;
+ }
+
+ if (suffix(RV,"os"))
+ {
+ CT = removeSuffix(CT,"os");
+ return;
+ }
+ if (suffix(RV,"a"))
+ {
+ CT = removeSuffix(CT,"a");
+ return;
+ }
+ if (suffix(RV,"i"))
+ {
+ CT = removeSuffix(CT,"i");
+ return;
+ }
+ if (suffix(RV,"o"))
+ {
+ CT = removeSuffix(CT,"o");
+ return;
+ }
+
+ }
+
+ /// <summary>
+ /// If the word ends with one of (e é ê) in RV, delete it,
+ /// and if preceded by 'gu' (or 'ci') with the 'u' (or 'i') in RV,
+ /// delete the 'u' (or 'i')
+ ///
+ /// Or if the word ends in 'ç', remove the cedilla
+ ///
+ /// </summary>
+ private void step5()
+ {
+ if (RV == null)
+ {
+ return;
+ }
+
+ if (suffix(RV,"e"))
+ {
+ if (suffixPreceded(RV,"e","gu"))
+ {
+ CT = removeSuffix(CT,"e");
+ CT = removeSuffix(CT,"u");
+ return;
+ }
+
+ if (suffixPreceded(RV,"e","ci"))
+ {
+ CT = removeSuffix(CT,"e");
+ CT = removeSuffix(CT,"i");
+ return;
+ }
+
+ CT = removeSuffix(CT,"e");
+ return;
+ }
+ }
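+
+ // Worked example: for CT == "chegue", RV == "gue"; the final 'e' is
+ // preceded by "gu", so both the 'e' and the 'u' are removed, leaving
+ // CT == "cheg".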
+
+ /// <summary>
+ /// For logging and debugging purposes
+ /// </summary>
+ /// <returns> TERM, CT, RV, R1 and R2 </returns>
+ public virtual string log()
+ {
+ return " (TERM = " + TERM + ")" + " (CT = " + CT + ")" + " (RV = " + RV + ")" + " (R1 = " + R1 + ")" + " (R2 = " + R2 + ")";
+ }
+
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ca/CatalanAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ca/CatalanAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ca/CatalanAnalyzer.cs
new file mode 100644
index 0000000..939d358
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ca/CatalanAnalyzer.cs
@@ -0,0 +1,154 @@
+using System;
+
+namespace org.apache.lucene.analysis.ca
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using SnowballFilter = org.apache.lucene.analysis.snowball.SnowballFilter;
+ using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using ElisionFilter = org.apache.lucene.analysis.util.ElisionFilter;
+ using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+ using Version = org.apache.lucene.util.Version;
+ using CatalanStemmer = org.tartarus.snowball.ext.CatalanStemmer;
+
+ /// <summary>
+ /// <seealso cref="Analyzer"/> for Catalan.
+ /// <para>
+ /// <a name="version"/>
+ /// </para>
+ /// <para>You must specify the required <seealso cref="Version"/>
+ /// compatibility when creating CatalanAnalyzer:
+ /// <ul>
+ /// <li> As of 3.6, ElisionFilter with a set of Catalan
+ /// contractions is used by default.
+ /// </ul>
+ /// </para>
+ /// </summary>
+ public sealed class CatalanAnalyzer : StopwordAnalyzerBase
+ {
+ private readonly CharArraySet stemExclusionSet;
+
+ /// <summary>
+ /// File containing default Catalan stopwords. </summary>
+ public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+ private static readonly CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList("d", "l", "m", "n", "s", "t"), true));
+
+ /// <summary>
+ /// Returns an unmodifiable instance of the default stop words set. </summary>
+ /// <returns> default stop words set. </returns>
+ public static CharArraySet DefaultStopSet
+ {
+ get
+ {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+ }
+
+ /// <summary>
+ /// Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ /// accesses the static final set for the first time.
+ /// </summary>
+ private class DefaultSetHolder
+ {
+ internal static readonly CharArraySet DEFAULT_STOP_SET;
+
+ static DefaultSetHolder()
+ {
+ try
+ {
+ DEFAULT_STOP_SET = loadStopwordSet(false, typeof(CatalanAnalyzer), DEFAULT_STOPWORD_FILE, "#");
+ }
+ catch (IOException)
+ {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new Exception("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the default stop words: <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
+ /// </summary>
+ public CatalanAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ /// <param name="matchVersion"> lucene compatibility version </param>
+ /// <param name="stopwords"> a stopword set </param>
+ public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ /// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
+ /// stemming.
+ /// </summary>
+ /// <param name="matchVersion"> lucene compatibility version </param>
+ /// <param name="stopwords"> a stopword set </param>
+ /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
+ public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
+ {
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
+ }
+
+ /// <summary>
+ /// Creates a
+ /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+ /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
+ /// </summary>
+ /// <returns> A
+ /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+ /// built from an <seealso cref="StandardTokenizer"/> filtered with
+ /// <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>,
+ /// <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
+ /// provided and <seealso cref="SnowballFilter"/>. </returns>
+ protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
+ Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(matchVersion, source);
+ if (matchVersion.onOrAfter(Version.LUCENE_36))
+ {
+ result = new ElisionFilter(result, DEFAULT_ARTICLES);
+ }
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if (!stemExclusionSet.Empty)
+ {
+ result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+ }
+ result = new SnowballFilter(result, new CatalanStemmer());
+ return new TokenStreamComponents(source, result);
+ }
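+
+ // Illustrative usage sketch (API names mirror the Java original and are
+ // assumed to survive the port):
+ //
+ //   var analyzer = new CatalanAnalyzer(Version.LUCENE_CURRENT);
+ //   TokenStream ts = analyzer.tokenStream("body", new StringReader("l'amic"));
+ //   // the ElisionFilter strips the article, leaving the token "amic"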
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/BaseCharFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/BaseCharFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/BaseCharFilter.cs
new file mode 100644
index 0000000..1127842
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/BaseCharFilter.cs
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.Diagnostics;
+
+namespace Lucene.Net.Analysis.CharFilter
+{
+ /// <summary>
+ /// Base utility class for implementing a <seealso cref="CharFilter"/>.
+ /// You subclass this, and then record mappings by calling
+ /// <seealso cref="#addOffCorrectMap"/>, and then invoke the correct
+ /// method to correct an offset.
+ /// </summary>
+ public abstract class BaseCharFilter : CharFilter
+ {
+
+ private int[] offsets;
+ private int[] diffs;
+ private int size = 0;
+
+ public BaseCharFilter(Reader @in) : base(@in)
+ {
+ }
+
+ /// <summary>
+ /// Retrieve the corrected offset. </summary>
+ protected internal override int correct(int currentOff)
+ {
+ if (offsets == null || currentOff < offsets[0])
+ {
+ return currentOff;
+ }
+
+ int hi = size - 1;
+ if (currentOff >= offsets[hi])
+ {
+ return currentOff + diffs[hi];
+ }
+
+ int lo = 0;
+ int mid = -1;
+
+ while (hi >= lo)
+ {
+ mid = (int)((uint)(lo + hi) >> 1);
+ if (currentOff < offsets[mid])
+ {
+ hi = mid - 1;
+ }
+ else if (currentOff > offsets[mid])
+ {
+ lo = mid + 1;
+ }
+ else
+ {
+ return currentOff + diffs[mid];
+ }
+ }
+
+ if (currentOff < offsets[mid])
+ {
+ return mid == 0 ? currentOff : currentOff + diffs[mid - 1];
+ }
+ else
+ {
+ return currentOff + diffs[mid];
+ }
+ }
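+
+ // Worked example (hypothetical data): with offsets == [5, 10] and
+ // diffs == [1, 3], correct(3) == 3 (before the first mapping),
+ // correct(7) == 8 (7 + diffs[0]), and correct(12) == 15 (12 + diffs[1]).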
+
+ protected internal virtual int LastCumulativeDiff
+ {
+ get
+ {
+ return offsets == null ? 0 : diffs[size-1];
+ }
+ }
+
+ /// <summary>
+ /// <para>
+ /// Adds an offset correction mapping at the given output stream offset.
+ /// </para>
+ /// <para>
+ /// Assumption: the offset given with each successive call to this method
+ /// will not be smaller than the offset given at the previous invocation.
+ /// </para>
+ /// </summary>
+ /// <param name="off"> The output stream offset at which to apply the correction </param>
+ /// <param name="cumulativeDiff"> The input offset is given by adding this
+ /// to the output offset </param>
+ protected internal virtual void addOffCorrectMap(int off, int cumulativeDiff)
+ {
+ if (offsets == null)
+ {
+ offsets = new int[64];
+ diffs = new int[64];
+ }
+ else if (size == offsets.Length)
+ {
+ offsets = ArrayUtil.grow(offsets);
+ diffs = ArrayUtil.grow(diffs);
+ }
+
+ Debug.Assert(size == 0 || off >= offsets[size - 1], "Offset #" + size + " (" + off + ") is less than the last recorded offset " + offsets[size - 1] + "\n" + string.Join(", ", offsets) + "\n" + string.Join(", ", diffs));
+
+ if (size == 0 || off != offsets[size - 1])
+ {
+ offsets[size] = off;
+ diffs[size++] = cumulativeDiff;
+ } // Overwrite the diff at the last recorded offset
+ else
+ {
+ diffs[size - 1] = cumulativeDiff;
+ }
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilterFactory.cs
new file mode 100644
index 0000000..2d527fc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilterFactory.cs
@@ -0,0 +1,67 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.charfilter
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using CharFilterFactory = org.apache.lucene.analysis.util.CharFilterFactory;
+
+
+ /// <summary>
+ /// Factory for <seealso cref="HTMLStripCharFilter"/>.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_html" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" />
+ /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// </summary>
+ public class HTMLStripCharFilterFactory : CharFilterFactory
+ {
+ internal readonly HashSet<string> escapedTags;
+ internal static readonly Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+");
+
+ /// <summary>
+ /// Creates a new HTMLStripCharFilterFactory </summary>
+ public HTMLStripCharFilterFactory(IDictionary<string, string> args) : base(args)
+ {
+ escapedTags = getSet(args, "escapedTags");
+ if (args.Count > 0)
+ {
+ throw new System.ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override HTMLStripCharFilter create(Reader input)
+ {
+ HTMLStripCharFilter charFilter;
+ if (null == escapedTags)
+ {
+ charFilter = new HTMLStripCharFilter(input);
+ }
+ else
+ {
+ charFilter = new HTMLStripCharFilter(input, escapedTags);
+ }
+ return charFilter;
+ }
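+
+ // Illustrative usage sketch (the args dictionary is consumed by the base
+ // factory; names follow the Java original):
+ //
+ //   var args = new Dictionary<string, string> { { "escapedTags", "a, title" } };
+ //   var factory = new HTMLStripCharFilterFactory(args);
+ //   // factory.create(reader) then strips HTML except <a> and <title> tags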
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilter.cs
new file mode 100644
index 0000000..5a148be
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilter.cs
@@ -0,0 +1,240 @@
+using System;
+using System.Diagnostics;
+using System.Collections.Generic;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+using Lucene.Net.Analysis.CharFilter;
+
+namespace org.apache.lucene.analysis.charfilter
+{
+
+
+ using RollingCharBuffer = org.apache.lucene.analysis.util.RollingCharBuffer;
+ using CharsRef = org.apache.lucene.util.CharsRef;
+ using CharSequenceOutputs = org.apache.lucene.util.fst.CharSequenceOutputs;
+ using FST = org.apache.lucene.util.fst.FST;
+ using Outputs = org.apache.lucene.util.fst.Outputs;
+
+ /// <summary>
+ /// Simplistic <seealso cref="CharFilter"/> that applies the mappings
+ /// contained in a <seealso cref="NormalizeCharMap"/> to the character
+ /// stream, and correcting the resulting changes to the
+ /// offsets. Matching is greedy (longest pattern matching at
+ /// a given point wins). Replacement is allowed to be the
+ /// empty string.
+ /// </summary>
+
+ public class MappingCharFilter : BaseCharFilter
+ {
+
+ private readonly Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
+ private readonly FST<CharsRef> map;
+ private readonly FST.BytesReader fstReader;
+ private readonly RollingCharBuffer buffer = new RollingCharBuffer();
+ private readonly FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
+ private readonly IDictionary<char?, FST.Arc<CharsRef>> cachedRootArcs;
+
+ private CharsRef replacement;
+ private int replacementPointer;
+ private int inputOff;
+
+ /// <summary>
+ /// Default constructor that takes a <seealso cref="Reader"/>. </summary>
+ public MappingCharFilter(NormalizeCharMap normMap, Reader @in) : base(@in)
+ {
+ buffer.reset(@in);
+
+ map = normMap.map;
+ cachedRootArcs = normMap.cachedRootArcs;
+
+ if (map != null)
+ {
+ fstReader = map.BytesReader;
+ }
+ else
+ {
+ fstReader = null;
+ }
+ }
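+
+ // Illustrative behavior (hypothetical map): with a NormalizeCharMap built
+ // from the single rule "aa" -> "x", reading the input "aaa" produces "xa";
+ // the match at position 0 is greedy, and matching restarts after it.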
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+ public override void reset()
+ {
+ input.reset();
+ buffer.reset(input);
+ replacement = null;
+ inputOff = 0;
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public int read() throws java.io.IOException
+ public override int read()
+ {
+
+ //System.out.println("\nread");
+ while (true)
+ {
+
+ if (replacement != null && replacementPointer < replacement.length)
+ {
+ //System.out.println(" return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
+ return replacement.chars[replacement.offset + replacementPointer++];
+ }
+
+ // TODO: a more efficient approach would be Aho/Corasick's
+ // algorithm
+ // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
+ // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
+ //
+ // I think this would be (almost?) equivalent to 1) adding
+ // epsilon arcs from all final nodes back to the init
+ // node in the FST, 2) adding a .* (skip any char)
+ // loop on the initial node, and 3) determinizing
+ // that. Then we would not have to restart matching
+ // at each position.
+
+ int lastMatchLen = -1;
+ CharsRef lastMatch = null;
+
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int firstCH = buffer.get(inputOff);
+ int firstCH = buffer.get(inputOff);
+ if (firstCH != -1)
+ {
+ FST.Arc<CharsRef> arc;
+ // a Dictionary indexer throws on a missing key, unlike Java's HashMap.get
+ cachedRootArcs.TryGetValue(Convert.ToChar((char) firstCH), out arc);
+ if (arc != null)
+ {
+ if (!FST.targetHasArcs(arc))
+ {
+ // Fast pass for single character match:
+ Debug.Assert(arc.Final);
+ lastMatchLen = 1;
+ lastMatch = arc.output;
+ }
+ else
+ {
+ int lookahead = 0;
+ CharsRef output = arc.output;
+ while (true)
+ {
+ lookahead++;
+
+ if (arc.Final)
+ {
+ // Match! (to node is final)
+ lastMatchLen = lookahead;
+ lastMatch = outputs.add(output, arc.nextFinalOutput);
+ // Greedy: keep searching to see if there's a
+ // longer match...
+ }
+
+ if (!FST.targetHasArcs(arc))
+ {
+ break;
+ }
+
+ int ch = buffer.get(inputOff + lookahead);
+ if (ch == -1)
+ {
+ break;
+ }
+ if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null)
+ {
+ // Dead end
+ break;
+ }
+ output = outputs.add(output, arc.output);
+ }
+ }
+ }
+ }
+
+ if (lastMatch != null)
+ {
+ inputOff += lastMatchLen;
+ //System.out.println(" match! len=" + lastMatchLen + " repl=" + lastMatch);
+
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int diff = lastMatchLen - lastMatch.length;
+ int diff = lastMatchLen - lastMatch.length;
+
+ if (diff != 0)
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int prevCumulativeDiff = getLastCumulativeDiff();
+ int prevCumulativeDiff = LastCumulativeDiff;
+ if (diff > 0)
+ {
+ // Replacement is shorter than matched input:
+ addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
+ }
+ else
+ {
+ // Replacement is longer than matched input: remap
+ // the "extra" chars all back to the same input
+ // offset:
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int outputStart = inputOff - prevCumulativeDiff;
+ int outputStart = inputOff - prevCumulativeDiff;
+ for (int extraIDX = 0;extraIDX < -diff;extraIDX++)
+ {
+ addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
+ }
+ }
+ }
+
+ replacement = lastMatch;
+ replacementPointer = 0;
+
+ }
+ else
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int ret = buffer.get(inputOff);
+ int ret = buffer.get(inputOff);
+ if (ret != -1)
+ {
+ inputOff++;
+ buffer.freeBefore(inputOff);
+ }
+ return ret;
+ }
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public int read(char[] cbuf, int off, int len) throws java.io.IOException
+ public override int read(char[] cbuf, int off, int len)
+ {
+ int numRead = 0;
+ for (int i = off; i < off + len; i++)
+ {
+ int c = read();
+ if (c == -1)
+ {
+ break;
+ }
+ cbuf[i] = (char) c;
+ numRead++;
+ }
+
+ return numRead == 0 ? -1 : numRead;
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilterFactory.cs
new file mode 100644
index 0000000..4489b7c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilterFactory.cs
@@ -0,0 +1,184 @@
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Util;
+
+namespace org.apache.lucene.analysis.charfilter
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using AbstractAnalysisFactory = AbstractAnalysisFactory;
+ using CharFilterFactory = org.apache.lucene.analysis.util.CharFilterFactory;
+ using MultiTermAwareComponent = org.apache.lucene.analysis.util.MultiTermAwareComponent;
+ using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+ using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+
+ /// <summary>
+ /// Factory for <seealso cref="MappingCharFilter"/>.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_map" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <charFilter class="solr.MappingCharFilterFactory" mapping="mapping.txt"/>
+ /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ ///
+ /// @since Solr 1.4
+ /// </summary>
+ public class MappingCharFilterFactory : CharFilterFactory, ResourceLoaderAware, MultiTermAwareComponent
+ {
+
+ protected internal NormalizeCharMap normMap;
+ private readonly string mapping;
+
+ /// <summary>
+ /// Creates a new MappingCharFilterFactory </summary>
+ public MappingCharFilterFactory(IDictionary<string, string> args) : base(args)
+ {
+ mapping = get(args, "mapping");
+ if (args.Count > 0)
+ {
+ throw new System.ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ // TODO: this should use inputstreams from the loader, not File!
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void inform(org.apache.lucene.analysis.util.ResourceLoader loader) throws java.io.IOException
+ public virtual void inform(ResourceLoader loader)
+ {
+ if (mapping != null)
+ {
+ IList<string> wlist = null;
+ File mappingFile = new File(mapping);
+ if (mappingFile.exists())
+ {
+ wlist = getLines(loader, mapping);
+ }
+ else
+ {
+ IList<string> files = splitFileNames(mapping);
+ wlist = new List<string>();
+ foreach (string file in files)
+ {
+ IList<string> lines = getLines(loader, file.Trim());
+ wlist.AddRange(lines);
+ }
+ }
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+ NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+ parseRules(wlist, builder);
+ normMap = builder.build();
+ if (normMap.map == null)
+ {
+ // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
+ // so just set the whole map to null
+ normMap = null;
+ }
+ }
+ }
+
+ public override Reader create(Reader input)
+ {
+ // if the map is null, it means there are actually no mappings... just return the original stream
+ // as there is nothing to do here.
+ return normMap == null ? input : new MappingCharFilter(normMap,input);
+ }
+
+ // "source" => "target"
+ internal static Pattern p = Pattern.compile("\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$");
+
+ protected internal virtual void parseRules(IList<string> rules, NormalizeCharMap.Builder builder)
+ {
+ foreach (string rule in rules)
+ {
+ Matcher m = p.matcher(rule);
+ if (!m.find())
+ {
+ throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
+ }
+ builder.add(parseString(m.group(1)), parseString(m.group(2)));
+ }
+ }
+
+ internal char[] @out = new char[256];
+
+ protected internal virtual string parseString(string s)
+ {
+ int readPos = 0;
+ int len = s.Length;
+ int writePos = 0;
+ while (readPos < len)
+ {
+ char c = s[readPos++];
+ if (c == '\\')
+ {
+ if (readPos >= len)
+ {
+ throw new System.ArgumentException("Invalid escaped char in [" + s + "]");
+ }
+ c = s[readPos++];
+ switch (c)
+ {
+ case '\\' :
+ c = '\\';
+ break;
+ case '"' :
+ c = '"';
+ break;
+ case 'n' :
+ c = '\n';
+ break;
+ case 't' :
+ c = '\t';
+ break;
+ case 'r' :
+ c = '\r';
+ break;
+ case 'b' :
+ c = '\b';
+ break;
+ case 'f' :
+ c = '\f';
+ break;
+ case 'u' :
+ if (readPos + 3 >= len)
+ {
+ throw new System.ArgumentException("Invalid escaped char in [" + s + "]");
+ }
+ c = (char)System.Convert.ToInt32(s.Substring(readPos, 4), 16); // int.Parse has no radix overload
+ readPos += 4;
+ break;
+ }
+ }
+ @out[writePos++] = c;
+ }
+ return new string(@out, 0, writePos);
+ }
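+
+ // Illustrative examples: the rule text "a\tb" parses to "a" + tab + "b",
+ // and "\u0041" parses to "A" (exactly four hex digits after \u).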
+
+ public virtual AbstractAnalysisFactory MultiTermComponent
+ {
+ get
+ {
+ return this;
+ }
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/NormalizeCharMap.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/NormalizeCharMap.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/NormalizeCharMap.cs
new file mode 100644
index 0000000..ade4318
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/NormalizeCharMap.cs
@@ -0,0 +1,162 @@
+using System;
+using System.Diagnostics;
+using System.Collections.Generic;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.charfilter
+{
+
+
+ using CharsRef = org.apache.lucene.util.CharsRef;
+ using IntsRef = org.apache.lucene.util.IntsRef;
+ using Builder = org.apache.lucene.util.fst.Builder;
+ using CharSequenceOutputs = org.apache.lucene.util.fst.CharSequenceOutputs;
+ using FST = org.apache.lucene.util.fst.FST;
+ using Outputs = org.apache.lucene.util.fst.Outputs;
+ using Util = org.apache.lucene.util.fst.Util;
+
+ // TODO: save/load?
+
+ /// <summary>
+ /// Holds a map of String input to String output, to be used
+ /// with <seealso cref="MappingCharFilter"/>. Use the <seealso cref="Builder"/>
+ /// to create this.
+ /// </summary>
+ public class NormalizeCharMap
+ {
+
+ internal readonly FST<CharsRef> map;
+ internal readonly IDictionary<char?, FST.Arc<CharsRef>> cachedRootArcs = new Dictionary<char?, FST.Arc<CharsRef>>();
+
+ // Use the builder to create:
+ private NormalizeCharMap(FST<CharsRef> map)
+ {
+ this.map = map;
+ if (map != null)
+ {
+ try
+ {
+ // Pre-cache root arcs:
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.CharsRef> scratchArc = new org.apache.lucene.util.fst.FST.Arc<>();
+ FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.BytesReader fstReader = map.getBytesReader();
+ FST.BytesReader fstReader = map.BytesReader;
+ map.getFirstArc(scratchArc);
+ if (FST.targetHasArcs(scratchArc))
+ {
+ map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
+ while (true)
+ {
+ Debug.Assert(scratchArc.label != FST.END_LABEL);
+ cachedRootArcs[Convert.ToChar((char) scratchArc.label)] = (new FST.Arc<CharsRef>()).copyFrom(scratchArc);
+ if (scratchArc.Last)
+ {
+ break;
+ }
+ map.readNextRealArc(scratchArc, fstReader);
+ }
+ }
+ //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
+ }
+ catch (IOException ioe)
+ {
+ // Bogus FST IOExceptions!! (will never happen)
+ throw new Exception(ioe.Message, ioe);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Builds an NormalizeCharMap.
+ /// <para>
+ /// Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap
+ /// @lucene.experimental
+ /// </para>
+ /// </summary>
+ public class Builder
+ {
+
+ internal readonly IDictionary<string, string> pendingPairs = new SortedDictionary<string, string>();
+
+ /// <summary>
+ /// Records a replacement to be applied to the input
+ /// stream. Whenever <code>singleMatch</code> occurs in
+ /// the input, it will be replaced with
+ /// <code>replacement</code>.
+ /// </summary>
+ /// <param name="match"> input String to be replaced </param>
+ /// <param name="replacement"> output String </param>
+ /// <exception cref="ArgumentException"> if
+ /// <code>match</code> is the empty string, or has
+ /// already been added </exception>
+ public virtual void add(string match, string replacement)
+ {
+ if (match.Length == 0)
+ {
+ throw new System.ArgumentException("cannot match the empty string");
+ }
+ if (pendingPairs.ContainsKey(match))
+ {
+ throw new System.ArgumentException("match \"" + match + "\" was already added");
+ }
+ pendingPairs[match] = replacement;
+ }
+
+ /// <summary>
+ /// Builds the NormalizeCharMap; call this once you
+ /// are done calling <seealso cref="#add"/>.
+ /// </summary>
+ public virtual NormalizeCharMap build()
+ {
+
+ FST<CharsRef> map;
+ try
+ {
+ Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
+ org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
+ IntsRef scratch = new IntsRef();
+ foreach (KeyValuePair<string, string> ent in pendingPairs)
+ {
+ builder.add(Util.toUTF16(ent.Key, scratch), new CharsRef(ent.Value));
+ }
+ map = builder.finish();
+ pendingPairs.Clear();
+ }
+ catch (IOException ioe)
+ {
+ // Bogus FST IOExceptions!! (will never happen)
+ throw new Exception(ioe.Message, ioe);
+ }
+
+ return new NormalizeCharMap(map);
+ }
+ }
+ }
+
+}
\ No newline at end of file
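A minimal sketch of how this Builder is meant to be driven once the port compiles, mirroring the Java usage; the MappingCharFilter wiring in the last line is an assumption carried over from the Java API, not something established by this commit:

    var builder = new NormalizeCharMap.Builder();
    builder.add("ß", "ss");                  // each add() records one input -> output pair
    builder.add("æ", "ae");
    NormalizeCharMap norm = builder.build(); // freezes the pairs into an FST; the builder is cleared
    // var filtered = new MappingCharFilter(norm, reader); // assumed wiring, per the Java original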
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
new file mode 100644
index 0000000..801fd45
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
@@ -0,0 +1,118 @@
+using System;
+using System.IO;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+ using Version = org.apache.lucene.util.Version;
+
+ /// <summary>
+ /// An <seealso cref="Analyzer"/> that tokenizes text with <seealso cref="StandardTokenizer"/>,
+ /// normalizes content with <seealso cref="CJKWidthFilter"/>, folds case with
+ /// <seealso cref="LowerCaseFilter"/>, forms bigrams of CJK with <seealso cref="CJKBigramFilter"/>,
+ /// and filters stopwords with <seealso cref="StopFilter"/>
+ /// </summary>
+ public sealed class CJKAnalyzer : StopwordAnalyzerBase
+ {
+ /// <summary>
+ /// File containing default CJK stopwords.
+ /// <para>
+ /// Currently it contains some common English words that are not usually
+ /// useful for searching and some double-byte punctuation marks.
+ /// </para>
+ /// </summary>
+ public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+ /// <summary>
+ /// Returns an unmodifiable instance of the default stop-words set. </summary>
+ /// <returns> an unmodifiable instance of the default stop-words set. </returns>
+ public static CharArraySet DefaultStopSet
+ {
+ get
+ {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+ }
+
+ private class DefaultSetHolder
+ {
+ internal static readonly CharArraySet DEFAULT_STOP_SET;
+
+ static DefaultSetHolder()
+ {
+ try
+ {
+ DEFAULT_STOP_SET = loadStopwordSet(false, typeof(CJKAnalyzer), DEFAULT_STOPWORD_FILE, "#");
+ }
+ catch (IOException)
+ {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new Exception("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /// <summary>
+ /// Builds an analyzer which removes words in <seealso cref="DefaultStopSet"/>.
+ /// </summary>
+ public CJKAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words
+ /// </summary>
+ /// <param name="matchVersion">
+ /// lucene compatibility version </param>
+ /// <param name="stopwords">
+ /// a stopword set </param>
+ public CJKAnalyzer(Version matchVersion, CharArraySet stopwords) : base(matchVersion, stopwords)
+ {
+ }
+
+ protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+ {
+ if (matchVersion.onOrAfter(Version.LUCENE_36))
+ {
+ Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ // run the width filter before bigramming; it sometimes combines characters.
+ TokenStream result = new CJKWidthFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new CJKBigramFilter(result);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
+ }
+ else
+ {
+ Tokenizer source = new CJKTokenizer(reader);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
+ }
+ }
+ }
+
+}
\ No newline at end of file
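A hedged usage sketch for the analyzer above, assuming the raw port keeps the Java-style tokenStream/incrementToken surface, the Version.LUCENE_48 constant, and the attribute lookup style already used elsewhere in this commit:

    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_48);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("中華人民共和国"));
    CharTermAttribute term = (CharTermAttribute) ts.addAttribute(typeof(CharTermAttribute));
    ts.reset();
    while (ts.incrementToken())
    {
        Console.WriteLine(term); // Han bigrams: 中華, 華人, 人民, 民共, 共和, 和国
    }
    ts.end();
    ts.close();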
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
new file mode 100644
index 0000000..4ad6f5f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
@@ -0,0 +1,420 @@
+namespace org.apache.lucene.analysis.cjk
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+ using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+ using PositionLengthAttribute = org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+ using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+
+ /// <summary>
+ /// Forms bigrams of CJK terms that are generated from StandardTokenizer
+ /// or ICUTokenizer.
+ /// <para>
+ /// CJK types are set by these tokenizers, but you can also use
+ /// <seealso cref="#CJKBigramFilter(TokenStream, int)"/> to explicitly control which
+ /// of the CJK scripts are turned into bigrams.
+ /// </para>
+ /// <para>
+ /// By default, when a CJK character has no adjacent characters to form
+ /// a bigram, it is output in unigram form. If you want to always output
+ /// both unigrams and bigrams, set the <code>outputUnigrams</code>
+ /// flag in <seealso cref="CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)"/>.
+ /// This can be used for a combined unigram+bigram approach.
+ /// </para>
+ /// <para>
+ /// In all cases, all non-CJK input is passed through unmodified.
+ /// </para>
+ /// </summary>
+ public sealed class CJKBigramFilter : TokenFilter
+ {
+ // configuration
+ /// <summary>
+ /// bigram flag for Han Ideographs </summary>
+ public const int HAN = 1;
+ /// <summary>
+ /// bigram flag for Hiragana </summary>
+ public const int HIRAGANA = 2;
+ /// <summary>
+ /// bigram flag for Katakana </summary>
+ public const int KATAKANA = 4;
+ /// <summary>
+ /// bigram flag for Hangul </summary>
+ public const int HANGUL = 8;
+
+ /// <summary>
+ /// when we emit a bigram, it is then marked as this type </summary>
+ public const string DOUBLE_TYPE = "<DOUBLE>";
+ /// <summary>
+ /// when we emit a unigram, it is then marked as this type </summary>
+ public const string SINGLE_TYPE = "<SINGLE>";
+
+ // the token types from StandardTokenizer
+ private static readonly string HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+ private static readonly string HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+ private static readonly string KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+ private static readonly string HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
+
+ // sentinel value for ignoring a script
+ private static readonly object NO = new object();
+
+ // these are set to either their type or NO if we want to pass them thru
+ private readonly object doHan;
+ private readonly object doHiragana;
+ private readonly object doKatakana;
+ private readonly object doHangul;
+
+ // true if we should output unigram tokens always
+ private readonly bool outputUnigrams;
+ private bool ngramState; // false = output unigram, true = output bigram
+
+ private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+ private readonly TypeAttribute typeAtt = addAttribute(typeof(TypeAttribute));
+ private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+ private readonly PositionIncrementAttribute posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
+ private readonly PositionLengthAttribute posLengthAtt = addAttribute(typeof(PositionLengthAttribute));
+
+ // buffers containing codepoint and offsets in parallel
+ internal int[] buffer = new int[8];
+ internal int[] startOffset = new int[8];
+ internal int[] endOffset = new int[8];
+ // length of valid buffer
+ internal int bufferLen;
+ // current buffer index
+ internal int index;
+
+ // the last end offset, to determine if we should bigram across tokens
+ internal int lastEndOffset;
+
+ private bool exhausted;
+
+ /// <summary>
+ /// Calls <seealso cref="#CJKBigramFilter(TokenStream, int)"/> with
+ /// flags = HAN | HIRAGANA | KATAKANA | HANGUL.
+ /// </summary>
+ public CJKBigramFilter(TokenStream @in) : this(@in, HAN | HIRAGANA | KATAKANA | HANGUL)
+ {
+ }
+
+ /// <summary>
+ /// Calls <seealso cref="#CJKBigramFilter(TokenStream, int, boolean)"/> with
+ /// the given flags and outputUnigrams = false.
+ /// </summary>
+ public CJKBigramFilter(TokenStream @in, int flags) : this(@in, flags, false)
+ {
+ }
+
+ /// <summary>
+ /// Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
+ /// and whether or not unigrams should also be output. </summary>
+ /// <param name="flags"> OR'ed set from <seealso cref="CJKBigramFilter#HAN"/>, <seealso cref="CJKBigramFilter#HIRAGANA"/>,
+ /// <seealso cref="CJKBigramFilter#KATAKANA"/>, <seealso cref="CJKBigramFilter#HANGUL"/> </param>
+ /// <param name="outputUnigrams"> true if unigrams for the selected writing systems should also be output.
+ /// when this is false, this is only done when there are no adjacent characters to form
+ /// a bigram. </param>
+ public CJKBigramFilter(TokenStream @in, int flags, bool outputUnigrams) : base(@in)
+ {
+ doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
+ doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
+ doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
+ doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
+ this.outputUnigrams = outputUnigrams;
+ }
+
+ /*
+ * much of this complexity revolves around handling the special case of a
+ * "lone cjk character" where cjktokenizer would output a unigram. this
+ * is also the only time we ever have to captureState.
+ */
+ public override bool incrementToken()
+ {
+ while (true)
+ {
+ if (hasBufferedBigram())
+ {
+
+ // case 1: we have multiple remaining codepoints buffered,
+ // so we can emit a bigram here.
+
+ if (outputUnigrams)
+ {
+
+ // when also outputting unigrams, we output the unigram first,
+ // then rewind back to revisit the bigram.
+ // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
+ // the logic in hasBufferedUnigram ensures we output the C,
+ // even though it did actually have adjacent CJK characters.
+
+ if (ngramState)
+ {
+ flushBigram();
+ }
+ else
+ {
+ flushUnigram();
+ index--;
+ }
+ ngramState = !ngramState;
+ }
+ else
+ {
+ flushBigram();
+ }
+ return true;
+ }
+ else if (doNext())
+ {
+
+ // case 2: look at the token type. should we form any n-grams?
+
+ string type = typeAtt.type();
+ if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul)
+ {
+
+ // acceptable CJK type: we form n-grams from these.
+ // as long as the offsets are aligned, we just add these to our current buffer.
+ // otherwise, we clear the buffer and start over.
+
+ if (offsetAtt.startOffset() != lastEndOffset) // unaligned, clear queue
+ {
+ if (hasBufferedUnigram())
+ {
+
+ // we have a buffered unigram, and we peeked ahead to see if we could form
+ // a bigram, but we can't, because the offsets are unaligned. capture the state
+ // of this peeked data to be revisited next time through the loop, and dump our unigram.
+
+ loneState = captureState();
+ flushUnigram();
+ return true;
+ }
+ index = 0;
+ bufferLen = 0;
+ }
+ refill();
+ }
+ else
+ {
+
+ // not a CJK type: we just return these as-is.
+
+ if (hasBufferedUnigram())
+ {
+
+ // we have a buffered unigram, and we peeked ahead to see if we could form
+ // a bigram, but we can't, because it's not a CJK type. capture the state
+ // of this peeked data to be revisited next time through the loop, and dump our unigram.
+
+ loneState = captureState();
+ flushUnigram();
+ return true;
+ }
+ return true;
+ }
+ }
+ else
+ {
+
+ // case 3: we have only zero or 1 codepoints buffered,
+ // so not enough to form a bigram. But, we also have no
+ // more input. So if we have a buffered codepoint, emit
+ // a unigram; otherwise it's the end of the stream.
+
+ if (hasBufferedUnigram())
+ {
+ flushUnigram(); // flush our remaining unigram
+ return true;
+ }
+ return false;
+ }
+ }
+ }
+
+ private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams
+
+ /// <summary>
+ /// looks at the next input token, returning false if none is available
+ /// </summary>
+ private bool doNext()
+ {
+ if (loneState != null)
+ {
+ restoreState(loneState);
+ loneState = null;
+ return true;
+ }
+ else
+ {
+ if (exhausted)
+ {
+ return false;
+ }
+ else if (input.incrementToken())
+ {
+ return true;
+ }
+ else
+ {
+ exhausted = true;
+ return false;
+ }
+ }
+ }
+
+ /// <summary>
+ /// refills buffers with new data from the current token.
+ /// </summary>
+ private void refill()
+ {
+ // compact buffers to keep them smallish if they become large
+ // just a safety check, but technically we only need the last codepoint
+ if (bufferLen > 64)
+ {
+ int last = bufferLen - 1;
+ buffer[0] = buffer[last];
+ startOffset[0] = startOffset[last];
+ endOffset[0] = endOffset[last];
+ bufferLen = 1;
+ index -= last;
+ }
+
+ char[] termBuffer = termAtt.buffer();
+ int len = termAtt.length();
+ int start = offsetAtt.startOffset();
+ int end = offsetAtt.endOffset();
+
+ int newSize = bufferLen + len;
+ buffer = ArrayUtil.grow(buffer, newSize);
+ startOffset = ArrayUtil.grow(startOffset, newSize);
+ endOffset = ArrayUtil.grow(endOffset, newSize);
+ lastEndOffset = end;
+
+ if (end - start != len)
+ {
+ // crazy offsets (modified by synonym or charfilter): just preserve
+ // NOTE: Character here and below is assumed to be a support helper mirroring
+ // java.lang.Character (charCount/codePointAt/toChars); System.Char has no such statics.
+ for (int i = 0, cp = 0; i < len; i += Character.charCount(cp))
+ {
+ cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
+ startOffset[bufferLen] = start;
+ endOffset[bufferLen] = end;
+ bufferLen++;
+ }
+ }
+ else
+ {
+ // normal offsets
+ for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen)
+ {
+ cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
+ cpLen = Character.charCount(cp);
+ startOffset[bufferLen] = start;
+ start = endOffset[bufferLen] = start + cpLen;
+ bufferLen++;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Flushes a bigram token to output from our buffer
+ /// This is the normal case, e.g. ABC -> AB BC
+ /// </summary>
+ private void flushBigram()
+ {
+ clearAttributes();
+ char[] termBuffer = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
+ int len1 = Character.toChars(buffer[index], termBuffer, 0);
+ int len2 = len1 + Character.toChars(buffer[index + 1], termBuffer, len1);
+ termAtt.Length = len2;
+ offsetAtt.setOffset(startOffset[index], endOffset[index + 1]);
+ typeAtt.Type = DOUBLE_TYPE;
+ // when outputting unigrams, all bigrams are synonyms that span two unigrams
+ if (outputUnigrams)
+ {
+ posIncAtt.PositionIncrement = 0;
+ posLengthAtt.PositionLength = 2;
+ }
+ index++;
+ }
+
+ /// <summary>
+ /// Flushes a unigram token to output from our buffer.
+ /// This happens when we encounter isolated CJK characters, either the whole
+ /// CJK string is a single character, or we encounter a CJK character surrounded
+ /// by space, punctuation, English, etc., but not beside any other CJK.
+ /// </summary>
+ private void flushUnigram()
+ {
+ clearAttributes();
+ char[] termBuffer = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
+ int len = Character.toChars(buffer[index], termBuffer, 0);
+ termAtt.Length = len;
+ offsetAtt.setOffset(startOffset[index], endOffset[index]);
+ typeAtt.Type = SINGLE_TYPE;
+ index++;
+ }
+
+ /// <summary>
+ /// True if we have multiple codepoints sitting in our buffer
+ /// </summary>
+ private bool hasBufferedBigram()
+ {
+ return bufferLen - index > 1;
+ }
+
+ /// <summary>
+ /// True if we have a single codepoint sitting in our buffer, where its future
+ /// (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
+ /// inputs.
+ /// </summary>
+ private bool hasBufferedUnigram()
+ {
+ if (outputUnigrams)
+ {
+ // when outputting unigrams always
+ return bufferLen - index == 1;
+ }
+ else
+ {
+ // otherwise it's only when we have a lone CJK character
+ return bufferLen == 1 && index == 0;
+ }
+ }
+
+ public override void reset()
+ {
+ base.reset();
+ bufferLen = 0;
+ index = 0;
+ lastEndOffset = 0;
+ loneState = null;
+ exhausted = false;
+ ngramState = false;
+ }
+ }
+
+}
\ No newline at end of file
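To make the rewind logic in incrementToken concrete: with outputUnigrams = true, each bigram is emitted as a zero-position-increment token stacked on the unigram before it. A sketch under the same raw-ported names:

    Tokenizer tok = new StandardTokenizer(Version.LUCENE_48, reader);
    TokenStream ts = new CJKBigramFilter(tok, CJKBigramFilter.HAN, true);
    // for Han input 日本人 the stream is:
    //   日   <SINGLE> posInc=1
    //   日本 <DOUBLE> posInc=0, posLength=2 (a synonym spanning two unigrams)
    //   本   <SINGLE> posInc=1
    //   本人 <DOUBLE> posInc=0, posLength=2
    //   人   <SINGLE> posInc=1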
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
new file mode 100644
index 0000000..9783238
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
@@ -0,0 +1,79 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <seealso cref="CJKBigramFilter"/>.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_cjk" class="solr.TextField">
+ /// <analyzer>
+ /// <tokenizer class="solr.StandardTokenizerFactory"/>
+ /// <filter class="solr.CJKWidthFilterFactory"/>
+ /// <filter class="solr.LowerCaseFilterFactory"/>
+ /// <filter class="solr.CJKBigramFilterFactory"
+ /// han="true" hiragana="true"
+ /// katakana="true" hangul="true" outputUnigrams="false" />
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// </summary>
+ public class CJKBigramFilterFactory : TokenFilterFactory
+ {
+ internal readonly int flags;
+ internal readonly bool outputUnigrams;
+
+ /// <summary>
+ /// Creates a new CJKBigramFilterFactory </summary>
+ public CJKBigramFilterFactory(IDictionary<string, string> args) : base(args)
+ {
+ int flags = 0;
+ if (getBoolean(args, "han", true))
+ {
+ flags |= CJKBigramFilter.HAN;
+ }
+ if (getBoolean(args, "hiragana", true))
+ {
+ flags |= CJKBigramFilter.HIRAGANA;
+ }
+ if (getBoolean(args, "katakana", true))
+ {
+ flags |= CJKBigramFilter.KATAKANA;
+ }
+ if (getBoolean(args, "hangul", true))
+ {
+ flags |= CJKBigramFilter.HANGUL;
+ }
+ this.flags = flags;
+ this.outputUnigrams = getBoolean(args, "outputUnigrams", false);
+ if (args.Count > 0)
+ {
+ throw new System.ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override TokenStream create(TokenStream input)
+ {
+ return new CJKBigramFilter(input, flags, outputUnigrams);
+ }
+ }
+
+}
\ No newline at end of file
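A sketch of driving the factory directly; note the constructor consumes the keys it recognizes (getBoolean removes them, as in the Java TokenFilterFactory base, which this port is assumed to mirror), so any leftover key fails fast:

    var args = new Dictionary<string, string>
    {
        { "han", "true" },
        { "hiragana", "false" },
        { "outputUnigrams", "true" }
    };
    var factory = new CJKBigramFilterFactory(args); // args is emptied here
    // a misspelled key, e.g. { "hangui", "true" }, would survive the getBoolean
    // defaults and trigger the "Unknown parameters" ArgumentException above.
    TokenStream bigrams = factory.create(input);    // input is some upstream TokenStream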