You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2011/11/21 05:44:59 UTC
[Lucene.Net] svn commit: r1204353 [5/9] - in
/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src:
contrib/Analyzers/ contrib/Analyzers/AR/ contrib/Analyzers/BR/
contrib/Analyzers/CJK/ contrib/Analyzers/Cn/ contrib/Analyzers/Compound/
contrib/Analyzers/Compoun...
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemmer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemmer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemmer.cs Mon Nov 21 04:44:55 2011
@@ -24,783 +24,699 @@ using System.Text;
namespace Lucene.Net.Analysis.Fr
{
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// A stemmer for French words. The algorithm is based on the work of
- /// Dr Martin Porter on his snowball project<br/>
- /// refer to http://snowball.sourceforge.net/french/stemmer.html<br/>
- /// (French stemming algorithm) for details
- ///
- /// <author>Patrick Talbot (based on Gerhard Schwarz work for German)</author>
- /// <version>$Id: FrenchStemmer.java,v 1.2 2004/01/22 20:54:47 ehatcher Exp $</version>
- /// </summary>
- public class FrenchStemmer
- {
-
- /// <summary>
- /// Buffer for the terms while stemming them.
- /// </summary>
- private StringBuilder sb = new StringBuilder();
-
- /// <summary>
- /// A temporary buffer, used to reconstruct R2
- /// </summary>
- private StringBuilder tb = new StringBuilder();
-
- /// <summary>
- /// Region R0 is equal to the whole buffer
- /// </summary>
- private String R0;
-
- /// <summary>
- /// Region RV
- /// "If the word begins with two vowels, RV is the region after the third letter,
- /// otherwise the region after the first vowel not at the beginning of the word,
- /// or the end of the word if these positions cannot be found."
- /// </summary>
- private String RV;
-
- /// <summary>
- /// Region R1
- /// "R1 is the region after the first non-vowel following a vowel
- /// or is the null region at the end of the word if there is no such non-vowel"
- /// </summary>
- private String R1;
-
- /// <summary>
- /// Region R2
- /// "R2 is the region after the first non-vowel in R1 following a vowel
- /// or is the null region at the end of the word if there is no such non-vowel"
- /// </summary>
- private String R2;
-
-
- /// <summary>
- /// Set to true if we need to perform step 2
- /// </summary>
- private bool suite;
-
- /// <summary>
- /// Set to true if the buffer was modified
- /// </summary>
- private bool modified;
-
- /// <summary>
- /// Stemms the given term to a unique <tt>discriminator</tt>.
- /// </summary>
- /// <param name="term">
- /// java.langString The term that should be stemmed
- /// </param>
- /// <returns>
- /// Discriminator for <tt>term</tt>
- /// </returns>
- protected internal String Stem( String term )
- {
- if ( !IsStemmable( term ) )
- {
- return term;
- }
-
- // Use lowercase for medium stemming.
- term = term.ToLower();
-
- // Reset the StringBuilder.
- sb.Remove( 0, sb.Length );
- sb.Append( term );
-
- // reset the booleans
- modified = false;
- suite = false;
-
- sb = TreatVowels( sb );
-
- SetStrings();
-
- Step1();
-
- if (!modified || suite)
- {
- if (RV != null)
- {
- suite = Step2a();
- if (!suite)
- Step2b();
- }
- }
+
+/**
+ * A stemmer for French words.
+ * <p>
+ * The algorithm is based on the work of
+ * Dr Martin Porter on his snowball project<br>
+ * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
+ * (French stemming algorithm) for details
+ * </p>
+ */
+
+public class FrenchStemmer {
+
+ /**
+ * Working buffer holding the term that is currently being stemmed.
+ * Stem() empties and refills it for every term; the step methods and
+ * suffix helpers then edit it in place.
+ */
+ private StringBuilder sb = new StringBuilder();
+
+ /**
+ * A temporary buffer, used to reconstruct R2:
+ * SetStrings() loads it with R1 and hands it to RetrieveR().
+ */
+ private StringBuilder tb = new StringBuilder();
- if (modified || suite)
- Step3();
- else
- Step4();
+ /**
+ * Region R0 is equal to the whole buffer.
+ * SetStrings() refreshes it from sb.ToString().
+ */
+ private String R0;
- Step5();
+ /**
+ * Region RV
+ * "If the word begins with two vowels, RV is the region after the third letter,
+ * otherwise the region after the first vowel not at the beginning of the word,
+ * or the end of the word if these positions cannot be found."
+ * SetStrings() assigns it from RetrieveRV( sb ); Stem() skips step 2 when it is null.
+ */
+ private String RV;
- Step6();
+ /**
+ * Region R1
+ * "R1 is the region after the first non-vowel following a vowel
+ * or is the null region at the end of the word if there is no such non-vowel"
+ * SetStrings() assigns it from RetrieveR( sb ).
+ */
+ private String R1;
- return sb.ToString();
- }
+ /**
+ * Region R2
+ * "R2 is the region after the first non-vowel in R1 following a vowel
+ * or is the null region at the end of the word if there is no such non-vowel"
+ * SetStrings() derives it by running RetrieveR() over R1, and sets it to null when R1 is null.
+ */
+ private String R2;
- /// <summary>
- /// Sets the search region Strings<br/>
- /// it needs to be done each time the buffer was modified
- /// </summary>
- private void SetStrings()
- {
- // set the strings
- R0 = sb.ToString();
- RV = RetrieveRV( sb );
- R1 = RetrieveR( sb );
- if ( R1 != null )
- {
- tb.Remove( 0, tb.Length );
- tb.Append( R1 );
- R2 = RetrieveR( tb );
- }
- else
- R2 = null;
- }
- /// <summary>
- /// First step of the Porter Algorithmn<br/>
- /// refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
- /// </summary>
- private void Step1( )
- {
- String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
- DeleteFrom( R2, suffix );
+ /**
+ * Set to true if we need to perform step 2.
+ * Stem() clears it for every term, Step1() raises it when one of its
+ * amment / emment / ments / ment rules fires, and Stem() later overwrites
+ * it with the result of Step2A().
+ */
+ private bool suite;
- ReplaceFrom( R2, new String[] { "logies", "logie" }, "log" );
- ReplaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
- ReplaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
+ /**
+ * Set to true if the buffer was modified.
+ * Stem() clears it for every term; suffix helpers such as DeleteFrom()
+ * and ReplaceFrom() raise it when they change sb.
+ */
+ private bool modified;
- String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
- DeleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );
- DeleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
- DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
- DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
- DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
- DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
+ /**
+ * Stems the given term to a unique <tt>discriminator</tt>.
+ * <p>
+ * A term rejected by IsStemmable() is returned unchanged. Otherwise the term is
+ * lower-cased, loaded into the working buffer and passed through TreatVowels(),
+ * SetStrings() and Step1(). Step 2 (2A, then 2B if 2A changed nothing) runs only
+ * when step 1 left the buffer unmodified or raised the suite flag, and only when
+ * RV is not null. Step3() runs if anything was modified, otherwise Step4();
+ * Step5() and Step6() always run.
+ * </p>
+ *
+ * @param term The term that should be stemmed
+ * @return Discriminator for <tt>term</tt> (the content of the working buffer)
+ */
+ protected internal String Stem( String term ) {
+ if ( !IsStemmable( term ) ) {
+ return term;
+ }
- DeleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
- DeleteFrom( RV, new String[] { "ements", "ement" } );
+ // Use lowercase for medium stemming. NOTE(review): ToLower() is culture-sensitive - confirm the current culture is intended here.
+ term = term.ToLower();
- DeleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
- DeleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
- DeleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
+ // Reset the StringBuilder: empty it (start 0, length = whole buffer), then load the new term.
+ sb.Remove( 0, sb.Length );
+ sb.Insert( 0, term );
- String[] autre = { "ifs", "ives", "if", "ive" };
- DeleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
- DeleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
+ // reset the per-term flags
+ modified = false;
+ suite = false;
- ReplaceFrom( R0, new String[] { "eaux" }, "eau" );
+ sb = TreatVowels( sb );
- ReplaceFrom( R1, new String[] { "aux" }, "al" );
+ SetStrings();
- DeleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
+ Step1();
- DeleteFrom( R2, new String[] { "eux" } );
+ if (!modified || suite)
+ {
+ if (RV != null)
+ {
+ suite = Step2A();
+ if (!suite)
+ Step2B();
+ }
+ }
- // if one of the next steps is performed, we will need to perform step2a
- bool temp = false;
- temp = ReplaceFrom( RV, new String[] { "amment" }, "ant" );
- if (temp == true)
- suite = true;
- temp = ReplaceFrom( RV, new String[] { "emment" }, "ent" );
- if (temp == true)
- suite = true;
- temp = DeleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
- if (temp == true)
- suite = true;
+ if (modified || suite)
+ Step3();
+ else
+ Step4();
- }
+ Step5();
- /// <summary>
- /// Second step (A) of the Porter Algorithmn<br/>
- /// Will be performed if nothing changed from the first step
- /// or changed were done in the amment, emment, ments or ment suffixes<br/>
- /// refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
- /// </summary>
- /// <returns>
- /// true if something changed in the StringBuilder
- /// </returns>
- private bool Step2a()
- {
- String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
- "irent", "iriez", "irez", "irions", "irons", "iront",
- "issaIent", "issais", "issantes", "issante", "issants", "issant",
- "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
- "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
- return DeleteFromIfTestVowelBeforeIn( RV, search, false, RV );
+ Step6();
+
+ return sb.ToString();
+ }
+
+ /**
+ * Sets the search region Strings (R0, RV, R1 and R2) from the working buffer<br>
+ * it needs to be done each time the buffer was modified
+ */
+ private void SetStrings() {
+ // R0 is the whole buffer; RV and R1 are computed directly from it
+ R0 = sb.ToString();
+ RV = RetrieveRV( sb );
+ R1 = RetrieveR( sb );
+ if ( R1 != null )
+ {
+ tb.Remove( 0, tb.Length ); // empty the scratch buffer (start 0, length = whole buffer)
+ tb.Insert( 0, R1 );
+ R2 = RetrieveR( tb ); // R2 is the "R" region of R1
}
+ else
+ R2 = null;
+ }
- /// <summary>
- /// Second step (B) of the Porter Algorithmn<br/>
- /// Will be performed if step 2 A was performed unsuccessfully<br/>
- /// refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
- /// </summary>
- private void Step2b()
- {
- String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
- "erons", "eront","erez", "èrent", "era", "ées", "iez",
- "ée", "és", "er", "ez", "é" };
- DeleteFrom( RV, suffix );
+ /**
+ * First step of the Porter Algorithm: standard suffix removal<br>
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+ * <p>
+ * The rules are applied one after another. Each helper refreshes the regions
+ * through SetStrings() when it changes the buffer, so the R0/R1/R2/RV values
+ * passed to a later call already reflect earlier edits. The last three rules
+ * (amment, emment, ments/ment) additionally raise the suite flag.
+ * </p>
+ */
+ private void Step1( ) {
+ String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
+ DeleteFrom( R2, suffix );
+
+ ReplaceFrom( R2, new String[] { "logies", "logie" }, "log" );
+ ReplaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
+ ReplaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
+
+ String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
+ DeleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );
+
+ DeleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
+ DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
+ DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
+ DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
+ DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
+
+ DeleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
+ DeleteFrom( RV, new String[] { "ements", "ement" } );
+
+ DeleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
+ DeleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
+ DeleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
+
+ String[] autre = { "ifs", "ives", "if", "ive" };
+ DeleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
+ DeleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
+
+ ReplaceFrom( R0, new String[] { "eaux" }, "eau" );
+
+ ReplaceFrom( R1, new String[] { "aux" }, "al" );
+
+ DeleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
+
+ DeleteFrom( R2, new String[] { "eux" } );
+
+ // if one of the next rules fires, Stem() must still run step 2a, so remember it in suite
+ bool temp = false;
+ temp = ReplaceFrom( RV, new String[] { "amment" }, "ant" );
+ if (temp == true)
+ suite = true;
+ temp = ReplaceFrom( RV, new String[] { "emment" }, "ent" );
+ if (temp == true)
+ suite = true;
+ temp = DeleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
+ if (temp == true)
+ suite = true;
- String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
- "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
- "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
- DeleteButSuffixFrom( RV, search, "e", true );
+ }
- DeleteFrom( R2, new String[] { "ions" } );
- }
+    /**
+     * Step 2 (A) of the Porter Algorithm: removal of verb endings that start with "i"<br>
+     * Stem() runs it when step 1 changed nothing, or when step 1 handled an
+     * amment, emment, ments or ment suffix<br>
+     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+     *
+     * @return bool - true if a suffix was removed from the StringBuilder
+     */
+    private bool Step2A() {
+        // Order matters: the helper stops at the first entry it manages to remove.
+        String[] iEndings = {
+            "îmes", "îtes",
+            "iraIent", "irait", "irais", "irai", "iras", "ira",
+            "irent", "iriez", "irez", "irions", "irons", "iront",
+            "issaIent", "issais", "issantes", "issante", "issants", "issant",
+            "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
+            "isses", "isse",
+            "ir", "is", "ît", "it", "ies", "ie", "i"
+        };
+
+        // vowel == false: an ending is removed only when the letter before it is not a vowel.
+        bool removed = DeleteFromIfTestVowelBeforeIn( RV, iEndings, false, RV );
+        return removed;
+    }
+
+    /**
+     * Step 2 (B) of the Porter Algorithm: removal of the remaining verb endings<br>
+     * Stem() runs it when step 2 A removed nothing<br>
+     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+     */
+    private void Step2B() {
+        // First list: deleted outright when RV ends with one of these.
+        String[] plainEndings = {
+            "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
+            "erons", "eront", "erez", "èrent", "era", "ées", "iez",
+            "ée", "és", "er", "ez", "é"
+        };
+        DeleteFrom( RV, plainEndings );
+
+        // Second list: deleted together with a preceding "e" when there is one,
+        // otherwise deleted on their own (without == true).
+        String[] aEndings = {
+            "assions", "assiez", "assent", "asses", "asse", "aIent",
+            "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
+            "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a"
+        };
+        DeleteButSuffixFrom( RV, aEndings, "e", true );
+
+        // Finally "ions", looked up in R2 instead of RV.
+        String[] ionsEnding = { "ions" };
+        DeleteFrom( R2, ionsEnding );
+    }
- /// <summary>
- /// Third step of the Porter Algorithmn<br/>
- /// refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
- /// </summary>
- private void Step3()
+ /**
+ * Third step of the Porter Algorithm<br>
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+ * <p>
+ * Looks only at the last character of the buffer: a final 'Y' becomes 'i',
+ * a final 'ç' becomes 'c'. The regions are recomputed after either change.
+ * </p>
+ */
+ private void Step3() {
+ if (sb.Length>0)
{
- if (sb.Length>0)
+ char ch = sb[ sb.Length -1];
+ if (ch == 'Y')
{
- char ch = sb[ sb.Length-1];
- if (ch == 'Y')
- {
- sb[ sb.Length-1] = 'i';
- SetStrings();
- }
- else if (ch == 'ç')
- {
- sb[ sb.Length-1] = 'c';
- SetStrings();
- }
+ sb[sb.Length -1] = 'i' ;
+ SetStrings();
}
- }
-
- /// <summary>
- /// Fourth step of the Porter Algorithmn<br/>
- /// refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
- /// </summary>
- private void Step4()
- {
- if (sb.Length > 1)
+ else if (ch == 'ç')
{
- char ch = sb[sb.Length-1];
- if (ch == 's')
- {
- char b = sb[ sb.Length-2 ];
- if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
- {
- sb.Remove( sb.Length - 1, 1);
- SetStrings();
- }
- }
+ sb[sb.Length -1] = 'c';
+ SetStrings();
}
- bool found = DeleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
- if (!found)
- found = DeleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
-
- ReplaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
- DeleteFrom( RV, new String[] { "e" } );
- DeleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
}
+ }
- /// <summary>
- /// Fifth step of the Porter Algorithmn<br/>
- /// refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
- /// </summary>
- private void Step5()
+ /**
+ * Fourth step of the Porter Algorithm: residual suffixes<br>
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+ * <p>
+ * Drops a final 's' unless the letter before it is a, i, o, u, è or s. Then
+ * removes "ion" found in R2 when RV ends with "sion" or "tion", replaces
+ * Ière / ière / Ier / ier in RV with "i", deletes a final "e" in RV, and
+ * deletes a final "ë" in RV when R0 ends with "guë".
+ * </p>
+ */
+ private void Step4() {
+ if (sb.Length > 1)
{
- if (R0 != null)
+ char ch = sb[ sb.Length -1];
+ if (ch == 's')
{
- if (R0.EndsWith("enn") || R0.EndsWith("onn") || R0.EndsWith("ett") || R0.EndsWith("ell") || R0.EndsWith("eill"))
+ char b = sb[ sb.Length -2];
+ if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
{
- sb.Remove( sb.Length - 1, 1);
+ sb.Remove( sb.Length - 1, 1 ); // Remove takes (startIndex, length); passing sb.Length as the length throws ArgumentOutOfRangeException
SetStrings();
}
}
}
+ bool found = DeleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
+ if (!found)
+ found = DeleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
+
+ ReplaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
+ DeleteFrom( RV, new String[] { "e" } );
+ DeleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
+ }
- /// <summary>
- /// Sixth (and last!) step of the Porter Algorithmn<br/>
- /// refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
- /// </summary>
- private void Step6()
- {
- if (R0!=null && R0.Length>0)
- {
- bool seenVowel = false;
- bool seenConson = false;
- int pos = -1;
- for (int i = R0.Length-1; i > -1; i--)
- {
- char ch = R0[i];
- if (IsVowel(ch))
- {
- if (!seenVowel)
- {
- if (ch == 'é' || ch == 'è')
- {
- pos = i;
- break;
- }
- }
- seenVowel = true;
- }
- else
- {
- if (seenVowel)
- break;
- else
- seenConson = true;
- }
- }
- if (pos > -1 && seenConson && !seenVowel)
- sb[pos] = 'e';
+    /**
+     * Fifth step of the Porter Algorithm: un-doubling<br>
+     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+     * <p>
+     * When R0 ends with enn, onn, ett, ell or eill, the last letter of the
+     * buffer is removed and the regions are recomputed.
+     * </p>
+     */
+    private void Step5() {
+        if (R0 == null)
+        {
+            return;
+        }
+
+        if (R0.EndsWith("enn") || R0.EndsWith("onn") || R0.EndsWith("ett") || R0.EndsWith("ell") || R0.EndsWith("eill"))
+        {
+            // StringBuilder.Remove takes (startIndex, length). The former second
+            // argument, sb.Length, was a leftover of Java's delete(start, end)
+            // and made this call throw ArgumentOutOfRangeException.
+            sb.Remove( sb.Length - 1, 1 );
+            SetStrings();
+        }
+    }
- /// <summary>
- /// Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
- /// </summary>
- /// <param name="source">the primary source zone for search</param>
- /// <param name="search">the strings to search for suppression</param>
- /// <param name="from">the secondary source zone for search</param>
- /// <param name="prefix">the prefix to add to the search string to test</param>
- /// <returns>true if modified</returns>
- private bool DeleteFromIfPrecededIn( String source, String[] search, String from, String prefix )
+ /**
+ * Sixth (and last!) step of the Porter Algorithm: un-accenting<br>
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+ * <p>
+ * Walks R0 backwards from its last character. If the first vowel met is
+ * 'é' or 'è' and at least one non-vowel was seen after it, that character
+ * is replaced by 'e' in the buffer. The regions are not recomputed afterwards.
+ * </p>
+ */
+ private void Step6() {
+ if (R0!=null && R0.Length>0)
{
- bool found = false;
- if (source!=null )
+ bool seenVowel = false;
+ bool seenConson = false;
+ int pos = -1;
+ for (int i = R0.Length-1; i > -1; i--)
{
- for (int i = 0; i < search.Length; i++)
+ char ch = R0[i] ;
+ if (IsVowel(ch))
{
- if ( source.EndsWith( search[i] ))
+ if (!seenVowel)
{
- if (from!=null && from.EndsWith( prefix + search[i] ))
+ if (ch == 'é' || ch == 'è')
{
- sb.Remove( sb.Length - search[i].Length, search[i].Length);
- found = true;
- SetStrings();
+ pos = i;
break;
}
}
+ seenVowel = true;
}
- }
- return found;
- }
-
- /// <summary>
- /// Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
- /// </summary>
- /// <param name="source">the primary source zone for search</param>
- /// <param name="search">the strings to search for suppression</param>
- /// <param name="vowel">true if we need a vowel before the search string</param>
- /// <param name="from">the secondary source zone for search (where vowel could be)</param>
- /// <returns>true if modified</returns>
- private bool DeleteFromIfTestVowelBeforeIn( String source, String[] search, bool vowel, String from )
- {
- bool found = false;
- if (source!=null && from!=null)
- {
- for (int i = 0; i < search.Length; i++)
+ else
{
- if ( source.EndsWith( search[i] ))
- {
- if ((search[i].Length + 1) <= from.Length)
- {
- bool test = IsVowel(sb[sb.Length-(search[i].Length+1)]);
- if (test == vowel)
- {
- sb.Remove( sb.Length - search[i].Length, search[i].Length);
- modified = true;
- found = true;
- SetStrings();
- break;
- }
- }
- }
+ if (seenVowel)
+ break;
+ else
+ seenConson = true;
}
}
- return found;
+ if (pos > -1 && seenConson && !seenVowel)
+ sb[pos] = 'e';
}
+ }
- /// <summary>
- /// Delete a suffix searched in zone "source" if preceded by the prefix
- /// </summary>
- /// <param name="source">the primary source zone for search</param>
- /// <param name="search">the strings to search for suppression</param>
- /// <param name="prefix">the prefix to add to the search string to test</param>
- /// <param name="without">true if it will be deleted even without prefix found</param>
- private void DeleteButSuffixFrom( String source, String[] search, String prefix, bool without )
+ /**
+ * Delete a suffix searched in zone "source" if zone "from" ends with prefix + search string.
+ * Only the suffix itself is removed from the buffer; the prefix stays. The search
+ * stops at the first suffix that is removed. Unlike the other helpers, this one
+ * does not raise the modified flag.
+ *
+ * @param source the primary source zone for search (nothing happens when null)
+ * @param search the strings to search for suppression, tried in order
+ * @param from the secondary source zone, which must end with prefix + suffix
+ * @param prefix the prefix to add to the search string to test
+ * @return bool - true if a suffix was removed
+ */
+ private bool DeleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
+ bool found = false;
+ if (source!=null )
{
- if (source!=null)
- {
- for (int i = 0; i < search.Length; i++)
+ for (int i = 0; i < search.Length; i++) {
+ if ( source.EndsWith( search[i] ))
{
- if ( source.EndsWith( prefix + search[i] ))
- {
- sb.Remove( sb.Length - (prefix.Length + search[i].Length), prefix.Length + search[i].Length);
- modified = true;
- SetStrings();
- break;
- }
- else if ( without && source.EndsWith( search[i] ))
+ if (from!=null && from.EndsWith( prefix + search[i] ))
{
- sb.Remove( sb.Length - search[i].Length, search[i].Length);
- modified = true;
+ sb.Remove( sb.Length - search[i].Length, search[i].Length ); // Remove takes (startIndex, length), not (start, end)
+ found = true;
SetStrings();
break;
}
}
}
}
+ return found;
+ }
- /// <summary>
- /// Delete a suffix searched in zone "source" if preceded by prefix<br/>
- /// or replace it with the replace string if preceded by the prefix in the zone "from"<br/>
- /// or delete the suffix if specified
- /// </summary>
- /// <param name="source">the primary source zone for search</param>
- /// <param name="search">the strings to search for suppression</param>
- /// <param name="prefix">the prefix to add to the search string to test</param>
- /// <param name="without">true if it will be deleted even without prefix found</param>
- private void DeleteButSuffixFromElseReplace( String source, String[] search, String prefix, bool without, String from, String replace )
+ /**
+ * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel.
+ * The letter tested is the buffer character just before the suffix, and the test
+ * is made only when "from" is long enough to hold the suffix plus one more letter.
+ * The search stops at the first suffix that is removed.
+ *
+ * @param source the primary source zone for search
+ * @param search the strings to search for suppression, tried in order
+ * @param vowel true if we need a vowel before the search string, false if we need a non-vowel
+ * @param from the secondary source zone for search (where vowel could be)
+ * @return bool - true if a suffix was removed
+ */
+ private bool DeleteFromIfTestVowelBeforeIn( String source, String[] search, bool vowel, String from ) {
+ bool found = false;
+ if (source!=null && from!=null)
{
- if (source!=null)
- {
- for (int i = 0; i < search.Length; i++)
+ for (int i = 0; i < search.Length; i++) {
+ if ( source.EndsWith( search[i] ))
{
- if ( source.EndsWith( prefix + search[i] ))
+ if ((search[i].Length + 1) <= from.Length)
{
- sb.Remove( sb.Length - (prefix.Length + search[i].Length), prefix.Length + search[i].Length);
- modified = true;
- SetStrings();
- break;
- }
- else if ( from!=null && from.EndsWith( prefix + search[i] ))
- {
- sb.Remove(sb.Length - (prefix.Length + search[i].Length), prefix.Length + search[i].Length);
- sb.Append( replace );
- modified = true;
- SetStrings();
- break;
- }
- else if ( without && source.EndsWith( search[i] ))
- {
- sb.Remove( sb.Length - search[i].Length, search[i].Length );
- modified = true;
- SetStrings();
- break;
+ bool test = IsVowel(sb[sb.Length -(search[i].Length+1)]);
+ if (test == vowel)
+ {
+ sb.Remove( sb.Length - search[i].Length, search[i].Length ); // Remove takes (startIndex, length), not (start, end)
+ modified = true;
+ found = true;
+ SetStrings();
+ break;
+ }
}
}
}
}
+ return found;
+ }
- /// <summary>
- /// Replace a search string with another within the source zone
- /// </summary>
- /// <param name="source">the source zone for search</param>
- /// <param name="search">the strings to search for replacement</param>
- /// <param name="replace">the replacement string</param>
- /// <returns></returns>
- private bool ReplaceFrom( String source, String[] search, String replace )
+ /**
+ * Delete a suffix searched in zone "source" if preceded by the prefix.
+ * When "source" ends with prefix + suffix, both are removed from the buffer.
+ * Otherwise, if "without" is true and "source" ends with the bare suffix, only
+ * the suffix is removed. The search stops at the first removal.
+ *
+ * @param source the primary source zone for search (nothing happens when null)
+ * @param search the strings to search for suppression, tried in order
+ * @param prefix the prefix to add to the search string to test
+ * @param without true if it will be deleted even without prefix found
+ */
+ private void DeleteButSuffixFrom( String source, String[] search, String prefix, bool without ) {
+ if (source!=null)
{
- bool found = false;
- if (source!=null)
- {
- for (int i = 0; i < search.Length; i++)
+ for (int i = 0; i < search.Length; i++) {
+ if ( source.EndsWith( prefix + search[i] ))
{
- if ( source.EndsWith( search[i] ))
- {
- sb.Remove(sb.Length - search[i].Length, search[i].Length);
- sb.Append( replace );
- modified = true;
- found = true;
- SetStrings();
- break;
- }
+ sb.Remove( sb.Length - (prefix.Length + search[i].Length), prefix.Length + search[i].Length ); // Remove takes (startIndex, length), not (start, end)
+ modified = true;
+ SetStrings();
+ break;
+ }
+ else if ( without && source.EndsWith( search[i] ))
+ {
+ sb.Remove( sb.Length - search[i].Length, search[i].Length ); // length of the suffix, not the buffer length
+ modified = true;
+ SetStrings();
+ break;
}
}
- return found;
}
+ }
- /// <summary>
- /// Delete a search string within the source zone
- /// </summary>
- /// <param name="source">the source zone for search</param>
- /// <param name="suffix">the strings to search for suppression</param>
- private void DeleteFrom(String source, String[] suffix )
+ /**
+ * Delete a suffix searched in zone "source" if preceded by prefix<br>
+ * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
+ * or delete the suffix if specified
+ * <p>
+ * For each suffix, in order: if "source" ends with prefix + suffix, both are removed;
+ * else if "from" ends with prefix + suffix, both are removed and "replace" is appended;
+ * else if "without" is true and "source" ends with the bare suffix, the suffix is removed.
+ * The search stops at the first change.
+ * </p>
+ *
+ * @param source the primary source zone for search (nothing happens when null)
+ * @param search the strings to search for suppression, tried in order
+ * @param prefix the prefix to add to the search string to test
+ * @param without true if it will be deleted even without prefix found
+ * @param from the secondary source zone, used for the replace case (may be null)
+ * @param replace the string appended in place of prefix + suffix in the replace case
+ */
+ private void DeleteButSuffixFromElseReplace( String source, String[] search, String prefix, bool without, String from, String replace ) {
+ if (source!=null)
{
- if (source!=null)
- {
- for (int i = 0; i < suffix.Length; i++)
+ for (int i = 0; i < search.Length; i++) {
+ if ( source.EndsWith( prefix + search[i] ))
{
- if (source.EndsWith( suffix[i] ))
- {
- sb.Remove( sb.Length - suffix[i].Length, suffix[i].Length);
- modified = true;
- SetStrings();
- break;
- }
+ sb.Remove( sb.Length - (prefix.Length + search[i].Length), prefix.Length + search[i].Length ); // Remove takes (startIndex, length), not (start, end)
+ modified = true;
+ SetStrings();
+ break;
+ }
+ else if ( from!=null && from.EndsWith( prefix + search[i] ))
+ {
+ sb.Remove( sb.Length - (prefix.Length + search[i].Length), prefix.Length + search[i].Length ); // cut prefix + suffix from the end
+ sb.Append( replace ); // the tail is gone, so the replacement goes at the end (Insert at the old offset was wrong)
+ modified = true;
+ SetStrings();
+ break;
+ }
+ else if ( without && source.EndsWith( search[i] ))
+ {
+ sb.Remove( sb.Length - search[i].Length, search[i].Length ); // length of the suffix, not the buffer length
+ modified = true;
+ SetStrings();
+ break;
}
}
}
+ }
- /// <summary>
- /// Test if a char is a french vowel, including accentuated ones
- /// </summary>
- /// <param name="ch">the char to test</param>
- /// <returns>true if the char is a vowel</returns>
- private bool IsVowel(char ch)
- {
- switch (ch)
- {
- case 'a':
- case 'e':
- case 'i':
- case 'o':
- case 'u':
- case 'y':
- case 'â':
- case 'Ã ':
- case 'ë':
- case 'é':
- case 'ê':
- case 'è':
- case 'ï':
- case 'î':
- case 'ô':
- case 'ü':
- case 'ù':
- case 'û':
- return true;
- default:
- return false;
+    /**
+     * Replace a search string with another within the source zone.
+     * The first entry of "search" that "source" ends with is cut from the end
+     * of the buffer and "replace" is appended in its place.
+     *
+     * @param source the source zone for search (nothing happens when null)
+     * @param search the strings to search for replacement, tried in order
+     * @param replace the replacement string
+     * @return bool - true if a replacement was made
+     */
+    private bool ReplaceFrom( String source, String[] search, String replace ) {
+        bool found = false;
+        if (source != null)
+        {
+            for (int i = 0; i < search.Length; i++)
+            {
+                if (source.EndsWith( search[i] ))
+                {
+                    // Equivalent of Java's sb.replace(start, sb.length(), replace):
+                    // StringBuilder.Remove takes (startIndex, length), and once the
+                    // tail is gone the replacement belongs at the end of the buffer.
+                    sb.Remove( sb.Length - search[i].Length, search[i].Length );
+                    sb.Append( replace );
+                    modified = true;
+                    found = true;
+                    SetStrings();
+                    break;
+                }
+            }
+        }
+        return found;
+    }
- /// <summary>
- /// Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br/>
- /// "R is the region after the first non-vowel following a vowel
- /// or is the null region at the end of the word if there is no such non-vowel"<br/>
- /// </summary>
- /// <param name="buffer">the in buffer</param>
- /// <returns>the resulting string</returns>
- private String RetrieveR( StringBuilder buffer )
+ /**
+ * Deletes a suffix from the end of the working buffer when the source zone ends with it.
+ * Only the first matching entry of <code>suffix</code> is removed; on a match
+ * <code>modified</code> is set and SetStrings() is called.
+ *
+ * @param source the source zone to test; when null nothing happens
+ * @param suffix the candidate suffixes, tried in array order
+ */
+ private void DeleteFrom(String source, String[] suffix ) {
+ if (source!=null)
{
- int len = buffer.Length;
- int pos = -1;
- for (int c = 0; c < len; c++)
- {
- if (IsVowel( buffer[c]))
+ for (int i = 0; i < suffix.Length; i++) {
+ if (source.EndsWith( suffix[i] ))
{
- pos = c;
+ // StringBuilder.Remove takes (startIndex, length): remove exactly the suffix length.
+ sb.Remove( sb.Length - suffix[i].Length, suffix[i].Length);
+ modified = true;
+ SetStrings();
break;
}
}
- if (pos > -1)
+ }
+ }
+
+
+ /**
+ * Tests whether a char is a lower-case French vowel, accented forms included.
+ * Upper-case letters are not matched by any case label and therefore yield false.
+ *
+ * @param ch the char to test
+ * @return true if the char is one of the listed vowels, false otherwise
+ */
+ private bool IsVowel(char ch) {
+ switch (ch)
+ {
+ case 'a':
+ case 'e':
+ case 'i':
+ case 'o':
+ case 'u':
+ case 'y':
+ case 'â':
+ case 'à':
+ case 'ë':
+ case 'é':
+ case 'ê':
+ case 'è':
+ case 'ï':
+ case 'î':
+ case 'ô':
+ case 'ü':
+ case 'ù':
+ case 'û':
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /**
+ * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string.
+ * "R is the region after the first non-vowel following a vowel
+ * or is the null region at the end of the word if there is no such non-vowel"
+ *
+ * @param buffer the buffer to scan
+ * @return the text after the first non-vowel that follows the first vowel, or null when the
+ * buffer has no vowel, no non-vowel after it, or nothing after that non-vowel
+ */
+ private String RetrieveR( StringBuilder buffer ) {
+ int len = buffer.Length;
+ int pos = -1;
+ for (int c = 0; c < len; c++) {
+ if (IsVowel( buffer[ c ] ))
{
- int consonne = -1;
- for (int c = pos; c < len; c++)
+ pos = c;
+ break;
+ }
+ }
+ if (pos > -1)
+ {
+ int consonne = -1;
+ for (int c = pos; c < len; c++) {
+ if (!IsVowel(buffer[ c ] ))
{
- if (!IsVowel(buffer[c]))
- {
- consonne = c;
- break;
- }
+ consonne = c;
+ break;
}
- if (consonne > -1 && (consonne+1) < len)
- return buffer.ToString().Substring( consonne+1, len - (consonne+1) );
- else
- return null;
}
+ if (consonne > -1 && (consonne+1) < len)
+ // String.Substring takes (startIndex, length), so pass the remaining character count.
+ return buffer.ToString().Substring( consonne+1, len - (consonne+1) );
else
return null;
}
+ else
+ return null;
+ }
- /// <summary>
- /// Retrieve the "RV zone" from a buffer an return the corresponding string<br/>
- /// "If the word begins with two vowels, RV is the region after the third letter,
- /// otherwise the region after the first vowel not at the beginning of the word,
- /// or the end of the word if these positions cannot be found."<br/>
- /// </summary>
- /// <param name="buffer">the in buffer</param>
- /// <returns>the resulting string</returns>
- private String RetrieveRV( StringBuilder buffer )
+ /**
+ * Retrieve the "RV zone" from a buffer and return the corresponding string.
+ * "If the word begins with two vowels, RV is the region after the third letter,
+ * otherwise the region after the first vowel not at the beginning of the word,
+ * or the end of the word if these positions cannot be found."
+ * When no vowel is found after the first character the search position stays at 0,
+ * so the text after the first character is returned.
+ *
+ * @param buffer the buffer to scan
+ * @return the RV text, or null when the buffer has 3 characters or fewer or nothing follows the chosen position
+ */
+ private String RetrieveRV( StringBuilder buffer ) {
+ int len = buffer.Length;
+ if ( buffer.Length > 3)
{
- int len = buffer.Length;
- if ( buffer.Length > 3)
+ if ( IsVowel(buffer[ 0 ] ) && IsVowel(buffer[ 1 ] )) {
+ // String.Substring takes (startIndex, length), so pass the remaining character count.
+ return buffer.ToString().Substring(3,len-3);
+ }
+ else
{
- if ( IsVowel(buffer[0]) && IsVowel(buffer[1]))
- {
- return buffer.ToString().Substring(3,len-3);
- }
- else
- {
- int pos = 0;
- for (int c = 1; c < len; c++)
+ int pos = 0;
+ for (int c = 1; c < len; c++) {
+ if (IsVowel( buffer[ c ] ))
{
- if (IsVowel( buffer[c]))
- {
- pos = c;
- break;
- }
+ pos = c;
+ break;
}
- if ( pos+1 < len )
- return buffer.ToString().Substring( pos+1, len - (pos+1));
- else
- return null;
}
+ if ( pos+1 < len )
+ // Same (startIndex, length) contract as above.
+ return buffer.ToString().Substring(pos+1, len - (pos+1) );
+ else
+ return null;
}
- else
- return null;
}
+ else
+ return null;
+ }
- /// <summary>
- /// Turns u and i preceded AND followed by a vowel to UpperCase<br/>
- /// Turns y preceded OR followed by a vowel to UpperCase<br/>
- /// Turns u preceded by q to UpperCase<br/>
- /// </summary>
- /// <param name="buffer">the buffer to treat</param>
- /// <returns>the treated buffer</returns>
- private StringBuilder TreatVowels( StringBuilder buffer )
- {
- for ( int c = 0; c < buffer.Length; c++ )
- {
- char ch = buffer[c];
- if (c == 0) // first char
+ /**
+ * Upper-cases, in place, the vowels of the buffer that must not be treated as vowels:
+ * u and i preceded AND followed by a vowel become U and I,
+ * y preceded OR followed by a vowel becomes Y,
+ * u preceded by q becomes U.
+ * The first character is only examined for 'y' followed by a vowel; the last character
+ * only for 'u' after 'q' and 'y' after a vowel.
+ *
+ * @param buffer the buffer to treat; it is modified in place
+ * @return the same buffer instance that was passed in
+ */
+ private StringBuilder TreatVowels( StringBuilder buffer ) {
+ for ( int c = 0; c < buffer.Length; c++ ) {
+ char ch = buffer[ c ] ;
+
+ if (c == 0) // first char
+ {
+ if (buffer.Length>1)
{
- if (buffer.Length>1)
- {
- if (ch == 'y' && IsVowel(buffer[ c + 1 ]))
- buffer[c] = 'Y';
- }
+ if (ch == 'y' && IsVowel(buffer[ c + 1 ] ))
+ buffer[c] = 'Y';
}
- else if (c == buffer.Length-1) // last char
+ }
+ else if (c == buffer.Length-1) // last char
+ {
+ if (ch == 'u' && buffer[ c - 1 ] == 'q')
+ buffer[c] = 'U';
+ if (ch == 'y' && IsVowel(buffer[ c - 1 ] ))
+ buffer[c] = 'Y';
+ }
+ else // other cases
+ {
+ // Interior character: both neighbours exist, so c - 1 and c + 1 are valid indexes.
+ if (ch == 'u')
{
- if (ch == 'u' && buffer[c - 1] == 'q')
+ if (buffer[ c - 1] == 'q')
+ buffer[c] = 'U';
+ else if (IsVowel(buffer[ c - 1 ] ) && IsVowel(buffer[ c + 1 ] ))
buffer[c] = 'U';
- if (ch == 'y' && IsVowel(buffer[ c - 1 ]))
- buffer[c] = 'Y';
}
- else // other cases
+ if (ch == 'i')
{
- if (ch == 'u')
- {
- if (buffer[ c - 1] == 'q')
- buffer[ c ] = 'U';
- else if (IsVowel(buffer[c - 1]) && IsVowel(buffer[c + 1]))
- buffer[c] = 'U';
- }
- if (ch == 'i')
- {
- if (IsVowel(buffer[c - 1]) && IsVowel(buffer[ c + 1 ]))
- buffer[c] = 'I';
- }
- if (ch == 'y')
- {
- if (IsVowel(buffer[c - 1]) || IsVowel(buffer[c + 1]))
- buffer[c] = 'Y';
- }
+ if (IsVowel(buffer[ c - 1 ] ) && IsVowel(buffer[ c + 1 ] ))
+ buffer[c] = 'I';
+ }
+ if (ch == 'y')
+ {
+ if (IsVowel(buffer[ c - 1 ] ) || IsVowel(buffer[ c + 1 ] ))
+ buffer[c] = 'Y';
}
}
-
- return buffer;
}
- /// <summary>
- /// Checks a term if it can be processed correctly.
- /// </summary>
- /// <returns>true if, and only if, the given term consists in letters.</returns>
- private bool IsStemmable( String term )
- {
- bool upper = false;
- int first = -1;
- for ( int c = 0; c < term.Length; c++ )
- {
- // Discard terms that contain non-letter characters.
- if ( !Char.IsLetter( term[c] ) )
- {
+ return buffer;
+ }
+
+ /**
+ * Checks whether a term can be processed by the stemmer.
+ * A term is rejected when it contains a non-letter character, more than one
+ * upper-case letter, or a single upper-case letter that is not the first character.
+ *
+ * @param term the term to check
+ * @return true if the term passes all of the checks above, false otherwise
+ */
+ private bool IsStemmable( String term ) {
+ bool upper = false;
+ int first = -1;
+ for ( int c = 0; c < term.Length; c++ ) {
+ // Discard terms that contain non-letter chars.
+ if ( !char.IsLetter( term[c] ) ) {
+ return false;
+ }
+ // Discard terms that contain multiple uppercase letters.
+ if ( char.IsUpper( term[ c] ) ) {
+ if ( upper ) {
return false;
}
- // Discard terms that contain multiple uppercase letters.
- if ( Char.IsUpper( term[c] ) )
- {
- if ( upper )
- {
- return false;
- }
- // First encountered uppercase letter, set flag and save
- // position.
- else
- {
- first = c;
- upper = true;
- }
+ // First encountered uppercase letter, set flag and save
+ // position.
+ else {
+ first = c;
+ upper = true;
}
}
- // Discard the term if it contains a single uppercase letter that
- // is not starting the term.
- if ( first > 0 )
- {
- return false;
- }
- return true;
}
- }
+ // Discard the term if it contains a single uppercase letter that
+ // is not starting the term (first stays -1 when there is none, 0 when it leads).
+ if ( first > 0 ) {
+ return false;
+ }
+ return true;
+ }
+}
+
}
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/EmptyTokenStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/EmptyTokenStream.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/EmptyTokenStream.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/EmptyTokenStream.cs Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -20,12 +20,14 @@ using Lucene.Net.Analysis;
namespace Lucene.Net.Analyzers.Miscellaneous
{
- public class EmptyTokenStream : TokenStream
+ /// <summary>
+ /// A token stream that is always exhausted: <see cref="IncrementToken"/> returns false on every call.
+ /// </summary>
+ public sealed class EmptyTokenStream : TokenStream
{
- [Obsolete("The new IncrementToken() and AttributeSource APIs should be used instead.")]
- public override Token Next(Token reusableToken)
+ /// <summary>Reports that no token is available.</summary>
+ /// <returns>false, unconditionally.</returns>
+ public sealed override bool IncrementToken()
{
- return null;
+ return false;
}
}
}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/InjectablePrefixAwareTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/InjectablePrefixAwareTokenFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/InjectablePrefixAwareTokenFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/InjectablePrefixAwareTokenFilter.cs Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,510 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analyzers.Miscellaneous
+{
+ /**
+ * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
+ * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Regex}
+ * (with behaviour identical to {@link String#split(String)}),
+ * and that combines the functionality of
+ * {@link org.apache.lucene.analysis.LetterTokenizer},
+ * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
+ * {@link org.apache.lucene.analysis.WhitespaceTokenizer},
+ * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
+ * multi-purpose class.
+ * <p>
+ * If you are unsure what exactly a regular expression should look like, consider
+ * prototyping by simply trying various expressions on some test texts via
+ * {@link String#split(String)}. Once you are satisfied, give that regex to
+ * RegexAnalyzer. Also see <a target="_blank"
+ * href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
+ * <p>
+ * This class can be considerably faster than the "normal" Lucene tokenizers.
+ * It can also serve as a building block in a compound Lucene
+ * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this
+ * stemming example:
+ * <pre>
+ * RegexAnalyzer pat = ...
+ * TokenStream tokenStream = new SnowballFilter(
+ * pat.TokenStream("content", "James is running round in the woods"),
+ * "English");
+ * </pre>
+ *
+ */
+ public class RegexAnalyzer : Analyzer
+ {
+
+ /**
+ * <code>"\\W+"</code>; splits text at runs of regex non-word characters.
+ * When an analyzer uses this pattern, TokenStream(String, String) takes a fast path
+ * that treats char.IsLetter(c) characters as token characters.
+ */
+ public static readonly Regex NON_WORD_Regex = new Regex("\\W+", RegexOptions.Compiled);
+
+ /**
+ * <code>"\\s+"</code>; splits text at runs of whitespace.
+ * When an analyzer uses this pattern, TokenStream(String, String) takes a fast path
+ * that treats every character for which char.IsWhiteSpace(c) is false as a token character.
+ */
+ public static readonly Regex WHITESPACE_Regex = new Regex("\\s+", RegexOptions.Compiled);
+
+ // Extended English stop word list used by EXTENDED_ANALYZER. Wrapped as unmodifiable so the
+ // shared static set cannot be changed by callers. Each entry must be a single token with no
+ // surrounding whitespace, otherwise it can never match a tokenized term.
+ private static readonly CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
+     CharArraySet.UnmodifiableSet(new CharArraySet(new[]{
+         "a", "about", "above", "across", "adj", "after", "afterwards",
+         "again", "against", "albeit", "all", "almost", "alone", "along",
+         "already", "also", "although", "always", "among", "amongst", "an",
+         "and", "another", "any", "anyhow", "anyone", "anything",
+         "anywhere", "are", "around", "as", "at", "be", "became", "because",
+         "become", "becomes", "becoming", "been", "before", "beforehand",
+         "behind", "being", "below", "beside", "besides", "between",
+         "beyond", "both", "but", "by", "can", "cannot", "co", "could",
+         "down", "during", "each", "eg", "either", "else", "elsewhere",
+         "enough", "etc", "even", "ever", "every", "everyone", "everything",
+         "everywhere", "except", "few", "first", "for", "former",
+         "formerly", "from", "further", "had", "has", "have", "he", "hence",
+         "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
+         "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
+         "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
+         "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
+         "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
+         "must", "my", "myself", "namely", "neither", "never",
+         "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
+         "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
+         "once", "one", "only", "onto", "or", "other", "others", "otherwise",
+         "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
+         "rather", "s", "same", "seem", "seemed", "seeming", "seems",
+         "several", "she", "should", "since", "so", "some", "somehow",
+         "someone", "something", "sometime", "sometimes", "somewhere",
+         "still", "such", "t", "than", "that", "the", "their", "them",
+         "themselves", "then", "thence", "there", "thereafter", "thereby",
+         "therefor", "therein", "thereupon", "these", "they", "this",
+         "those", "though", "through", "throughout", "thru", "thus", "to",
+         "together", "too", "toward", "towards", "under", "until", "up",
+         "upon", "us", "very", "via", "was", "we", "well", "were", "what",
+         "whatever", "whatsoever", "when", "whence", "whenever",
+         "whensoever", "where", "whereafter", "whereas", "whereat",
+         "whereby", "wherefrom", "wherein", "whereinto", "whereof",
+         "whereon", "whereto", "whereunto", "whereupon", "wherever",
+         "wherewith", "whether", "which", "whichever", "whichsoever",
+         "while", "whilst", "whither", "who", "whoever", "whole", "whom",
+         "whomever", "whomsoever", "whose", "whosoever", "why", "will",
+         "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
+         "xother", "xnote", "yet", "you", "your", "yours", "yourself",
+         "yourselves"
+     }, true));
+
+ /**
+ * A shared, lower-casing word analyzer built from NON_WORD_Regex and the
+ * StopAnalyzer English stop word set. Held in a static readonly field and
+ * intended to be shared freely across threads.
+ */
+ public static readonly RegexAnalyzer DEFAULT_ANALYZER = new RegexAnalyzer(
+ Version.LUCENE_CURRENT, NON_WORD_Regex, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+
+ /**
+ * A shared, lower-casing word analyzer with <b>extended</b> English stop words.
+ * Held in a static readonly field and intended to be shared freely across threads.
+ * The stop words are borrowed from
+ * http://thomas.loc.gov/home/stopwords.html, see
+ * http://thomas.loc.gov/home/all.about.inquery.html
+ */
+ public static readonly RegexAnalyzer EXTENDED_ANALYZER = new RegexAnalyzer(
+ Version.LUCENE_CURRENT, NON_WORD_Regex, true, EXTENDED_ENGLISH_STOP_WORDS);
+
+ // Separator pattern; the constructor swaps in NON_WORD_Regex / WHITESPACE_Regex when an equivalent pattern is passed.
+ private readonly Regex Regex;
+ // Whether emitted terms are lower-cased.
+ private readonly bool toLowerCase;
+ // Stop word set, or null when no stop word filtering applies (the constructor maps an empty set to null).
+ private readonly ISet<string> stopWords;
+
+ // Passed to StopFilter.GetEnablePositionIncrementsVersionDefault on the generic (non fast-path) route.
+ private readonly Version matchVersion;
+
+ /**
+  * Constructs a new instance with the given parameters.
+  *
+  * @param matchVersion forwarded to StopFilter.GetEnablePositionIncrementsVersionDefault
+  *        when a StopFilter is created for this analyzer
+  * @param Regex
+  *            a regular expression delimiting tokens; must not be null. A pattern
+  *            equivalent to NON_WORD_Regex or WHITESPACE_Regex (same text and
+  *            options) is replaced by that shared instance.
+  * @param toLowerCase
+  *            if <code>true</code>, tokens are lower-cased before being returned
+  * @param stopWords
+  *            if non-null and non-empty, tokens contained in this set are dropped
+  *            (after lower-casing, if applicable); an empty set is treated as null
+  * @throws ArgumentException if Regex is null
+  */
+ public RegexAnalyzer(Version matchVersion, Regex Regex, bool toLowerCase, ISet<string> stopWords)
+ {
+     if (Regex == null)
+         throw new ArgumentException("Regex must not be null");
+
+     // Canonicalize well-known patterns so later reference comparisons can select the fast path.
+     var canonical = Regex;
+     if (EqRegex(NON_WORD_Regex, Regex))
+     {
+         canonical = NON_WORD_Regex;
+     }
+     else if (EqRegex(WHITESPACE_Regex, Regex))
+     {
+         canonical = WHITESPACE_Regex;
+     }
+
+     this.matchVersion = matchVersion;
+     this.toLowerCase = toLowerCase;
+     this.Regex = canonical;
+     // An empty stop set filters nothing, so store null and skip filtering entirely.
+     this.stopWords = (stopWords != null && stopWords.Count == 0) ? null : stopWords;
+ }
+
+ /**
+  * Creates a token stream that tokenizes the given string into token terms
+  * (aka words).
+  *
+  * @param fieldName
+  *            the name of the field to tokenize (currently ignored).
+  * @param text
+  *            the string to tokenize; must not be null
+  * @return a new token stream
+  * @throws ArgumentException if text is null
+  */
+ public TokenStream TokenStream(String fieldName, String text)
+ {
+     // Ideally the Analyzer superclass should have a method with the same signature,
+     // with a default impl that simply delegates to the StringReader flavour.
+     if (text == null)
+         throw new ArgumentException("text must not be null");
+
+     // Fast paths: the two shared patterns are handled by a hand-written scanner
+     // that also applies the stop words itself.
+     if (Regex == NON_WORD_Regex)
+         return new FastStringTokenizer(text, true, toLowerCase, stopWords);
+     if (Regex == WHITESPACE_Regex)
+         return new FastStringTokenizer(text, false, toLowerCase, stopWords);
+
+     // Generic path: regex-driven tokenizer, optionally wrapped in a StopFilter.
+     TokenStream tokens = new RegexTokenizer(text, Regex, toLowerCase);
+     if (stopWords == null)
+         return tokens;
+     return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), tokens, stopWords);
+ }
+
+ /**
+  * Creates a token stream that tokenizes all the text in the given reader.
+  * The reader's full content is first read into a string (unless the reader is a
+  * FastStringReader, whose string is used directly) and then handed to
+  * <code>TokenStream(String, String)</code>.
+  *
+  * @param fieldName
+  *            the name of the field to tokenize (currently ignored).
+  * @param reader
+  *            the reader delivering the text
+  * @return a new token stream
+  * @throws Exception wrapping any IOException raised while reading
+  */
+ public override TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+     // fast path: the string is already available, no copying required
+     FastStringReader direct = reader as FastStringReader;
+     if (direct != null)
+     {
+         return TokenStream(fieldName, direct.GetString());
+     }
+
+     try
+     {
+         return TokenStream(fieldName, ToString(reader));
+     }
+     catch (IOException e)
+     {
+         throw new Exception("Wrapped Exception", e);
+     }
+ }
+
+ /**
+  * Indicates whether some other object is "equal to" this one. Two analyzers are
+  * equal when they agree on lower-casing, on the pattern (text and options) and
+  * on the stop word set.
+  *
+  * @param other
+  *            the reference object with which to compare.
+  * @return true if equal, false otherwise
+  */
+ public override bool Equals(Object other)
+ {
+     if (ReferenceEquals(this, other)) return true;
+
+     // Shortcut: the two shared instances are never equal to each other.
+     if (ReferenceEquals(this, DEFAULT_ANALYZER) && ReferenceEquals(other, EXTENDED_ANALYZER)) return false;
+     if (ReferenceEquals(other, DEFAULT_ANALYZER) && ReferenceEquals(this, EXTENDED_ANALYZER)) return false;
+
+     RegexAnalyzer that = other as RegexAnalyzer;
+     if (ReferenceEquals(that, null)) return false;
+
+     if (toLowerCase != that.toLowerCase) return false;
+     if (!EqRegex(Regex, that.Regex)) return false;
+     return Eq(stopWords, that.stopWords);
+ }
+
+ /**
+  * Returns a hash code consistent with Equals: it is derived from the pattern text,
+  * the regex options, the lower-casing flag and the stop word set. The pattern is
+  * hashed by its text because Equals compares patterns by text and options, not by
+  * instance; no per-instance constants are used so that analyzers equal to a shared
+  * instance hash the same as that instance.
+  *
+  * @return the hash code.
+  */
+ public override int GetHashCode()
+ {
+     unchecked // wrap-around is intended; avoids OverflowException in checked builds
+     {
+         int h = 1;
+         h = 31 * h + Regex.ToString().GetHashCode();
+         h = 31 * h + (int)Regex.Options;
+         h = 31 * h + (toLowerCase ? 1231 : 1237);
+         h = 31 * h + (stopWords != null ? stopWords.GetHashCode() : 0);
+         return h;
+     }
+ }
+
+ /** Null-tolerant equality: same reference, or o1 is non-null and o1.Equals(o2). */
+ private static bool Eq(Object o1, Object o2)
+ {
+     if (o1 == o2)
+     {
+         return true;
+     }
+     return o1 != null && o1.Equals(o2);
+ }
+
+ /** Pattern equality by instance, or by options plus pattern text; assumes p1 and p2 are not null. */
+ private static bool EqRegex(Regex p1, Regex p2)
+ {
+     if (p1 == p2)
+     {
+         return true;
+     }
+     if (p1.Options != p2.Options)
+     {
+         return false;
+     }
+     return p1.ToString() == p2.ToString();
+ }
+
+ /**
+  * Reads until end-of-stream and returns all read chars, finally closes the stream.
+  * Uses TextReader.ReadToEnd rather than a manual read loop: TextReader.Read returns 0
+  * (not a negative value) at end of stream, so a loop testing for a non-negative
+  * result would never terminate.
+  *
+  * @param input the input stream
+  * @return the complete remaining content of the reader
+  * @throws IOException if an I/O error occurs while reading the stream
+  */
+ private static String ToString(TextReader input)
+ {
+     try
+     {
+         return input.ReadToEnd();
+     }
+     finally
+     {
+         if (input != null) input.Dispose();
+     }
+ }
+
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Nested classes:
+ ///////////////////////////////////////////////////////////////////////////////
+ /**
+  * The work horse for arbitrary separator patterns: the text between successive
+  * regex matches (plus any leading and trailing remainder) is emitted as tokens.
+  */
+ private sealed class RegexTokenizer : TokenStream
+ {
+
+     private readonly String str;
+     private readonly bool toLowerCase;
+     // Current separator match; null once the input has been fully consumed.
+     private Match matcher;
+     // Index just past the previous separator, i.e. where the next token starts.
+     private int pos = 0;
+     private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture;
+     private TermAttribute termAtt;
+     private OffsetAttribute offsetAtt;
+
+     public RegexTokenizer(String str, Regex regex, bool toLowerCase)
+     {
+         this.str = str;
+         this.matcher = regex.Match(str);
+         this.toLowerCase = toLowerCase;
+         this.termAtt = AddAttribute<TermAttribute>();
+         this.offsetAtt = AddAttribute<OffsetAttribute>();
+     }
+
+     /**
+      * Advances to the next non-empty stretch of text between separators.
+      *
+      * @return true if a token was produced, false when the input is exhausted
+      */
+     public sealed override bool IncrementToken()
+     {
+         if (matcher == null) return false;
+         ClearAttributes();
+         while (true)
+         { // loop takes care of leading and trailing boundary cases
+             int start = pos;
+             int end;
+             bool isMatch = matcher.Success;
+             if (isMatch)
+             {
+                 end = matcher.Index;
+                 pos = matcher.Index + matcher.Length;
+             }
+             else
+             {
+                 end = str.Length;
+                 matcher = null; // we're finished
+             }
+
+             if (start != end)
+             { // non-empty match (header/trailer)
+                 // String.Substring takes (startIndex, length), not (start, end).
+                 String text = str.Substring(start, end - start);
+                 if (toLowerCase) text = text.ToLower(locale);
+                 termAtt.SetTermBuffer(text);
+                 offsetAtt.SetOffset(start, end);
+                 return true;
+             }
+             if (!isMatch) return false;
+             matcher = matcher.NextMatch();
+         }
+     }
+
+     /** Sets the final offset to the end of the input string. */
+     public override sealed void End()
+     {
+         // set final offset
+         int finalOffset = str.Length;
+         this.offsetAtt.SetOffset(finalOffset, finalOffset);
+     }
+ }
+
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Nested classes:
+ ///////////////////////////////////////////////////////////////////////////////
+ /**
+  * Special-case tokenizer for the two shared patterns; it scans the string by
+  * character class instead of running a regex, and skips stop words itself.
+  */
+ private sealed class FastStringTokenizer : TokenStream
+ {
+
+     private readonly String str;
+     // Index at which the next scan resumes.
+     private int pos;
+     // true: token chars are letters; false: token chars are non-whitespace.
+     private readonly bool isLetter;
+     private readonly bool toLowerCase;
+     private readonly ISet<string> stopWords;
+     private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture;
+     private TermAttribute termAtt;
+     private OffsetAttribute offsetAtt;
+
+     public FastStringTokenizer(String str, bool isLetter, bool toLowerCase, ISet<string> stopWords)
+     {
+         this.str = str;
+         this.isLetter = isLetter;
+         this.toLowerCase = toLowerCase;
+         this.stopWords = stopWords;
+         this.termAtt = AddAttribute<TermAttribute>();
+         this.offsetAtt = AddAttribute<OffsetAttribute>();
+     }
+
+     /**
+      * Advances to the next token that is not a stop word.
+      *
+      * @return true if a token was produced, false when the input is exhausted
+      */
+     public override bool IncrementToken()
+     {
+         ClearAttributes();
+         // cache loop instance vars (performance)
+         String s = str;
+         int len = s.Length;
+         int i = pos;
+         bool letter = isLetter;
+
+         int start = 0;
+         String text;
+         do
+         {
+             // find beginning of token
+             text = null;
+             while (i < len && !IsTokenChar(s[i], letter))
+             {
+                 i++;
+             }
+
+             if (i < len)
+             { // found beginning; now find end of token
+                 start = i;
+                 while (i < len && IsTokenChar(s[i], letter))
+                 {
+                     i++;
+                 }
+
+                 // String.Substring takes (startIndex, length), not (start, end).
+                 text = s.Substring(start, i - start);
+                 if (toLowerCase) text = text.ToLower(locale);
+             }
+         } while (text != null && IsStopWord(text));
+
+         pos = i;
+         if (text == null)
+         {
+             return false;
+         }
+         termAtt.SetTermBuffer(text);
+         offsetAtt.SetOffset(start, i);
+         return true;
+     }
+
+     /** Sets the final offset to the end of the input string. */
+     public override sealed void End()
+     {
+         // set final offset
+         int finalOffset = str.Length;
+         this.offsetAtt.SetOffset(finalOffset, finalOffset);
+     }
+
+     /** Letters count as token chars in letter mode; otherwise any non-whitespace char does. */
+     private bool IsTokenChar(char c, bool isLetter)
+     {
+         return isLetter ? char.IsLetter(c) : !char.IsWhiteSpace(c);
+     }
+
+     /** True when a stop set is configured and contains the given (already lower-cased, if applicable) text. */
+     private bool IsStopWord(string text)
+     {
+         return stopWords != null && stopWords.Contains(text);
+     }
+
+ }
+
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Nested classes:
+ ///////////////////////////////////////////////////////////////////////////////
+ /**
+  * A StringReader that exposes its contained string for fast direct access.
+  * TokenStream(String, TextReader) checks for this type to avoid re-reading the text.
+  */
+ sealed class FastStringReader : StringReader
+ {
+
+     private readonly string s;
+
+     // Declared internal: with no modifier a C# constructor is private, which would
+     // make this type impossible to instantiate from the enclosing analyzer.
+     internal FastStringReader(string s)
+         : base(s)
+     {
+         this.s = s;
+     }
+
+     /** Returns the string this reader was constructed with. */
+     internal string GetString()
+     {
+         return s;
+     }
+ }
+
+ }
+}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAndSuffixAwareTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAndSuffixAwareTokenFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAndSuffixAwareTokenFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAndSuffixAwareTokenFilter.cs Mon Nov 21 04:44:55 2011
@@ -64,26 +64,6 @@ namespace Lucene.Net.Analyzers.Miscellan
return _suffix.IncrementToken();
}
- /// <summary>
- /// @deprecated Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
- /// </summary>
- /// <param name="reusableToken"></param>
- /// <returns></returns>
- [Obsolete("The new IncrementToken() and AttributeSource APIs should be used instead.")]
- public override sealed Token Next(Token reusableToken)
- {
- return base.Next(reusableToken);
- }
-
- /// <summary>
- /// @deprecated Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
- /// </summary>
- [Obsolete("The returned Token is a \"full private copy\" (not re-used across calls to Next()) but will be slower than calling Next(Token) or using the new IncrementToken() method with the new AttributeSource API.")]
- public override sealed Token Next()
- {
- return base.Next();
- }
-
public override void Reset()
{
_suffix.Reset();
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs Mon Nov 21 04:44:55 2011
@@ -60,20 +60,20 @@ namespace Lucene.Net.Analyzers.Miscellan
_prefixExhausted = false;
// ReSharper disable DoNotCallOverridableMethodsInConstructor
- _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
- _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
- _payloadAtt = (PayloadAttribute) AddAttribute(typeof (PayloadAttribute));
- _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
- _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
- _flagsAtt = (FlagsAttribute) AddAttribute(typeof (FlagsAttribute));
+ _termAtt = AddAttribute<TermAttribute>();
+ _posIncrAtt = AddAttribute<PositionIncrementAttribute>();
+ _payloadAtt = AddAttribute<PayloadAttribute>();
+ _offsetAtt = AddAttribute<OffsetAttribute>();
+ _typeAtt = AddAttribute<TypeAttribute>();
+ _flagsAtt = AddAttribute<FlagsAttribute>();
// ReSharper restore DoNotCallOverridableMethodsInConstructor
- _pTermAtt = (TermAttribute) prefix.AddAttribute(typeof (TermAttribute));
- _pPosIncrAtt = (PositionIncrementAttribute) prefix.AddAttribute(typeof (PositionIncrementAttribute));
- _pPayloadAtt = (PayloadAttribute) prefix.AddAttribute(typeof (PayloadAttribute));
- _pOffsetAtt = (OffsetAttribute) prefix.AddAttribute(typeof (OffsetAttribute));
- _pTypeAtt = (TypeAttribute) prefix.AddAttribute(typeof (TypeAttribute));
- _pFlagsAtt = (FlagsAttribute) prefix.AddAttribute(typeof (FlagsAttribute));
+ _pTermAtt = prefix.AddAttribute<TermAttribute>();
+ _pPosIncrAtt = prefix.AddAttribute<PositionIncrementAttribute>();
+ _pPayloadAtt = prefix.AddAttribute<PayloadAttribute>();
+ _pOffsetAtt = prefix.AddAttribute<OffsetAttribute>();
+ _pTypeAtt = prefix.AddAttribute<TypeAttribute>();
+ _pFlagsAtt = prefix.AddAttribute<FlagsAttribute>();
}
public TokenStream Prefix { get; set; }
@@ -114,27 +114,6 @@ namespace Lucene.Net.Analyzers.Miscellan
return true;
}
- /// <summary>
- /// @deprecated Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
- /// </summary>
- /// <param name="reusableToken"></param>
- /// <returns></returns>
- [Obsolete("The new IncrementToken() and AttributeSource APIs should be used instead.")]
- public override sealed Token Next(Token reusableToken)
- {
- return base.Next(reusableToken);
- }
-
- /// <summary>
- /// @deprecated Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
- /// </summary>
- /// <returns></returns>
- [Obsolete("The returned Token is a \"full private copy\" (not re-used across calls to Next()) but will be slower than calling Next(Token) or using the new IncrementToken() method with the new AttributeSource API.")]
- public override sealed Token Next()
- {
- return base.Next();
- }
-
private void SetCurrentToken(Token token)
{
if (token == null) return;
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -26,26 +26,23 @@ namespace Lucene.Net.Analyzers.Miscellan
/// <summary>
/// A TokenStream containing a single token.
/// </summary>
- public class SingleTokenTokenStream : TokenStream
+ public sealed class SingleTokenTokenStream : TokenStream
{
- private readonly AttributeImpl _tokenAtt;
private bool _exhausted;
// The token needs to be immutable, so work with clones!
private Token _singleToken;
+ private readonly AttributeImpl _tokenAtt;
public SingleTokenTokenStream(Token token)
+ : base(Token.TOKEN_ATTRIBUTE_FACTORY)
{
Debug.Assert(token != null, "Token was null!");
_singleToken = (Token) token.Clone();
- // ReSharper disable DoNotCallOverridableMethodsInConstructor
- _tokenAtt = (AttributeImpl) AddAttribute(typeof (TermAttribute));
- // ReSharper restore DoNotCallOverridableMethodsInConstructor
-
- Debug.Assert(_tokenAtt is Token || _tokenAtt.GetType().Name.Equals(typeof (TokenWrapper).Name),
- "Token Attribute is the wrong type! Type was: " + _tokenAtt.GetType().Name + " but expected " +
- typeof (TokenWrapper).Name);
+ _tokenAtt = (AttributeImpl)AddAttribute<TermAttribute>();
+
+ Debug.Assert(_tokenAtt is Token);
}
public override sealed bool IncrementToken()
@@ -60,29 +57,6 @@ namespace Lucene.Net.Analyzers.Miscellan
return true;
}
- /// <summary>
- /// @deprecated Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
- /// </summary>
- /// <param name="reusableToken"></param>
- /// <returns></returns>
- [Obsolete("The new IncrementToken() and AttributeSource APIs should be used instead.")]
- public override sealed Token Next(Token reusableToken)
- {
- return base.Next(reusableToken);
- }
-
- /// <summary>
- /// @deprecated Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
- /// </summary>
- /// <returns></returns>
- [Obsolete(
- "The returned Token is a \"full private copy\" (not re-used across calls to Next()) but will be slower than calling Next(Token) or using the new IncrementToken() method with the new AttributeSource API."
- )]
- public override sealed Token Next()
- {
- return base.Next();
- }
-
public override void Reset()
{
_exhausted = false;
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -15,6 +15,7 @@
* limitations under the License.
*/
+using System;
using System.IO;
using System.Collections;
@@ -24,6 +25,45 @@ using Lucene.Net.Util;
namespace Lucene.Net.Analysis.NGram
{
+ public static class SideExtensions
+ {
+ public static string GetLabel(this Side theSide)
+ {
+ switch(theSide)
+ {
+ case Side.FRONT:
+ return "front";
+ case Side.BACK:
+ return "back";
+ default:
+ throw new ArgumentException(string.Format("{0} is not a valid value for EdgeNGramTokenFilter.Side", theSide));
+ }
+ }
+
+ public static Side GetSide(string sideName)
+ {
+ if (Side.FRONT.GetLabel() == sideName)
+ {
+ return Side.FRONT;
+ }
+
+ if (Side.BACK.GetLabel() == sideName)
+ {
+ return Side.BACK;
+ }
+
+ return (Side)(-1); // TODO: returns an out-of-range Side value (-1) because an enum cannot be null. Should an exception be thrown instead?
+ }
+ }
+
+ /// <summary>
+ /// Specifies which side of the input the n-gram should be generated from (FRONT = beginning edge, BACK = ending edge).
+ /// </summary>
+ public enum Side
+ {
+ FRONT,
+ BACK
+ }
/**
* Tokenizes the given token into n-grams of given size(s).
@@ -31,44 +71,12 @@ namespace Lucene.Net.Analysis.NGram
* This <see cref="TokenFilter"/> creates n-grams from the beginning edge or ending edge of an input token.
* </p>
*/
- public class EdgeNGramTokenFilter : TokenFilter
+ public sealed class EdgeNGramTokenFilter : TokenFilter
{
public static Side DEFAULT_SIDE = Side.FRONT;
public static int DEFAULT_MAX_GRAM_SIZE = 1;
public static int DEFAULT_MIN_GRAM_SIZE = 1;
- // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
- /** Specifies which side of the input the n-gram should be generated from */
- public class Side
- {
- private string label;
-
- /** Get the n-gram from the front of the input */
- public static Side FRONT = new Side("front");
-
- /** Get the n-gram from the end of the input */
- public static Side BACK = new Side("back");
-
- // Private ctor
- private Side(string label) { this.label = label; }
-
- public string getLabel() { return label; }
-
- // Get the appropriate Side from a string
- public static Side getSide(string sideName)
- {
- if (FRONT.getLabel().Equals(sideName))
- {
- return FRONT;
- }
- else if (BACK.getLabel().Equals(sideName))
- {
- return BACK;
- }
- return null;
- }
- }
-
private int minGram;
private int maxGram;
private Side side;
@@ -83,8 +91,8 @@ namespace Lucene.Net.Analysis.NGram
protected EdgeNGramTokenFilter(TokenStream input) : base(input)
{
- this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
- this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ this.termAtt = AddAttribute<TermAttribute>();
+ this.offsetAtt = AddAttribute<OffsetAttribute>();
}
/**
@@ -100,7 +108,7 @@ namespace Lucene.Net.Analysis.NGram
{
- if (side == null)
+ if (side != Side.FRONT && side != Side.BACK)
{
throw new System.ArgumentException("sideLabel must be either front or back");
}
@@ -118,8 +126,8 @@ namespace Lucene.Net.Analysis.NGram
this.minGram = minGram;
this.maxGram = maxGram;
this.side = side;
- this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
- this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ this.termAtt = AddAttribute<TermAttribute>();
+ this.offsetAtt = AddAttribute<OffsetAttribute>();
}
/**
@@ -131,9 +139,8 @@ namespace Lucene.Net.Analysis.NGram
* <param name="maxGram">the largest n-gram to generate</param>
*/
public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram)
- : this(input, Side.getSide(sideLabel), minGram, maxGram)
+ : this(input, SideExtensions.GetSide(sideLabel), minGram, maxGram)
{
-
}
public override bool IncrementToken()
@@ -173,22 +180,6 @@ namespace Lucene.Net.Analysis.NGram
}
}
- /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
- * not be overridden. Delegates to the backwards compatibility layer. */
- [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
- public override Token Next(Token reusableToken)
- {
- return base.Next(reusableToken);
- }
-
- /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
- * not be overridden. Delegates to the backwards compatibility layer. */
- [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
- public override Token Next()
- {
- return base.Next();
- }
-
public override void Reset()
{
base.Reset();