You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by di...@apache.org on 2011/05/15 19:51:58 UTC
[Lucene.Net] svn commit: r1103482 [1/2] - in
/incubator/lucene.net/branches/Lucene.Net_2_9_4g: src/contrib/Analyzers/
src/contrib/Analyzers/AR/ src/contrib/Analyzers/BR/
src/contrib/Analyzers/CJK/ src/contrib/Analyzers/Cz/
src/contrib/Analyzers/De/ src/contrib/Anal...
Author: digy
Date: Sun May 15 17:51:57 2011
New Revision: 1103482
URL: http://svn.apache.org/viewvc?rev=1103482&view=rev
Log:
[LUCENENET-405] contrib/Analysis.NGram
[LUCENENET-412] Required changes for Contrib/Analyzers (stopword lists changed from hashtable to ICollection<string>)
Added:
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenFilter.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenizer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/
incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs
Modified:
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/AR/ArabicAnalyzer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianStemFilter.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKAnalyzer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Contrib.Analyzers.csproj
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cz/CzechAnalyzer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanAnalyzer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanStemFilter.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/WordlistLoader.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchStemFilter.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchAnalyzer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemFilter.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemmer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/WordlistLoader.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Ru/RussianAnalyzer.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/WordlistLoader.cs
incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/AR/ArabicAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/AR/ArabicAnalyzer.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/AR/ArabicAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/AR/ArabicAnalyzer.cs Sun May 15 17:51:57 2011
@@ -17,6 +17,7 @@
using System.IO;
using System.Collections;
+using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Util;
@@ -54,7 +55,7 @@ namespace Lucene.Net.Analysis.AR
/**
* Contains the stopwords used with the StopFilter.
*/
- private Hashtable stoptable = new Hashtable();
+ private ICollection<string> stoptable = new List<string>();
/**
* The comment character in the stopwords file. All lines prefixed with this will be ignored
*/
@@ -84,7 +85,7 @@ namespace Lucene.Net.Analysis.AR
while (!reader.EndOfStream)
{
string word = reader.ReadLine();
- stoptable.Add(word, word);
+ stoptable.Add(word);
}
}
}
@@ -112,16 +113,16 @@ namespace Lucene.Net.Analysis.AR
*
* @deprecated Use {@link #ArabicAnalyzer(Version, ICollection<string>)} instead
*/
- public ArabicAnalyzer(Hashtable stopwords) : this(Version.LUCENE_24, stopwords)
+ public ArabicAnalyzer(ICollection<string> stopwords) : this(Version.LUCENE_24, stopwords)
{
}
/**
* Builds an analyzer with the given stop words.
*/
- public ArabicAnalyzer(Version matchVersion, Hashtable stopwords)
+ public ArabicAnalyzer(Version matchVersion, ICollection<string> stopwords)
{
- stoptable = new Hashtable(stopwords);
+ stoptable = new List<string>(stopwords);
this.matchVersion = matchVersion;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs Sun May 15 17:51:57 2011
@@ -16,6 +16,7 @@
*/
using System.Collections;
+using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
@@ -60,12 +61,12 @@ namespace Lucene.Net.Analysis.BR
/**
* Contains the stopwords used with the StopFilter.
*/
- private Hashtable stoptable = new Hashtable();
+ private ICollection<string> stoptable = new List<string>();
/**
* Contains words that should be indexed but not stemmed.
*/
- private Hashtable excltable = new Hashtable();
+ private ICollection<string> excltable = new List<string>();
/**
* Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
@@ -86,7 +87,7 @@ namespace Lucene.Net.Analysis.BR
/**
* Builds an analyzer with the given stop words.
*/
- public BrazilianAnalyzer(Hashtable stopwords)
+ public BrazilianAnalyzer(ICollection<string> stopwords)
{
stoptable = stopwords;
}
@@ -109,7 +110,7 @@ namespace Lucene.Net.Analysis.BR
/**
* Builds an exclusion list from an ICollection<string>.
*/
- public void SetStemExclusionTable(Hashtable exclusionlist)
+ public void SetStemExclusionTable(ICollection<string> exclusionlist)
{
excltable = exclusionlist;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianStemFilter.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/BR/BrazilianStemFilter.cs Sun May 15 17:51:57 2011
@@ -17,6 +17,7 @@
using Lucene.Net.Analysis;
using System.Collections;
+using System.Collections.Generic;
/**
@@ -33,7 +34,7 @@ namespace Lucene.Net.Analysis.BR
* The actual token in the input stream.
*/
private BrazilianStemmer stemmer = null;
- private Hashtable exclusions = null;
+ private ICollection<string> exclusions = null;
public BrazilianStemFilter(TokenStream input)
: base(input)
@@ -41,7 +42,7 @@ namespace Lucene.Net.Analysis.BR
stemmer = new BrazilianStemmer();
}
- public BrazilianStemFilter(TokenStream input, Hashtable exclusiontable)
+ public BrazilianStemFilter(TokenStream input, ICollection<string> exclusiontable)
: this(input)
{
this.exclusions = exclusiontable;
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKAnalyzer.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKAnalyzer.cs Sun May 15 17:51:57 2011
@@ -1,6 +1,7 @@
using System;
using System.IO;
using System.Collections;
+using System.Collections.Generic;
using Lucene.Net.Analysis;
namespace Lucene.Net.Analysis.CJK
@@ -91,7 +92,7 @@ namespace Lucene.Net.Analysis.CJK
/// <summary>
/// stop word list
/// </summary>
- private Hashtable stopTable;
+ private ICollection<string> stopTable;
//~ Constructors -----------------------------------------------------------
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Contrib.Analyzers.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Contrib.Analyzers.csproj?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Contrib.Analyzers.csproj (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Contrib.Analyzers.csproj Sun May 15 17:51:57 2011
@@ -9,7 +9,7 @@
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Lucene.Net.Analyzers</RootNamespace>
- <AssemblyName>Lucene.Net.Contrib.Analyzers</AssemblyName>
+ <AssemblyName>Lucene.Net.Analyzers</AssemblyName>
<TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<FileUpgradeFlags>
@@ -63,6 +63,10 @@
<Compile Include="Fr\FrenchAnalyzer.cs" />
<Compile Include="Fr\FrenchStemFilter.cs" />
<Compile Include="Fr\FrenchStemmer.cs" />
+ <Compile Include="NGram\EdgeNGramTokenFilter.cs" />
+ <Compile Include="NGram\EdgeNGramTokenizer.cs" />
+ <Compile Include="NGram\NGramTokenFilter.cs" />
+ <Compile Include="NGram\NGramTokenizer.cs" />
<Compile Include="Nl\DutchAnalyzer.cs" />
<Compile Include="Nl\DutchStemFilter.cs" />
<Compile Include="Nl\DutchStemmer.cs" />
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cz/CzechAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cz/CzechAnalyzer.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cz/CzechAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cz/CzechAnalyzer.cs Sun May 15 17:51:57 2011
@@ -2,6 +2,7 @@ using System;
using System.IO;
using System.Text;
using System.Collections;
+using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.De;
@@ -102,7 +103,7 @@ namespace Lucene.Net.Analysis.Cz
/// <summary>
/// Contains the stopwords used with the StopFilter.
/// </summary>
- private Hashtable stoptable = new Hashtable();
+ private ICollection<string> stoptable = new List<string>();
/// <summary>
/// Builds an analyzer.
@@ -123,7 +124,7 @@ namespace Lucene.Net.Analysis.Cz
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
- public CzechAnalyzer( Hashtable stopwords )
+ public CzechAnalyzer(ICollection<string> stopwords)
{
stoptable = stopwords;
}
@@ -145,13 +146,13 @@ namespace Lucene.Net.Analysis.Cz
{
if ( wordfile == null )
{
- stoptable = new Hashtable();
+ stoptable = new List<string>();
return;
}
try
{
// clear any previous table (if present)
- stoptable = new Hashtable();
+ stoptable = new List<string>();
StreamReader isr;
if (encoding == null)
@@ -162,7 +163,7 @@ namespace Lucene.Net.Analysis.Cz
String word;
while ( ( word = isr.ReadLine() ) != null )
{
- stoptable[word] = word;
+ stoptable.Add(word);
}
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanAnalyzer.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanAnalyzer.cs Sun May 15 17:51:57 2011
@@ -1,6 +1,7 @@
using System;
using System.IO;
using System.Collections;
+using System.Collections.Generic;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis;
@@ -37,12 +38,12 @@ namespace Lucene.Net.Analysis.De
/// <summary>
/// Contains the stopwords used with the StopFilter.
/// </summary>
- private Hashtable stoptable = new Hashtable();
+ private ICollection<string> stoptable = new List<string>();
/// <summary>
/// Contains words that should be indexed but not stemmed.
/// </summary>
- private Hashtable excltable = new Hashtable();
+ private ICollection<string> excltable = new List<string>();
/// <summary>
/// Builds an analyzer.
@@ -65,7 +66,7 @@ namespace Lucene.Net.Analysis.De
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="stopwords"></param>
- public GermanAnalyzer( Hashtable stopwords )
+ public GermanAnalyzer(ICollection<string> stopwords)
{
stoptable = stopwords;
}
@@ -92,7 +93,7 @@ namespace Lucene.Net.Analysis.De
/// Builds an exclusion list from an ICollection<string>.
/// </summary>
/// <param name="exclusionlist"></param>
- public void SetStemExclusionTable( Hashtable exclusionlist )
+ public void SetStemExclusionTable(ICollection<string> exclusionlist)
{
excltable = exclusionlist;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanStemFilter.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/GermanStemFilter.cs Sun May 15 17:51:57 2011
@@ -1,6 +1,7 @@
using System;
using System.IO;
using System.Collections;
+using System.Collections.Generic;
namespace Lucene.Net.Analysis.De
{
@@ -16,7 +17,7 @@ namespace Lucene.Net.Analysis.De
/// </summary>
private Token token = null;
private GermanStemmer stemmer = null;
- private Hashtable exclusions = null;
+ private ICollection<string> exclusions = null;
public GermanStemFilter( TokenStream _in ) : base(_in)
{
@@ -28,7 +29,7 @@ namespace Lucene.Net.Analysis.De
/// </summary>
/// <param name="_in"></param>
/// <param name="exclusiontable"></param>
- public GermanStemFilter( TokenStream _in, Hashtable exclusiontable ): this(_in)
+ public GermanStemFilter(TokenStream _in, ICollection<string> exclusiontable) : this(_in)
{
exclusions = exclusiontable;
}
@@ -77,7 +78,7 @@ namespace Lucene.Net.Analysis.De
/// Set an alternative exclusion list for this filter.
/// </summary>
/// <param name="exclusiontable"></param>
- public void SetExclusionTable( Hashtable exclusiontable )
+ public void SetExclusionTable(ICollection<string> exclusiontable)
{
exclusions = exclusiontable;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/WordlistLoader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/WordlistLoader.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/WordlistLoader.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/De/WordlistLoader.cs Sun May 15 17:51:57 2011
@@ -1,6 +1,7 @@
using System;
using System.IO;
using System.Collections;
+using System.Collections.Generic;
namespace Lucene.Net.Analysis.De
{
@@ -16,11 +17,11 @@ namespace Lucene.Net.Analysis.De
/// <param name="path">Path to the wordlist</param>
/// <param name="wordfile">Name of the wordlist</param>
/// <returns></returns>
- public static Hashtable GetWordtable( String path, String wordfile )
+ public static ICollection<string> GetWordtable(String path, String wordfile)
{
if ( path == null || wordfile == null )
{
- return new Hashtable();
+ return new List<string>();
}
return GetWordtable(new FileInfo(path + "\\" + wordfile));
}
@@ -29,11 +30,11 @@ namespace Lucene.Net.Analysis.De
/// </summary>
/// <param name="wordfile">Complete path to the wordlist</param>
/// <returns></returns>
- public static Hashtable GetWordtable( String wordfile )
+ public static ICollection<string> GetWordtable(String wordfile)
{
if ( wordfile == null )
{
- return new Hashtable();
+ return new List<string>();
}
return GetWordtable( new FileInfo( wordfile ) );
}
@@ -43,13 +44,14 @@ namespace Lucene.Net.Analysis.De
/// </summary>
/// <param name="wordfile">File containing the wordlist</param>
/// <returns></returns>
- public static Hashtable GetWordtable( FileInfo wordfile )
+ public static ICollection<string> GetWordtable(FileInfo wordfile)
{
if ( wordfile == null )
{
- return new Hashtable();
+ return new List<string>();
}
- Hashtable result = null;
+
+ ICollection<string> result = null;
try
{
StreamReader lnr = new StreamReader(wordfile.FullName);
@@ -72,7 +74,7 @@ namespace Lucene.Net.Analysis.De
// On error, use an empty table
catch (IOException)
{
- result = new Hashtable();
+ result = new List<string>();
}
return result;
}
@@ -83,12 +85,12 @@ namespace Lucene.Net.Analysis.De
/// <param name="words">Word that where read</param>
/// <param name="length">Amount of words that where read into <tt>words</tt></param>
/// <returns></returns>
- private static Hashtable MakeWordTable( String[] words, int length )
+ private static ICollection<string> MakeWordTable(String[] words, int length)
{
- Hashtable table = new Hashtable( length );
+ List<string> table = new List<string>( length );
for ( int i = 0; i < length; i++ )
{
- table.Add(words[i], words[i]);
+ table.Add(words[i]);
}
return table;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs Sun May 15 17:51:57 2011
@@ -2,6 +2,7 @@ using System;
using System.IO;
using System.Text;
using System.Collections;
+using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.De;
@@ -108,12 +109,12 @@ namespace Lucene.Net.Analysis.Fr
/// <summary>
/// Contains the stopwords used with the StopFilter.
/// </summary>
- private Hashtable stoptable = new Hashtable();
+ private ICollection<string> stoptable = new List<string>();
/// <summary>
/// Contains words that should be indexed but not stemmed.
/// </summary>
- private Hashtable excltable = new Hashtable();
+ private ICollection<string> excltable = new List<string>();
/// <summary>
/// Builds an analyzer.
@@ -134,7 +135,7 @@ namespace Lucene.Net.Analysis.Fr
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
- public FrenchAnalyzer( Hashtable stopwords )
+ public FrenchAnalyzer(ICollection<string> stopwords)
{
stoptable = stopwords;
}
@@ -158,7 +159,7 @@ namespace Lucene.Net.Analysis.Fr
/// <summary>
/// Builds an exclusion list from an ICollection<string>.
/// </summary>
- public void SetStemExclusionTable( Hashtable exclusionlist )
+ public void SetStemExclusionTable(ICollection<string> exclusionlist)
{
excltable = exclusionlist;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchStemFilter.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Fr/FrenchStemFilter.cs Sun May 15 17:51:57 2011
@@ -2,6 +2,7 @@ using System;
using System.IO;
using System.Text;
using System.Collections;
+using System.Collections.Generic;
using Lucene.Net.Analysis;
@@ -77,7 +78,7 @@ namespace Lucene.Net.Analysis.Fr
/// </summary>
private Token token = null;
private FrenchStemmer stemmer = null;
- private Hashtable exclusions = null;
+ private ICollection<string> exclusions = null;
public FrenchStemFilter( TokenStream _in ) : base(_in)
{
@@ -87,7 +88,7 @@ namespace Lucene.Net.Analysis.Fr
/// <summary>
/// Builds a FrenchStemFilter that uses an exclusiontable.
/// </summary>
- public FrenchStemFilter( TokenStream _in, Hashtable exclusiontable ) : this( _in )
+ public FrenchStemFilter(TokenStream _in, ICollection<string> exclusiontable) : this(_in)
{
exclusions = exclusiontable;
}
@@ -135,7 +136,7 @@ namespace Lucene.Net.Analysis.Fr
/// <summary>
/// Set an alternative exclusion list for this filter.
/// </summary>
- public void SetExclusionTable( Hashtable exclusiontable )
+ public void SetExclusionTable(ICollection<string> exclusiontable)
{
exclusions = exclusiontable;
}
Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs?rev=1103482&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs Sun May 15 17:51:57 2011
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tokenizes the given token into n-grams of given size(s).
+ * <p>
+ * This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token.
+ * </p>
+ */
+ public class EdgeNGramTokenFilter : TokenFilter
+ {
+ public static Side DEFAULT_SIDE = Side.FRONT;
+ public static int DEFAULT_MAX_GRAM_SIZE = 1;
+ public static int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
+ /** Specifies which side of the input the n-gram should be generated from */
+ public class Side
+ {
+ private string label;
+
+ /** Get the n-gram from the front of the input */
+ public static Side FRONT = new Side("front");
+
+ /** Get the n-gram from the end of the input */
+ public static Side BACK = new Side("back");
+
+ // Private ctor
+ private Side(string label) { this.label = label; }
+
+ public string getLabel() { return label; }
+
+ // Get the appropriate Side from a string
+ public static Side getSide(string sideName)
+ {
+ if (FRONT.getLabel().Equals(sideName))
+ {
+ return FRONT;
+ }
+ else if (BACK.getLabel().Equals(sideName))
+ {
+ return BACK;
+ }
+ return null;
+ }
+ }
+
+ private int minGram;
+ private int maxGram;
+ private Side side;
+ private char[] curTermBuffer;
+ private int curTermLength;
+ private int curGramSize;
+ private int tokStart;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+
+ protected EdgeNGramTokenFilter(TokenStream input) : base(input)
+ {
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ }
+
+ /**
+ * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+ *
+ * @param input {@link TokenStream} holding the input to be tokenized
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
+ : base(input)
+ {
+
+
+ if (side == null)
+ {
+ throw new System.ArgumentException("sideLabel must be either front or back");
+ }
+
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ this.side = side;
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ }
+
+ /**
+ * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+ *
+ * @param input {@link TokenStream} holding the input to be tokenized
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram)
+ : this(input, Side.getSide(sideLabel), minGram, maxGram)
+ {
+
+ }
+
+ public override bool IncrementToken()
+ {
+ while (true)
+ {
+ if (curTermBuffer == null)
+ {
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
+ else
+ {
+ curTermBuffer = (char[])termAtt.TermBuffer().Clone();
+ curTermLength = termAtt.TermLength();
+ curGramSize = minGram;
+ tokStart = offsetAtt.StartOffset();
+ }
+ }
+ if (curGramSize <= maxGram)
+ {
+ if (!(curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
+ || curGramSize > maxGram))
+ { // if we have hit the end of our n-gram size range, quit
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
+ int end = start + curGramSize;
+ ClearAttributes();
+ offsetAtt.SetOffset(tokStart + start, tokStart + end);
+ termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
+ curGramSize++;
+ return true;
+ }
+ }
+ curTermBuffer = null;
+ }
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next(Token reusableToken)
+ {
+ return base.Next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next()
+ {
+ return base.Next();
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ curTermBuffer = null;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs?rev=1103482&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs Sun May 15 17:51:57 2011
@@ -0,0 +1,271 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tokenizes the input from an edge into n-grams of given size(s).
+ * <p>
+     * This {@link Tokenizer} creates n-grams from the beginning edge or ending edge of an input token.
+ * MaxGram can't be larger than 1024 because of limitation.
+ * </p>
+ */
+ public class EdgeNGramTokenizer : Tokenizer
+ {
+ public static Side DEFAULT_SIDE = Side.FRONT;
+ public static int DEFAULT_MAX_GRAM_SIZE = 1;
+ public static int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+ // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
+ /** Specifies which side of the input the n-gram should be generated from */
+ public class Side
+ {
+ private string label;
+
+ /** Get the n-gram from the front of the input */
+ public static Side FRONT = new Side("front");
+
+ /** Get the n-gram from the end of the input */
+ public static Side BACK = new Side("back");
+
+ // Private ctor
+ private Side(string label) { this.label = label; }
+
+
+ public string getLabel() { return label; }
+
+ // Get the appropriate Side from a string
+ public static Side getSide(string sideName)
+ {
+ if (FRONT.getLabel().Equals(sideName))
+ {
+ return FRONT;
+ }
+ else if (BACK.getLabel().Equals(sideName))
+ {
+ return BACK;
+ }
+ return null;
+ }
+ }
+
+ private int minGram;
+ private int maxGram;
+ private int gramSize;
+ private Side side;
+ private bool started = false;
+ private int inLen;
+ private string inStr;
+
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(TextReader input, Side side, int minGram, int maxGram)
+ : base(input)
+ {
+ init(side, minGram, maxGram);
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param source {@link AttributeSource} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(AttributeSource source, TextReader input, Side side, int minGram, int maxGram)
+ : base(source, input)
+ {
+
+ init(side, minGram, maxGram);
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
+ : base(factory, input)
+ {
+
+ init(side, minGram, maxGram);
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(TextReader input, string sideLabel, int minGram, int maxGram)
+ : this(input, Side.getSide(sideLabel), minGram, maxGram)
+ {
+
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param source {@link AttributeSource} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(AttributeSource source, TextReader input, string sideLabel, int minGram, int maxGram)
+ : this(source, input, Side.getSide(sideLabel), minGram, maxGram)
+ {
+
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram) :
+ this(factory, input, Side.getSide(sideLabel), minGram, maxGram)
+ {
+ }
+
+ private void init(Side side, int minGram, int maxGram)
+ {
+ if (side == null)
+ {
+ throw new System.ArgumentException("sideLabel must be either front or back");
+ }
+
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ this.side = side;
+
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+
+ }
+
+        /** Advances to the next token in the stream; returns false at end of input. */
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ // if we are just starting, read the whole input
+ if (!started)
+ {
+ started = true;
+ char[] chars = new char[1024];
+ inStr = input.ReadToEnd().Trim(); // remove any leading or trailing spaces
+ inLen = inStr.Length;
+ gramSize = minGram;
+ }
+
+ // if the remaining input is too short, we can't generate any n-grams
+ if (gramSize > inLen)
+ {
+ return false;
+ }
+
+ // if we have hit the end of our n-gram size range, quit
+ if (gramSize > maxGram)
+ {
+ return false;
+ }
+
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : inLen - gramSize;
+ int end = start + gramSize;
+ termAtt.SetTermBuffer(inStr, start, gramSize);
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
+ gramSize++;
+ return true;
+ }
+
+ public override void End()
+ {
+ // set offset
+ int finalOffset = inLen;
+ this.offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next(Token reusableToken)
+ {
+ return base.Next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next()
+ {
+ return base.Next();
+ }
+
+ public override void Reset(TextReader input)
+ {
+ base.Reset(input);
+ Reset();
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ started = false;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenFilter.cs?rev=1103482&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenFilter.cs Sun May 15 17:51:57 2011
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tokenizes the input into n-grams of the given size(s).
+ */
+ public class NGramTokenFilter : TokenFilter
+ {
+ public static int DEFAULT_MIN_NGRAM_SIZE = 1;
+ public static int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+ private int minGram, maxGram;
+
+ private char[] curTermBuffer;
+ private int curTermLength;
+ private int curGramSize;
+ private int curPos;
+ private int tokStart;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+ /**
+ * Creates NGramTokenFilter with given min and max n-grams.
+ * @param input {@link TokenStream} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
+ : base(input)
+ {
+
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ }
+
+ /**
+ * Creates NGramTokenFilter with default min and max n-grams.
+ * @param input {@link TokenStream} holding the input to be tokenized
+ */
+ public NGramTokenFilter(TokenStream input)
+ : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+ {
+
+ }
+
+        /** Advances to the next token in the stream; returns false at end of input. */
+ public override bool IncrementToken()
+ {
+ while (true)
+ {
+ if (curTermBuffer == null)
+ {
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
+ else
+ {
+ curTermBuffer = (char[])termAtt.TermBuffer().Clone();
+ curTermLength = termAtt.TermLength();
+ curGramSize = minGram;
+ curPos = 0;
+ tokStart = offsetAtt.StartOffset();
+ }
+ }
+ while (curGramSize <= maxGram)
+ {
+ while (curPos + curGramSize <= curTermLength)
+ { // while there is input
+ ClearAttributes();
+ termAtt.SetTermBuffer(curTermBuffer, curPos, curGramSize);
+ offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
+ curPos++;
+ return true;
+ }
+ curGramSize++; // increase n-gram size
+ curPos = 0;
+ }
+ curTermBuffer = null;
+ }
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next(Token reusableToken)
+ {
+ return base.Next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next()
+ {
+ return base.Next();
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ curTermBuffer = null;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenizer.cs?rev=1103482&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenizer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/NGram/NGramTokenizer.cs Sun May 15 17:51:57 2011
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tokenizes the input into n-grams of the given size(s).
+ */
+ public class NGramTokenizer : Tokenizer
+ {
+ public static int DEFAULT_MIN_NGRAM_SIZE = 1;
+ public static int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+ private int minGram, maxGram;
+ private int gramSize;
+ private int pos = 0;
+ private int inLen;
+ private string inStr;
+ private bool started = false;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public NGramTokenizer(TextReader input, int minGram, int maxGram)
+ : base(input)
+ {
+ init(minGram, maxGram);
+ }
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param source {@link AttributeSource} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public NGramTokenizer(AttributeSource source, TextReader input, int minGram, int maxGram)
+ : base(source, input)
+ {
+ init(minGram, maxGram);
+ }
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
+ : base(factory, input)
+ {
+ init(minGram, maxGram);
+ }
+
+ /**
+ * Creates NGramTokenizer with default min and max n-grams.
+ * @param input {@link Reader} holding the input to be tokenized
+ */
+ public NGramTokenizer(TextReader input)
+ : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+ {
+
+ }
+
+ private void init(int minGram, int maxGram)
+ {
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ }
+
+        /** Advances to the next token in the stream; returns false at end of input. */
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ if (!started)
+ {
+ started = true;
+ gramSize = minGram;
+ char[] chars = new char[1024];
+ inStr = input.ReadToEnd(); // remove any trailing empty strings
+ inLen = inStr.Length;
+ }
+
+ if (pos + gramSize > inLen)
+ { // if we hit the end of the string
+ pos = 0; // reset to beginning of string
+ gramSize++; // increase n-gram size
+ if (gramSize > maxGram) // we are done
+ return false;
+ if (pos + gramSize > inLen)
+ return false;
+ }
+
+ int oldPos = pos;
+ pos++;
+ termAtt.SetTermBuffer(inStr, oldPos, gramSize);
+ offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
+ return true;
+ }
+
+ public override void End()
+ {
+ // set offset
+ int finalOffset = inLen;
+ this.offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next(Token reusableToken)
+ {
+ return base.Next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next()
+ {
+ return base.Next();
+ }
+
+ public override void Reset(TextReader input)
+ {
+ base.Reset(input);
+ Reset();
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ started = false;
+ pos = 0;
+ }
+ }
+}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchAnalyzer.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchAnalyzer.cs Sun May 15 17:51:57 2011
@@ -1,6 +1,7 @@
using System;
using System.IO;
using System.Collections;
+using System.Collections.Generic;
using Lucene.Net.Analysis.Standard;
namespace Lucene.Net.Analysis.Nl
@@ -94,14 +95,14 @@ namespace Lucene.Net.Analysis.Nl
/// <summary>
/// Contains the stopwords used with the StopFilter.
/// </summary>
- private Hashtable stoptable = new Hashtable();
+ private ICollection<string> stoptable = new List<string>();
/// <summary>
/// Contains words that should be indexed but not stemmed.
/// </summary>
- private Hashtable excltable = new Hashtable();
+ private ICollection<string> excltable = new List<string>();
- private Hashtable _stemdict = new Hashtable();
+ private Dictionary<string,string> _stemdict = new Dictionary<string,string>();
/// <summary>
/// Builds an analyzer.
@@ -128,7 +129,7 @@ namespace Lucene.Net.Analysis.Nl
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="stopwords"></param>
- public DutchAnalyzer( Hashtable stopwords )
+ public DutchAnalyzer(ICollection<string> stopwords)
{
stoptable = stopwords;
}
@@ -155,7 +156,7 @@ namespace Lucene.Net.Analysis.Nl
/// Builds an exclusionlist from a Hashtable.
/// </summary>
/// <param name="exclusionlist"></param>
- public void SetStemExclusionTable( Hashtable exclusionlist )
+ public void SetStemExclusionTable(ICollection<string> exclusionlist)
{
excltable = exclusionlist;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemFilter.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemFilter.cs Sun May 15 17:51:57 2011
@@ -1,6 +1,7 @@
using System;
using System.IO;
using System.Collections;
+using System.Collections.Generic;
namespace Lucene.Net.Analysis.Nl
{
@@ -74,7 +75,7 @@ namespace Lucene.Net.Analysis.Nl
/// </summary>
private Token token = null;
private DutchStemmer stemmer = null;
- private Hashtable exclusions = null;
+ private ICollection<string> exclusions = null;
public DutchStemFilter( TokenStream _in ) : base(_in)
{
@@ -86,7 +87,7 @@ namespace Lucene.Net.Analysis.Nl
/// </summary>
/// <param name="_in"></param>
/// <param name="exclusiontable"></param>
- public DutchStemFilter( TokenStream _in, Hashtable exclusiontable ): this(_in)
+ public DutchStemFilter(TokenStream _in, ICollection<string> exclusiontable) : this(_in)
{
exclusions = exclusiontable;
}
@@ -97,7 +98,7 @@ namespace Lucene.Net.Analysis.Nl
/// <param name="_in"></param>
/// <param name="exclusiontable"></param>
/// <param name="stemdictionary">Dictionary of word stem pairs, that overrule the algorithm</param>
- public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary): this(_in, exclusiontable)
+ public DutchStemFilter(TokenStream _in, ICollection<string> exclusiontable, Dictionary<string,string> stemdictionary) : this(_in, exclusiontable)
{
stemmer.SetStemDictionary(stemdictionary);
}
@@ -146,7 +147,7 @@ namespace Lucene.Net.Analysis.Nl
/// Set an alternative exclusion list for this filter.
/// </summary>
/// <param name="exclusiontable"></param>
- public void SetExclusionTable( Hashtable exclusiontable )
+ public void SetExclusionTable(ICollection<string> exclusiontable)
{
exclusions = exclusiontable;
}
@@ -156,7 +157,7 @@ namespace Lucene.Net.Analysis.Nl
/// so you can correct for a particular unwanted word-stem pair.
/// </summary>
/// <param name="dict"></param>
- public void SetStemDictionary(Hashtable dict)
+ public void SetStemDictionary(Dictionary<string,string> dict)
{
if (stemmer != null)
stemmer.SetStemDictionary(dict);
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemmer.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemmer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/DutchStemmer.cs Sun May 15 17:51:57 2011
@@ -2,6 +2,7 @@ using System;
using System.IO;
using System.Text;
using System.Collections;
+using System.Collections.Generic;
namespace Lucene.Net.Analysis.Nl
{
@@ -75,7 +76,7 @@ namespace Lucene.Net.Analysis.Nl
/// </summary>
private StringBuilder sb = new StringBuilder();
private bool _removedE;
- private Hashtable _stemDict;
+ private Dictionary<string,string> _stemDict;
private int _R1;
@@ -92,7 +93,7 @@ namespace Lucene.Net.Analysis.Nl
term = term.ToLower();
if ( !IsStemmable( term ) )
return term;
- if (_stemDict != null && _stemDict.Contains(term))
+ if (_stemDict != null && _stemDict.ContainsKey(term))
return _stemDict[term] as string;
// Reset the StringBuilder.
sb.Remove(0, sb.Length);
@@ -477,7 +478,7 @@ namespace Lucene.Net.Analysis.Nl
return false;
}
- internal void SetStemDictionary(Hashtable dict)
+ internal void SetStemDictionary(Dictionary<string,string> dict)
{
_stemDict = dict;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/WordlistLoader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/WordlistLoader.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/WordlistLoader.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Nl/WordlistLoader.cs Sun May 15 17:51:57 2011
@@ -1,6 +1,7 @@
using System;
using System.IO;
using System.Collections;
+using System.Collections.Generic;
namespace Lucene.Net.Analysis.Nl
{
@@ -72,21 +73,21 @@ namespace Lucene.Net.Analysis.Nl
/// <param name="path">Path to the wordlist</param>
/// <param name="wordfile">Name of the wordlist</param>
/// <returns></returns>
- public static Hashtable GetWordtable( String path, String wordfile )
+ public static ICollection<string> GetWordtable(String path, String wordfile)
{
if ( path == null || wordfile == null )
{
- return new Hashtable();
+ return new List<string>();
}
return GetWordtable(new FileInfo(path + "\\" + wordfile));
}
/// <param name="wordfile">Complete path to the wordlist</param>
- public static Hashtable GetWordtable( String wordfile )
+ public static ICollection<string> GetWordtable(String wordfile)
{
if ( wordfile == null )
{
- return new Hashtable();
+ return new List<string>();
}
return GetWordtable( new FileInfo( wordfile ) );
}
@@ -98,13 +99,13 @@ namespace Lucene.Net.Analysis.Nl
/// </summary>
/// <param name="wordstemfile"></param>
/// <returns>Stem dictionary that overrules, the stemming algorithm</returns>
- public static Hashtable GetStemDict( FileInfo wordstemfile)
+ public static Dictionary<string,string> GetStemDict(FileInfo wordstemfile)
{
if ( wordstemfile == null )
{
- return new Hashtable();
+ return new Dictionary<string,string>();
}
- Hashtable result = new Hashtable();
+ Dictionary<string,string> result = new Dictionary<string,string>();
try
{
StreamReader lnr = new StreamReader(wordstemfile.FullName);
@@ -127,13 +128,13 @@ namespace Lucene.Net.Analysis.Nl
/// </summary>
/// <param name="wordfile">File containing the wordlist</param>
/// <returns></returns>
- public static Hashtable GetWordtable( FileInfo wordfile )
+ public static ICollection<string> GetWordtable(FileInfo wordfile)
{
if ( wordfile == null )
{
- return new Hashtable();
+ return new List<string>();
}
- Hashtable result = null;
+ ICollection<string> result = null;
try
{
StreamReader lnr = new StreamReader(wordfile.FullName);
@@ -156,7 +157,7 @@ namespace Lucene.Net.Analysis.Nl
// On error, use an empty table
catch (IOException)
{
- result = new Hashtable();
+ result = new List<string>();
}
return result;
}
@@ -167,12 +168,12 @@ namespace Lucene.Net.Analysis.Nl
/// <param name="words">Word that where read</param>
/// <param name="length">Amount of words that where read into <tt>words</tt></param>
/// <returns></returns>
- private static Hashtable MakeWordTable( String[] words, int length )
+ private static ICollection<string> MakeWordTable(String[] words, int length)
{
- Hashtable table = new Hashtable( length );
+ List<string> table = new List<string>(length);
for ( int i = 0; i < length; i++ )
{
- table.Add(words[i], words[i]);
+ table.Add(words[i]);
}
return table;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Ru/RussianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Ru/RussianAnalyzer.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Ru/RussianAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Ru/RussianAnalyzer.cs Sun May 15 17:51:57 2011
@@ -2,6 +2,7 @@ using System;
using System.Text;
using System.IO;
using System.Collections;
+using System.Collections.Generic;
using Lucene.Net.Analysis;
namespace Lucene.Net.Analysis.Ru
@@ -157,7 +158,7 @@ namespace Lucene.Net.Analysis.Ru
/// <summary>
/// Contains the stopwords used with the StopFilter.
/// </summary>
- private Hashtable stoptable = new Hashtable();
+ private ICollection<string> stoptable = new List<string>();
/// <summary>
/// Charset for Russian letters.
@@ -224,7 +225,7 @@ namespace Lucene.Net.Analysis.Ru
/// </summary>
/// <param name="charset"></param>
/// <param name="stopwords"></param>
- public RussianAnalyzer(char[] charset, Hashtable stopwords)
+ public RussianAnalyzer(char[] charset, ICollection<string> stopwords)
{
this.charset = charset;
stoptable = stopwords;
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/WordlistLoader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/WordlistLoader.cs?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/WordlistLoader.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/WordlistLoader.cs Sun May 15 17:51:57 2011
@@ -1,6 +1,7 @@
using System;
using System.IO;
using System.Collections;
+using System.Collections.Generic;
namespace Lucene.Net.Analysis
{
@@ -17,11 +18,11 @@ namespace Lucene.Net.Analysis
/// <param name="path">Path to the wordlist</param>
/// <param name="wordfile">Name of the wordlist</param>
/// <returns></returns>
- public static Hashtable GetWordtable( String path, String wordfile )
+ public static ICollection<string> GetWordtable(String path, String wordfile)
{
if ( path == null || wordfile == null )
{
- return new Hashtable();
+ return new List<string>();
}
return GetWordtable(new FileInfo(path + "\\" + wordfile));
}
@@ -31,11 +32,11 @@ namespace Lucene.Net.Analysis
/// </summary>
/// <param name="wordfile">Complete path to the wordlist</param>
/// <returns></returns>
- public static Hashtable GetWordtable( String wordfile )
+ public static ICollection<string> GetWordtable(String wordfile)
{
if ( wordfile == null )
{
- return new Hashtable();
+ return new List<string>();
}
return GetWordtable( new FileInfo( wordfile ) );
}
@@ -45,11 +46,11 @@ namespace Lucene.Net.Analysis
/// </summary>
/// <param name="wordfile">File containing the wordlist</param>
/// <returns></returns>
- public static Hashtable GetWordtable( FileInfo wordfile )
+ public static ICollection<string> GetWordtable( FileInfo wordfile )
{
if ( wordfile == null )
{
- return new Hashtable();
+ return new List<string>();
}
StreamReader lnr = new StreamReader(wordfile.FullName);
return GetWordtable(lnr);
@@ -63,23 +64,23 @@ namespace Lucene.Net.Analysis
/// </summary>
/// <param name="reader">Reader containing the wordlist</param>
/// <returns>A Hashtable with the reader's words</returns>
- public static Hashtable GetWordtable(TextReader reader)
+ public static ICollection<string> GetWordtable(TextReader reader)
{
- Hashtable result = new Hashtable();
+ ICollection<string> result = new List<string>();
try
{
- ArrayList stopWords = new ArrayList();
+ List<string> stopWords = new List<string>();
String word = null;
while ( ( word = reader.ReadLine() ) != null )
{
stopWords.Add(word.Trim());
}
- result = MakeWordTable( (String[])stopWords.ToArray(typeof(string)), stopWords.Count);
+ result = MakeWordTable(stopWords.ToArray(), stopWords.Count);
}
// On error, use an empty table
catch (IOException)
{
- result = new Hashtable();
+ result = new List<string>();
}
return result;
}
@@ -91,12 +92,12 @@ namespace Lucene.Net.Analysis
/// <param name="words">Word that where read</param>
/// <param name="length">Amount of words that where read into <tt>words</tt></param>
/// <returns></returns>
- private static Hashtable MakeWordTable( String[] words, int length )
+ private static ICollection<string> MakeWordTable( String[] words, int length )
{
- Hashtable table = new Hashtable( length );
+ List<string> table = new List<string>( length );
for ( int i = 0; i < length; i++ )
{
- table.Add(words[i], words[i]);
+ table.Add(words[i]);
}
return table;
}
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj?rev=1103482&r1=1103481&r2=1103482&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj Sun May 15 17:51:57 2011
@@ -9,7 +9,7 @@
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Lucene.Net.Analyzers</RootNamespace>
- <AssemblyName>Lucene.Net.Contrib.Analyzers.Test</AssemblyName>
+ <AssemblyName>Lucene.Net.Analyzers.Test</AssemblyName>
<TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<FileUpgradeFlags>
@@ -60,6 +60,10 @@
<Compile Include="AR\TestArabicAnalyzer.cs" />
<Compile Include="AR\TestArabicNormalizationFilter.cs" />
<Compile Include="AR\TestArabicStemFilter.cs" />
+ <Compile Include="NGram\TestEdgeNGramTokenFilter.cs" />
+ <Compile Include="NGram\TestEdgeNGramTokenizer.cs" />
+ <Compile Include="NGram\TestNGramTokenFilter.cs" />
+ <Compile Include="NGram\TestNGramTokenizer.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs?rev=1103482&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs Sun May 15 17:51:57 2011
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tests {@link EdgeNGramTokenFilter} for correctness.
+ */
+ [TestFixture]
+ public class TestEdgeNGramTokenFilter : BaseTokenStreamTestCase
+ {
+ private TokenStream input;
+
+ [SetUp]
+ public void SetUp()
+ {
+ base.SetUp();
+ input = new WhitespaceTokenizer(new StringReader("abcde"));
+ }
+
+ [Test]
+ public void TestInvalidInput()
+ {
+ bool gotException = false;
+ try
+ {
+ new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 0, 0);
+ }
+ catch (System.ArgumentException e)
+ {
+ gotException = true;
+ }
+ Assert.IsTrue(gotException);
+ }
+
+ [Test]
+ public void TestInvalidInput2()
+ {
+ bool gotException = false;
+ try
+ {
+ new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 2, 1);
+ }
+ catch (System.ArgumentException e)
+ {
+ gotException = true;
+ }
+ Assert.IsTrue(gotException);
+ }
+
+ [Test]
+ public void TestInvalidInput3()
+ {
+ bool gotException = false;
+ try
+ {
+ new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, -1, 2);
+ }
+ catch (System.ArgumentException e)
+ {
+ gotException = true;
+ }
+ Assert.IsTrue(gotException);
+ }
+
+ [Test]
+ public void TestFrontUnigram()
+ {
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
+ AssertTokenStreamContents(tokenizer, new String[] { "a" }, new int[] { 0 }, new int[] { 1 });
+ }
+
+ [Test]
+ public void TestBackUnigram()
+ {
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
+ AssertTokenStreamContents(tokenizer, new String[] { "e" }, new int[] { 4 }, new int[] { 5 });
+ }
+
+ [Test]
+ public void TestOversizedNgrams()
+ {
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
+ AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
+ }
+
+ [Test]
+ public void TestFrontRangeOfNgrams()
+ {
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+ AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
+ }
+
+ [Test]
+ public void TestBackRangeOfNgrams()
+ {
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
+ AssertTokenStreamContents(tokenizer, new String[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 });
+ }
+
+ [Test]
+ public void TestSmallTokenInStream()
+ {
+ input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
+ AssertTokenStreamContents(tokenizer, new String[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 });
+ }
+
+ [Test]
+ public void TestReset()
+ {
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
+ EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+ AssertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
+ tokenizer.Reset(new StringReader("abcde"));
+ AssertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs?rev=1103482&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs Sun May 15 17:51:57 2011
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tests {@link EdgeNGramTokenizer} for correctness.
+ */
+ [TestFixture]
+ public class TestEdgeNGramTokenizer : BaseTokenStreamTestCase
+ {
+ private StringReader input;
+
+ [SetUp]
+ public void SetUp()
+ {
+ base.SetUp();
+ input = new StringReader("abcde");
+ }
+
+ [Test]
+ public void TestInvalidInput()
+ {
+ bool gotException = false;
+ try
+ {
+ new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0, 0);
+ }
+ catch (System.ArgumentException e)
+ {
+ gotException = true;
+ }
+ Assert.IsTrue(gotException);
+ }
+
+ [Test]
+ public void TestInvalidInput2()
+ {
+ bool gotException = false;
+ try
+ {
+ new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 2, 1);
+ }
+ catch (System.ArgumentException e)
+ {
+ gotException = true;
+ }
+ Assert.IsTrue(gotException);
+ }
+
+ [Test]
+ public void TestInvalidInput3()
+ {
+ bool gotException = false;
+ try
+ {
+ new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1, 2);
+ }
+ catch (System.ArgumentException e)
+ {
+ gotException = true;
+ }
+ Assert.IsTrue(gotException);
+ }
+
+ [Test]
+ public void TestFrontUnigram()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
+ AssertTokenStreamContents(tokenizer, new String[] { "a" }, new int[] { 0 }, new int[] { 1 }, 5 /* abcde */);
+ }
+
+ [Test]
+ public void TestBackUnigram()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
+ AssertTokenStreamContents(tokenizer, new String[] { "e" }, new int[] { 4 }, new int[] { 5 }, 5 /* abcde */);
+ }
+
+ [Test]
+ public void TestOversizedNgrams()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
+ AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
+ }
+
+ [Test]
+ public void TestFrontRangeOfNgrams()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+ AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5 /* abcde */);
+ }
+
+ [Test]
+ public void TestBackRangeOfNgrams()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
+ AssertTokenStreamContents(tokenizer, new String[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, 5 /* abcde */);
+ }
+
+ [Test]
+ public void TestReset()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+ AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5 /* abcde */);
+ tokenizer.Reset(new StringReader("abcde"));
+ AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5 /* abcde */);
+ }
+ }
+}
\ No newline at end of file