You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2012/02/28 23:43:28 UTC
[Lucene.Net] svn commit: r1294875 [2/45] - in /incubator/lucene.net/trunk: ./ build/
build/vs2010/contrib/ build/vs2010/test/ doc/ src/ src/contrib/Analyzers/
src/contrib/Analyzers/AR/ src/contrib/Analyzers/BR/
src/contrib/Analyzers/CJK/ src/contrib/Analyzers/Cn/ s...
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -15,11 +15,14 @@
* limitations under the License.
*/
+using System;
using System.Collections;
-
+using System.Collections.Generic;
+using System.Linq;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System.IO;
+using Version = Lucene.Net.Util.Version;
/**
* Analyzer for Brazilian language. Supports an external list of stopwords (words that
@@ -31,110 +34,216 @@ namespace Lucene.Net.Analysis.BR
{
public sealed class BrazilianAnalyzer : Analyzer
{
-
/**
* List of typical Brazilian stopwords.
*/
+ //TODO: Make this private in 3.1
public static string[] BRAZILIAN_STOP_WORDS = {
- "a","ainda","alem","ambas","ambos","antes",
- "ao","aonde","aos","apos","aquele","aqueles",
- "as","assim","com","como","contra","contudo",
- "cuja","cujas","cujo","cujos","da","das","de",
- "dela","dele","deles","demais","depois","desde",
- "desta","deste","dispoe","dispoem","diversa",
- "diversas","diversos","do","dos","durante","e",
- "ela","elas","ele","eles","em","entao","entre",
- "essa","essas","esse","esses","esta","estas",
- "este","estes","ha","isso","isto","logo","mais",
- "mas","mediante","menos","mesma","mesmas","mesmo",
- "mesmos","na","nas","nao","nas","nem","nesse","neste",
- "nos","o","os","ou","outra","outras","outro","outros",
- "pelas","pelas","pelo","pelos","perante","pois","por",
- "porque","portanto","proprio","propios","quais","qual",
- "qualquer","quando","quanto","que","quem","quer","se",
- "seja","sem","sendo","seu","seus","sob","sobre","sua",
- "suas","tal","tambem","teu","teus","toda","todas","todo",
- "todos","tua","tuas","tudo","um","uma","umas","uns"};
+ "a", "ainda", "alem", "ambas", "ambos", "antes",
+ "ao", "aonde", "aos", "apos", "aquele", "aqueles",
+ "as", "assim", "com", "como", "contra", "contudo",
+ "cuja", "cujas", "cujo", "cujos", "da", "das", "de",
+ "dela", "dele", "deles", "demais", "depois", "desde",
+ "desta", "deste", "dispoe", "dispoem", "diversa",
+ "diversas", "diversos", "do", "dos", "durante", "e",
+ "ela", "elas", "ele", "eles", "em", "entao", "entre",
+ "essa", "essas", "esse", "esses", "esta", "estas",
+ "este", "estes", "ha", "isso", "isto", "logo", "mais",
+ "mas", "mediante", "menos", "mesma", "mesmas", "mesmo",
+ "mesmos", "na", "nas", "nao", "nas", "nem", "nesse", "neste",
+ "nos", "o", "os", "ou", "outra", "outras", "outro", "outros",
+ "pelas", "pelas", "pelo", "pelos", "perante", "pois", "por",
+ "porque", "portanto", "proprio", "propios", "quais", "qual",
+ "qualquer", "quando", "quanto", "que", "quem", "quer", "se",
+ "seja", "sem", "sendo", "seu", "seus", "sob", "sobre", "sua",
+ "suas", "tal", "tambem", "teu", "teus", "toda", "todas",
+ "todo",
+ "todos", "tua", "tuas", "tudo", "um", "uma", "umas", "uns"
+ };
+ /// <summary>
+ /// Returns an unmodifiable instance of the default stop-words set.
+ /// </summary>
+ /// <returns>Returns an unmodifiable instance of the default stop-words set.</returns>
+ public static ISet<string> GetDefaultStopSet()
+ {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
- /**
- * Contains the stopwords used with the StopFilter.
- */
- private Hashtable stoptable = new Hashtable();
+ private static class DefaultSetHolder
+ {
+ internal static ISet<string> DEFAULT_STOP_SET =
+ CharArraySet.UnmodifiableSet(new CharArraySet(BRAZILIAN_STOP_WORDS, false));
+ }
+
+ /// <summary>
+ /// Contains the stopwords used with the StopFilter.
+ /// </summary>
+ private ISet<string> stoptable = new HashSet<string>();
+
+ private readonly Version matchVersion;
+
+ // TODO: make this private in 3.1
+ /// <summary>
+ /// Contains words that should be indexed but not stemmed.
+ /// </summary>
+ private ISet<string> excltable = new HashSet<string>();
+
+ public BrazilianAnalyzer(Version matchVersion)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+ {
+ }
/**
- * Contains words that should be indexed but not stemmed.
- */
- private Hashtable excltable = new Hashtable();
+ * Builds an analyzer with the given stop words
+ *
+ * @param matchVersion
+ * lucene compatibility version
+ * @param stopwords
+ * a stopword set
+ */
+
+ public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords)
+ {
+ stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+ this.matchVersion = matchVersion;
+ }
/**
- * Builds an analyzer with the default stop words (<see cref="BRAZILIAN_STOP_WORDS"/>).
+ * Builds an analyzer with the given stop words and stemming exclusion words
+ *
+ * @param matchVersion
+ * lucene compatibility version
+ * @param stopwords
+ * a stopword set
+ * @param stemExclusionSet
+ * a set of words that should be indexed but not stemmed
*/
- public BrazilianAnalyzer()
+
+ public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords,
+ ISet<string> stemExclusionSet)
+ : this(matchVersion, stopwords)
{
- stoptable = StopFilter.MakeStopSet(BRAZILIAN_STOP_WORDS);
+
+ excltable = CharArraySet.UnmodifiableSet(CharArraySet
+ .Copy(stemExclusionSet));
}
/**
* Builds an analyzer with the given stop words.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
*/
- public BrazilianAnalyzer(string[] stopwords)
+
+ public BrazilianAnalyzer(Version matchVersion, params string[] stopwords)
+ : this(matchVersion, StopFilter.MakeStopSet(stopwords))
{
- stoptable = StopFilter.MakeStopSet(stopwords);
+
}
/**
- * Builds an analyzer with the given stop words.
- */
- public BrazilianAnalyzer(Hashtable stopwords)
+ * Builds an analyzer with the given stop words.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
+ */
+
+ public BrazilianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+ : this(matchVersion, stopwords.Keys.ToArray())
{
- stoptable = stopwords;
+
}
/**
- * Builds an analyzer with the given stop words.
- */
- public BrazilianAnalyzer(FileInfo stopwords)
+ * Builds an analyzer with the given stop words.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
+ */
+
+ public BrazilianAnalyzer(Version matchVersion, FileInfo stopwords)
+ : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
{
- stoptable = WordlistLoader.GetWordtable(stopwords);
}
/**
* Builds an exclusionlist from an array of Strings.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
*/
- public void SetStemExclusionTable(string[] exclusionlist)
+
+ public void SetStemExclusionTable(params string[] exclusionlist)
{
excltable = StopFilter.MakeStopSet(exclusionlist);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
}
+
/**
- * Builds an exclusionlist from a Hashtable.
+ * Builds an exclusion list from the keys of the given dictionary.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
*/
- public void SetStemExclusionTable(Hashtable exclusionlist)
+
+ public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
{
- excltable = exclusionlist;
+ excltable = new HashSet<string>(exclusionlist.Keys);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
}
+
/**
* Builds an exclusionlist from the words contained in the given file.
+ * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
*/
+
public void SetStemExclusionTable(FileInfo exclusionlist)
{
- excltable = WordlistLoader.GetWordtable(exclusionlist);
+ excltable = WordlistLoader.GetWordSet(exclusionlist);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
}
/**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
- * <returns>A TokenStream build from a StandardTokenizer filtered with
- * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.</returns>
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
+ * {@link BrazilianStemFilter}.
*/
- public override TokenStream TokenStream(string fieldName, TextReader reader)
+ public override TokenStream TokenStream(String fieldName, TextReader reader)
{
- TokenStream result = new StandardTokenizer(reader);
+ TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new LowerCaseFilter(result);
result = new StandardFilter(result);
- result = new StopFilter(result, stoptable);
+ result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ result, stoptable);
result = new BrazilianStemFilter(result, excltable);
return result;
}
+
+ private class SavedStreams
+ {
+ protected internal Tokenizer source;
+ protected internal TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+ * in the provided {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
+ * {@link BrazilianStemFilter}.
+ */
+
+ public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ {
+ SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+ if (streams == null)
+ {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(matchVersion, reader);
+ streams.result = new LowerCaseFilter(streams.source);
+ streams.result = new StandardFilter(streams.result);
+ streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ streams.result, stoptable);
+ streams.result = new BrazilianStemFilter(streams.result, excltable);
+ SetPreviousTokenStream(streams);
+ }
+ else
+ {
+ streams.source.Reset(reader);
+ }
+ return streams.result;
+ }
}
}
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs Tue Feb 28 22:43:08 2012
@@ -15,8 +15,11 @@
* limitations under the License.
*/
+using System.Collections.Generic;
using Lucene.Net.Analysis;
using System.Collections;
+using Lucene.Net.Analysis.Tokenattributes;
+using Version = Lucene.Net.Util.Version;
/**
@@ -33,15 +36,17 @@ namespace Lucene.Net.Analysis.BR
* The actual token in the input stream.
*/
private BrazilianStemmer stemmer = null;
- private Hashtable exclusions = null;
+ private ISet<string> exclusions = null;
+ private TermAttribute termAtt;
public BrazilianStemFilter(TokenStream input)
: base(input)
{
- stemmer = new BrazilianStemmer();
+ stemmer = new BrazilianStemmer();
+ termAtt = AddAttribute<TermAttribute>();
}
- public BrazilianStemFilter(TokenStream input, Hashtable exclusiontable)
+ public BrazilianStemFilter(TokenStream input, ISet<string> exclusiontable)
: this(input)
{
this.exclusions = exclusiontable;
@@ -50,25 +55,25 @@ namespace Lucene.Net.Analysis.BR
/**
* <returns>true if the input stream produced another token, false at end of stream.</returns>
*/
- public override Token Next(Token reusableToken)
+ public override bool IncrementToken()
{
- System.Diagnostics.Trace.Assert(reusableToken != null);
-
- Token nextToken = input.Next(reusableToken);
- if (nextToken == null)
- return null;
-
- string term = nextToken.TermText();
-
- // Check the exclusion table.
- if (exclusions == null || !exclusions.Contains(term))
+ if (input.IncrementToken())
+ {
+ string term = termAtt.Term();
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.Contains(term))
+ {
+ string s = stemmer.Stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.Equals(term))
+ termAtt.SetTermBuffer(s);
+ }
+ return true;
+ }
+ else
{
- string s = stemmer.Stem(term);
- // If not stemmed, don't waste the time adjusting the token.
- if ((s != null) && !s.Equals(term))
- nextToken.SetTermBuffer(s.ToCharArray(), 0, s.Length);//was SetTermBuffer(s)
+ return false;
}
- return nextToken;
}
}
-}
\ No newline at end of file
+}
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -20,130 +20,135 @@
*/
using System;
+using System.Collections.Generic;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis;
+using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.CJK
{
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2004 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- *
- * $Id: CJKAnalyzer.java,v 1.5 2004/10/17 11:41:41 dnaber Exp $
- */
-
- /// <summary>
- /// Filters CJKTokenizer with StopFilter.
- ///
- /// <author>Che, Dong</author>
- /// </summary>
- public class CJKAnalyzer : Analyzer
- {
- //~ Static fields/initializers ---------------------------------------------
-
- /// <summary>
- /// An array containing some common English words that are not usually
- /// useful for searching. and some double-byte interpunctions.....
- /// </summary>
- public static String[] stopWords =
- {
- "a", "and", "are", "as", "at", "be",
- "but", "by", "for", "if", "in",
- "into", "is", "it", "no", "not",
- "of", "on", "or", "s", "such", "t",
- "that", "the", "their", "then",
- "there", "these", "they", "this",
- "to", "was", "will", "with", "",
- "www"
- };
-
- //~ Instance fields --------------------------------------------------------
-
- /// <summary>
- /// stop word list
- /// </summary>
- private Hashtable stopTable;
-
- //~ Constructors -----------------------------------------------------------
-
- /// <summary>
- /// Builds an analyzer which removes words in STOP_WORDS.
- /// </summary>
- public CJKAnalyzer()
- {
- stopTable = StopFilter.MakeStopSet(stopWords);
- }
-
- /// <summary>
- /// Builds an analyzer which removes words in the provided array.
- /// </summary>
- /// <param name="stopWords">stop word array</param>
- public CJKAnalyzer(String[] stopWords)
- {
- stopTable = StopFilter.MakeStopSet(stopWords);
- }
-
- //~ Methods ----------------------------------------------------------------
-
- /// <summary>
- /// get token stream from input
- /// </summary>
- /// <param name="fieldName">lucene field name</param>
- /// <param name="reader">input reader</param>
- /// <returns>Token Stream</returns>
- public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
- {
- return new StopFilter(new CJKTokenizer(reader), stopTable);
- }
- }
+ /// <summary>
+ /// Filters CJKTokenizer with StopFilter.
+ ///
+ /// <author>Che, Dong</author>
+ /// </summary>
+ public class CJKAnalyzer : Analyzer
+ {
+ //~ Static fields/initializers ---------------------------------------------
+
+ /// <summary>
+ /// An array containing some common English words that are not usually
+ /// useful for searching. and some double-byte interpunctions.....
+ /// </summary>
+ // TODO make this final in 3.1 -
+ // this might be revised and merged with StopFilter stop words too
+ [Obsolete("use GetDefaultStopSet() instead")] public static String[] STOP_WORDS =
+ {
+ "a", "and", "are", "as", "at", "be",
+ "but", "by", "for", "if", "in",
+ "into", "is", "it", "no", "not",
+ "of", "on", "or", "s", "such", "t",
+ "that", "the", "their", "then",
+ "there", "these", "they", "this",
+ "to", "was", "will", "with", "",
+ "www"
+ };
+
+ //~ Instance fields --------------------------------------------------------
+
+ /// <summary>
+ /// Returns an unmodifiable instance of the default stop-words set.
+ /// </summary>
+ /// <returns>Returns an unmodifiable instance of the default stop-words set.</returns>
+ public static ISet<string> GetDefaultStopSet()
+ {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ private static class DefaultSetHolder
+ {
+ internal static ISet<string> DEFAULT_STOP_SET =
+ CharArraySet.UnmodifiableSet(new CharArraySet(STOP_WORDS, false));
+ }
+
+ /// <summary>
+ /// stop word list
+ /// </summary>
+ private ISet<string> stopTable;
+
+ private readonly Version matchVersion;
+
+ //~ Constructors -----------------------------------------------------------
+
+ public CJKAnalyzer(Version matchVersion)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+ {
+
+ }
+
+ public CJKAnalyzer(Version matchVersion, ISet<string> stopWords)
+ {
+ stopTable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopWords));
+ this.matchVersion = matchVersion;
+ }
+
+ /// <summary>
+ /// Builds an analyzer which removes words in the provided array.
+ /// </summary>
+ /// <param name="stopWords">stop word array</param>
+ public CJKAnalyzer(Version matchVersion, params string[] stopWords)
+ {
+ stopTable = StopFilter.MakeStopSet(stopWords);
+ this.matchVersion = matchVersion;
+ }
+
+ //~ Methods ----------------------------------------------------------------
+
+ /// <summary>
+ /// get token stream from input
+ /// </summary>
+ /// <param name="fieldName">lucene field name</param>
+ /// <param name="reader">input reader</param>
+ /// <returns>Token Stream</returns>
+ public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ new CJKTokenizer(reader), stopTable);
+ }
+
+ private class SavedStreams
+ {
+ protected internal Tokenizer source;
+ protected internal TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+ * in the provided {@link Reader}.
+ *
+ * @param fieldName lucene field name
+ * @param reader Input {@link Reader}
+ * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
+ * {@link StopFilter}
+ */
+ public override sealed TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ {
+ /* tokenStream() is final, no back compat issue */
+ SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+ if (streams == null)
+ {
+ streams = new SavedStreams();
+ streams.source = new CJKTokenizer(reader);
+ streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ streams.source, stopTable);
+ SetPreviousTokenStream(streams);
+ }
+ else
+ {
+ streams.source.Reset(reader);
+ }
+ return streams.result;
+ }
+ }
}
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs Tue Feb 28 22:43:08 2012
@@ -20,331 +20,380 @@
*/
using System;
+using System.Globalization;
using System.IO;
using System.Text;
+using System.Text.RegularExpressions;
using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
namespace Lucene.Net.Analysis.CJK
{
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2004 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// <p>
- /// CJKTokenizer was modified from StopTokenizer which does a decent job for
- /// most European languages. and it perferm other token method for double-byte
- /// Characters: the token will return at each two charactors with overlap match.<br/>
- /// Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
- /// also need filter filter zero length token ""<br/>
- /// for Digit: digit, '+', '#' will token as letter<br/>
- /// for more info on Asia language(Chinese Japanese Korean) text segmentation:
- /// please search <a
- /// href="http://www.google.com/search?q=word+chinese+segment">google</a>
- /// </p>
- ///
- /// @author Che, Dong
- /// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
- /// </summary>
- public sealed class CJKTokenizer : Tokenizer
- {
- //~ Static fields/initializers ---------------------------------------------
-
- /// <summary>
- /// Max word length
- /// </summary>
- private static int MAX_WORD_LEN = 255;
-
- /// <summary>
- /// buffer size
- /// </summary>
- private static int IO_BUFFER_SIZE = 256;
-
- //~ Instance fields --------------------------------------------------------
-
- /// <summary>
- /// word offset, used to imply which character(in ) is parsed
- /// </summary>
- private int offset = 0;
-
- /// <summary>
- /// the index used only for ioBuffer
- /// </summary>
- private int bufferIndex = 0;
-
- /// <summary>
- /// data length
- /// </summary>
- private int dataLen = 0;
-
- /// <summary>
- /// character buffer, store the characters which are used to compose <br/>
- /// the returned Token
- /// </summary>
- private char[] buffer = new char[MAX_WORD_LEN];
-
- /// <summary>
- /// I/O buffer, used to store the content of the input(one of the <br/>
- /// members of Tokenizer)
- /// </summary>
- private char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
- /// <summary>
- /// word type: single=>ASCII double=>non-ASCII word=>default
- /// </summary>
- private String tokenType = "word";
-
- /// <summary>
- /// tag: previous character is a cached double-byte character "C1C2C3C4"
- /// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
- /// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
- /// </summary>
- private bool preIsTokened = false;
-
- //~ Constructors -----------------------------------------------------------
-
- /// <summary>
- /// Construct a token stream processing the given input.
- /// </summary>
- /// <param name="_in">I/O reader</param>
- public CJKTokenizer(TextReader _in)
- {
- input = _in;
- }
-
- //~ Methods ----------------------------------------------------------------
-
- /// <summary>
- /// Returns the next token in the stream, or null at EOS.
- /// </summary>
- /// <returns>Token</returns>
- public override Token Next()
- {
- /** how many character(s) has been stored in buffer */
- int length = 0;
-
- /** the position used to create Token */
- int start = offset;
-
- while (true)
- {
- /** current charactor */
- char c;
-
- /** unicode block of current charactor for detail */
- //Character.UnicodeBlock ub;
-
- offset++;
-
- if (bufferIndex >= dataLen)
- {
- dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
- bufferIndex = 0;
- }
-
- if (dataLen == 0)
- {
- if (length > 0)
- {
- if (preIsTokened == true)
- {
- length = 0;
- preIsTokened = false;
- }
-
- break;
- }
- else
- {
- return null;
- }
- }
- else
- {
- //get current character
- c = ioBuffer[bufferIndex++];
-
- //get the UnicodeBlock of the current character
- //ub = Character.UnicodeBlock.of(c);
- }
-
- //if the current character is ASCII or Extend ASCII
- if (('\u0000' <= c && c <= '\u007F') ||
- ('\uFF00' <= c && c <= '\uFFEF'))
- {
- if ('\uFF00' <= c && c <= '\uFFEF')
- {
- /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
- int i = (int) c;
- i = i - 65248;
- c = (char) i;
- }
-
- // if the current character is a letter or "_" "+" "#"
- if (Char.IsLetterOrDigit(c)
- || ((c == '_') || (c == '+') || (c == '#'))
- )
- {
- if (length == 0)
- {
- // "javaC1C2C3C4linux" <br/>
- // ^--: the current character begin to token the ASCII
- // letter
- start = offset - 1;
- }
- else if (tokenType == "double")
- {
- // "javaC1C2C3C4linux" <br/>
- // ^--: the previous non-ASCII
- // : the current character
- offset--;
- bufferIndex--;
- tokenType = "single";
-
- if (preIsTokened == true)
- {
- // there is only one non-ASCII has been stored
- length = 0;
- preIsTokened = false;
-
- break;
- }
- else
- {
- break;
- }
- }
-
- // store the LowerCase(c) in the buffer
- buffer[length++] = Char.ToLower(c);
- tokenType = "single";
-
- // break the procedure if buffer overflowed!
- if (length == MAX_WORD_LEN)
- {
- break;
- }
- }
- else if (length > 0)
- {
- if (preIsTokened == true)
- {
- length = 0;
- preIsTokened = false;
- }
- else
- {
- break;
- }
- }
- }
- else
- {
- // non-ASCII letter, eg."C1C2C3C4"
- if (Char.IsLetter(c))
- {
- if (length == 0)
- {
- start = offset - 1;
- buffer[length++] = c;
- tokenType = "double";
- }
- else
- {
- if (tokenType == "single")
- {
- offset--;
- bufferIndex--;
-
- //return the previous ASCII characters
- break;
- }
- else
- {
- buffer[length++] = c;
- tokenType = "double";
-
- if (length == 2)
- {
- offset--;
- bufferIndex--;
- preIsTokened = true;
-
- break;
- }
- }
- }
- }
- else if (length > 0)
- {
- if (preIsTokened == true)
- {
- // empty the buffer
- length = 0;
- preIsTokened = false;
- }
- else
- {
- break;
- }
- }
- }
- }
-
- return new Token(new String(buffer, 0, length), start, start + length,
- tokenType
- );
- }
- }
-
+ /// <summary>
+ /// <p>
+ /// CJKTokenizer was modified from StopTokenizer, which does a decent job for
+ /// most European languages. It performs a different tokenization method for double-byte
+ /// chars: a token is returned for every two characters, with overlapping matches.<br/>
+ /// Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4". It
+ /// also needs to filter out the zero-length token ""<br/>
+ /// for digits: digit, '+', '#' will be tokenized as letters<br/>
+ /// for more info on Asia language(Chinese Japanese Korean) text segmentation:
+ /// please search <a
+ /// href="http://www.google.com/search?q=word+chinese+segment">google</a>
+ /// </p>
+ ///
+ /// @author Che, Dong
+ /// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
+ /// </summary>
+ public sealed class CJKTokenizer : Tokenizer
+ {
+ //~ Static fields/initializers ---------------------------------------------
+ /// <summary>
+ /// Word token type
+ /// </summary>
+ internal static readonly int WORD_TYPE = 0;
+
+ /// <summary>
+ /// Single byte token type
+ /// </summary>
+ internal static readonly int SINGLE_TOKEN_TYPE = 1;
+
+ /// <summary>
+ /// Double byte token type
+ /// </summary>
+ internal static readonly int DOUBLE_TOKEN_TYPE = 2;
+
+ /// <summary>
+ /// Names for token types
+ /// </summary>
+ internal static readonly String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
+
+ /// <summary>
+ /// Max word length
+ /// </summary>
+ internal static readonly int MAX_WORD_LEN = 255;
+
+ /// <summary>
+ /// buffer size
+ /// </summary>
+ internal static readonly int IO_BUFFER_SIZE = 256;
+
+ //~ Instance fields --------------------------------------------------------
+
+ /// <summary>
+ /// word offset, used to imply which character(in ) is parsed
+ /// </summary>
+ private int offset = 0;
+
+ /// <summary>
+ /// the index used only for ioBuffer
+ /// </summary>
+ private int bufferIndex = 0;
+
+ /// <summary>
+ /// data length
+ /// </summary>
+ private int dataLen = 0;
+
+ /// <summary>
+ /// character buffer, store the characters which are used to compose <br/>
+ /// the returned Token
+ /// </summary>
+ private char[] buffer = new char[MAX_WORD_LEN];
+
+ /// <summary>
+ /// I/O buffer, used to store the content of the input(one of the <br/>
+ /// members of Tokenizer)
+ /// </summary>
+ private char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ /// <summary>
+ /// word type: single=>ASCII double=>non-ASCII word=>default
+ /// </summary>
+ private int tokenType = WORD_TYPE;
+
+ /// <summary>
+ /// tag: previous character is a cached double-byte character "C1C2C3C4"
+ /// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+ /// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
+ /// </summary>
+ private bool preIsTokened = false;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private TypeAttribute typeAtt;
+
+ //~ Constructors -----------------------------------------------------------
+
+ /// <summary>
+ /// Construct a token stream processing the given input.
+ /// </summary>
+ /// <param name="_in">I/O reader</param>
+ public CJKTokenizer(TextReader _in)
+ : base(_in)
+ {
+ Init();
+ }
+
+ public CJKTokenizer(AttributeSource source, TextReader _in)
+ : base(source, _in)
+ {
+ Init();
+ }
+
+ public CJKTokenizer(AttributeFactory factory, TextReader _in)
+ : base(factory, _in)
+ {
+ Init();
+ }
+
+ private void Init()
+ {
+ termAtt = AddAttribute<TermAttribute>();
+ offsetAtt = AddAttribute<OffsetAttribute>();
+ typeAtt = AddAttribute<TypeAttribute>();
+ }
+
+ //~ Methods ----------------------------------------------------------------
+
+ /**
+ * Returns true for the next token in the stream, or false at EOS.
+ * See http://java.sun.com/j2se/1.3/docs/api/java/lang/char.UnicodeBlock.html
+ * for detail.
+ *
+ * @return false for end of stream, true otherwise
+ *
+ * @throws java.io.IOException - throw IOException when read error <br>
+ * happened in the InputStream
+ *
+ */
+
+ Regex isBasicLatin = new Regex(@"\p{IsBasicLatin}", RegexOptions.Compiled);
+ Regex isHalfWidthAndFullWidthForms = new Regex(@"\p{IsHalfwidthandFullwidthForms}", RegexOptions.Compiled);
+
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ /** how many character(s) has been stored in buffer */
+
+ while (true)
+ {
+ // loop until we find a non-empty token
+
+ int length = 0;
+
+ /** the position used to create Token */
+ int start = offset;
+
+ while (true)
+ {
+ // loop until we've found a full token
+ /** current character */
+ char c;
+
+ offset++;
+
+ if (bufferIndex >= dataLen)
+ {
+ dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+ bufferIndex = 0;
+ }
+
+ if (dataLen == 0) // input.Read returns 0 when its empty, not -1, as in java
+ {
+ if (length > 0)
+ {
+ if (preIsTokened == true)
+ {
+ length = 0;
+ preIsTokened = false;
+ }
+ else
+ {
+ offset--;
+ }
+
+ break;
+ }
+ else
+ {
+ offset--;
+ return false;
+ }
+ }
+ else
+ {
+ //get current character
+ c = ioBuffer[bufferIndex++];
+ }
+
+ //TODO: Using a Regex to determine the UnicodeCategory is probably slower than
+ // If we just created a small class that would look it up for us, which
+ // would likely be trivial, however time-consuming. I can't imagine a Regex
+ // being fast for this, considering we have to pull a char from the buffer,
+ // and convert it to a string before we run a regex on it. - cc
+ bool isHalfFullForm = isHalfWidthAndFullWidthForms.Match(c.ToString()).Success;
+ //if the current character is ASCII or Extend ASCII
+ if ((isBasicLatin.Match(c.ToString()).Success) || (isHalfFullForm))
+ {
+ if (isHalfFullForm)
+ {
+ int i = (int) c;
+ if (i >= 65281 && i <= 65374)
+ {
+ // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
+ i = i - 65248;
+ c = (char) i;
+ }
+ }
+
+ // if the current character is a letter or "_" "+" "#"
+ if (char.IsLetterOrDigit(c)
+ || ((c == '_') || (c == '+') || (c == '#'))
+ )
+ {
+ if (length == 0)
+ {
+ // "javaC1C2C3C4linux" <br>
+ // ^--: the current character begin to token the ASCII
+ // letter
+ start = offset - 1;
+ }
+ else if (tokenType == DOUBLE_TOKEN_TYPE)
+ {
+ // "javaC1C2C3C4linux" <br>
+ // ^--: the previous non-ASCII
+ // : the current character
+ offset--;
+ bufferIndex--;
+
+ if (preIsTokened == true)
+ {
+ // there is only one non-ASCII has been stored
+ length = 0;
+ preIsTokened = false;
+ break;
+ }
+ else
+ {
+ break;
+ }
+ }
+
+ // store the LowerCase(c) in the buffer
+ buffer[length++] = char.ToLower(c); // TODO: is java invariant? If so, this should be ToLowerInvariant()
+ tokenType = SINGLE_TOKEN_TYPE;
+
+ // break the procedure if buffer overflowed!
+ if (length == MAX_WORD_LEN)
+ {
+ break;
+ }
+ }
+ else if (length > 0)
+ {
+ if (preIsTokened)
+ {
+ length = 0;
+ preIsTokened = false;
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+ else
+ {
+ // non-ASCII letter, e.g."C1C2C3C4"
+ if (char.IsLetter(c))
+ {
+ if (length == 0)
+ {
+ start = offset - 1;
+ buffer[length++] = c;
+ tokenType = DOUBLE_TOKEN_TYPE;
+ }
+ else
+ {
+ if (tokenType == SINGLE_TOKEN_TYPE)
+ {
+ offset--;
+ bufferIndex--;
+
+ //return the previous ASCII characters
+ break;
+ }
+ else
+ {
+ buffer[length++] = c;
+ tokenType = DOUBLE_TOKEN_TYPE;
+
+ if (length == 2)
+ {
+ offset--;
+ bufferIndex--;
+ preIsTokened = true;
+
+ break;
+ }
+ }
+ }
+ }
+ else if (length > 0)
+ {
+ if (preIsTokened == true)
+ {
+ // empty the buffer
+ length = 0;
+ preIsTokened = false;
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+ }
+
+ if (length > 0)
+ {
+ termAtt.SetTermBuffer(buffer, 0, length);
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
+ typeAtt.SetType(TOKEN_TYPE_NAMES[tokenType]);
+ return true;
+ }
+ else if (dataLen == 0)
+ {
+ offset--;
+ return false;
+ }
+
+ // Cycle back and try for the next token (don't
+ // return an empty string)
+ }
+ }
+
+ public override void End()
+ {
+ // set final offset
+ int finalOffset = CorrectOffset(offset);
+ this.offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ offset = bufferIndex = dataLen = 0;
+ preIsTokened = false;
+ tokenType = WORD_TYPE;
+ }
+
+ public override void Reset(TextReader reader)
+ {
+ base.Reset(reader);
+ Reset();
+ }
+ }
}
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -28,86 +28,58 @@ using Lucene.Net.Analysis;
namespace Lucene.Net.Analysis.Cn
{
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2004 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
+ /// <summary>
+ /// An <see cref="Analyzer"/> that tokenizes text with <see cref="ChineseTokenizer"/> and
+ /// filters with <see cref="ChineseFilter"/>
+ /// </summary>
+ public class ChineseAnalyzer : Analyzer
+ {
- /// <summary>
- /// Title: ChineseAnalyzer
- /// Description:
- /// Subclass of org.apache.lucene.analysis.Analyzer
- /// build from a ChineseTokenizer, filtered with ChineseFilter.
- /// Copyright: Copyright (c) 2001
- /// Company:
- /// <author>Yiyi Sun</author>
- /// <version>$Id: ChineseAnalyzer.java, v 1.2 2003/01/22 20:54:47 ehatcher Exp $</version>
- /// </summary>
- public class ChineseAnalyzer : Analyzer
- {
+ /// <summary>
+ /// Creates a new <see cref="ChineseAnalyzer"/>. There is nothing to configure.
+ /// </summary>
+ public ChineseAnalyzer() { }
- public ChineseAnalyzer()
- {
- }
+ /// <summary>
+ /// Builds a fresh analysis chain over the text supplied by <paramref name="reader"/>.
+ /// </summary>
+ /// <param name="fieldName">Name of the field being analyzed; not used by this method.</param>
+ /// <param name="reader">Reader supplying the text to tokenize.</param>
+ /// <returns>A <see cref="ChineseFilter"/> wrapping a new <see cref="ChineseTokenizer"/>.</returns>
+ public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+     return new ChineseFilter(new ChineseTokenizer(reader));
+ }
- /// <summary>
- /// Creates a TokenStream which tokenizes all the text in the provided Reader.
- /// </summary>
- /// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns>
- public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
- {
- TokenStream result = new ChineseTokenizer(reader);
- result = new ChineseFilter(result);
- return result;
- }
- }
+ /// <summary>
+ /// Pairs a tokenizer with the stream built on top of it so both can be
+ /// stored and fetched together as the analyzer's previous token stream.
+ /// </summary>
+ private class SavedStreams
+ {
+     // End of the chain; this is what callers receive.
+     protected internal TokenStream result;
+
+     // Head of the chain; re-pointed at each new reader on reuse.
+     protected internal Tokenizer source;
+ };
+
+ /// <summary>
+ /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text in the
+ /// provided <see cref="TextReader"/>.
+ /// </summary>
+ /// <param name="fieldName">Name of the field being analyzed; not used by this method.</param>
+ /// <param name="reader">Reader supplying the text to tokenize.</param>
+ /// <returns>
+ /// A <see cref="TokenStream"/> built from a <see cref="ChineseTokenizer"/>
+ /// filtered with <see cref="ChineseFilter"/>.
+ /// </returns>
+ public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ {
+     /* tokenStream() is final, no back compat issue */
+     SavedStreams cached = (SavedStreams) GetPreviousTokenStream();
+     if (cached != null)
+     {
+         // A chain already exists: point its tokenizer at the new reader and hand it back.
+         cached.source.Reset(reader);
+         return cached.result;
+     }
+
+     // First use: build the chain once and remember it for later calls.
+     cached = new SavedStreams();
+     cached.source = new ChineseTokenizer(reader);
+     cached.result = new ChineseFilter(cached.source);
+     SetPreviousTokenStream(cached);
+     return cached.result;
+ }
+ }
}
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseFilter.cs Tue Feb 28 22:43:08 2012
@@ -25,135 +25,75 @@ using System.Collections;
using System.Globalization;
using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
namespace Lucene.Net.Analysis.Cn
{
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2004 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// Title: ChineseFilter
- /// Description: Filter with a stop word table
- /// Rule: No digital is allowed.
- /// English word/token should larger than 1 character.
- /// One Chinese character as one Chinese word.
- /// TO DO:
- /// 1. Add Chinese stop words, such as \ue400
- /// 2. Dictionary based Chinese word extraction
- /// 3. Intelligent Chinese word extraction
- ///
- /// Copyright: Copyright (c) 2001
- /// Company:
- /// <author>Yiyi Sun</author>
- /// <version>$Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $</version>
- /// </summary>
- public sealed class ChineseFilter : TokenFilter
- {
- // Only English now, Chinese to be added later.
- public static String[] STOP_WORDS =
- {
- "and", "are", "as", "at", "be", "but", "by",
- "for", "if", "in", "into", "is", "it",
- "no", "not", "of", "on", "or", "such",
- "that", "the", "their", "then", "there", "these",
- "they", "this", "to", "was", "will", "with"
- };
-
- private Hashtable stopTable;
-
- public ChineseFilter(TokenStream _in) : base (_in)
- {
- stopTable = new Hashtable(STOP_WORDS.Length);
-
- for (int i = 0; i < STOP_WORDS.Length; i++)
- stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
- }
-
- public override Token Next()
- {
-
- for (Token token = input.Next(); token != null; token = input.Next())
- {
- String text = token.TermText();
-
- // why not key off token type here assuming ChineseTokenizer comes first?
- if (stopTable[text] == null)
- {
- switch (Char.GetUnicodeCategory(text[0]))
- {
-
- case UnicodeCategory.LowercaseLetter:
- case UnicodeCategory.UppercaseLetter:
-
- // English word/token should larger than 1 character.
- if (text.Length > 1)
- {
- return token;
- }
- break;
- case UnicodeCategory.OtherLetter:
-
- // One Chinese character as one Chinese word.
- // Chinese word extraction to be added later here.
-
- return token;
- }
-
- }
-
- }
- return null;
- }
- }
+ /// <summary>
+ /// A <see cref="TokenFilter"/> with a stop word table.
+ /// <list type="bullet">
+ /// <item><description>Numeric tokens are removed.</description></item>
+ /// <item><description>English tokens must be larger than 1 char.</description></item>
+ /// <item><description>One Chinese char as one Chinese word.</description></item>
+ /// </list>
+ /// TO DO:
+ /// <list type="number">
+ /// <item><description>Add Chinese stop words, such as \ue400</description></item>
+ /// <item><description>Dictionary based Chinese word extraction</description></item>
+ /// <item><description>Intelligent Chinese word extraction</description></item>
+ /// </list>
+ /// </summary>
+ public sealed class ChineseFilter : TokenFilter
+ {
+     // Only English now, Chinese to be added later.
+     public static String[] STOP_WORDS =
+     {
+         "and", "are", "as", "at", "be", "but", "by",
+         "for", "if", "in", "into", "is", "it",
+         "no", "not", "of", "on", "or", "such",
+         "that", "the", "their", "then", "there", "these",
+         "they", "this", "to", "was", "will", "with"
+     };
+
+     // Case-sensitive set built from STOP_WORDS when the filter is constructed.
+     private readonly CharArraySet stopTable;
+     private readonly TermAttribute termAtt;
+
+     /// <summary>
+     /// Wraps the given stream and builds the stop word table from <see cref="STOP_WORDS"/>.
+     /// </summary>
+     /// <param name="_in">The token stream to filter.</param>
+     public ChineseFilter(TokenStream _in)
+         : base(_in)
+     {
+         stopTable = new CharArraySet(STOP_WORDS, false);
+         termAtt = AddAttribute<TermAttribute>();
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream to the next token that passes the filter:
+     /// not a stop word, and either starting with an upper/lower-case letter and
+     /// longer than one char, or starting with an "other letter" char.
+     /// </summary>
+     /// <returns>true when such a token was found, false when the input is exhausted.</returns>
+     public override bool IncrementToken()
+     {
+         while (input.IncrementToken())
+         {
+             char[] text = termAtt.TermBuffer();
+             int termLength = termAtt.TermLength();
+
+             // Only the first termLength chars of the buffer belong to the term, so an
+             // empty term has no first char to classify; drop it instead of reading text[0].
+             if (termLength == 0)
+             {
+                 continue;
+             }
+
+             // why not key off token type here assuming ChineseTokenizer comes first?
+             if (stopTable.Contains(text, 0, termLength))
+             {
+                 continue;
+             }
+
+             switch (char.GetUnicodeCategory(text[0]))
+             {
+                 case UnicodeCategory.LowercaseLetter:
+                 case UnicodeCategory.UppercaseLetter:
+                     // English word/token should larger than 1 char.
+                     if (termLength > 1)
+                     {
+                         return true;
+                     }
+                     break;
+                 case UnicodeCategory.OtherLetter:
+                     // One Chinese char as one Chinese word.
+                     // Chinese word extraction to be added later here.
+                     return true;
+             }
+         }
+         return false;
+     }
+ }
}
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseTokenizer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseTokenizer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseTokenizer.cs Tue Feb 28 22:43:08 2012
@@ -26,175 +26,166 @@ using System.Collections;
using System.Globalization;
using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Cn
{
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2004 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// Title: ChineseTokenizer
- /// Description: Extract tokens from the Stream using Character.getType()
- /// Rule: A Chinese character as a single token
- /// Copyright: Copyright (c) 2001
- /// Company:
- ///
- /// The difference between thr ChineseTokenizer and the
- /// CJKTokenizer (id=23545) is that they have different
- /// token parsing logic.
- ///
- /// Let me use an example. If having a Chinese text
- /// "C1C2C3C4" to be indexed, the tokens returned from the
- /// ChineseTokenizer are C1, C2, C3, C4. And the tokens
- /// returned from the CJKTokenizer are C1C2, C2C3, C3C4.
- ///
- /// Therefore the index the CJKTokenizer created is much
- /// larger.
- ///
- /// The problem is that when searching for C1, C1C2, C1C3,
- /// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
- /// CJKTokenizer will not work.
- /// <author>Yiyi Sun</author>
- /// <version>$Id: ChineseTokenizer.java, v 1.4 2003/03/02 13:56:03 otis Exp $</version>
- /// </summary>
- public sealed class ChineseTokenizer : Tokenizer
- {
-
-
- public ChineseTokenizer(TextReader _in)
- {
- input = _in;
- }
-
- private int offset = 0, bufferIndex=0, dataLen=0;
- private static int MAX_WORD_LEN = 255;
- private static int IO_BUFFER_SIZE = 1024;
- private char[] buffer = new char[MAX_WORD_LEN];
- private char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
- private int length;
- private int start;
-
- private void Push(char c)
- {
-
- if (length == 0) start = offset-1; // start of token
- buffer[length++] = Char.ToLower(c); // buffer it
-
- }
-
- private Token Flush()
- {
-
- if (length > 0)
- {
- //System.out.println(new String(buffer, 0, length));
- return new Token(new String(buffer, 0, length), start, start+length);
- }
- else
- return null;
- }
-
- public override Token Next()
- {
-
- length = 0;
- start = offset;
-
-
- while (true)
- {
-
- char c;
- offset++;
-
- if (bufferIndex >= dataLen)
- {
- dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
- bufferIndex = 0;
- };
-
- if (dataLen == 0) return Flush();
- else
- c = ioBuffer[bufferIndex++];
-
-
- switch(Char.GetUnicodeCategory(c))
- {
-
- case UnicodeCategory.DecimalDigitNumber:
- case UnicodeCategory.LowercaseLetter:
- case UnicodeCategory.UppercaseLetter:
- Push(c);
- if (length == MAX_WORD_LEN) return Flush();
- break;
-
- case UnicodeCategory.OtherLetter:
- if (length>0)
- {
- bufferIndex--;
- return Flush();
- }
- Push(c);
- return Flush();
-
- default:
- if (length>0) return Flush();
- break;
- }
- }
-
- }
- }
+ /// <summary>
+ /// Tokenize Chinese text as individual chinese chars.
+ /// <p>
+ /// The difference between ChineseTokenizer and
+ /// CJKTokenizer is that they have different
+ /// token parsing logic.
+ /// </p>
+ /// <p>
+ /// For example, if the Chinese text
+ /// "C1C2C3C4" is to be indexed:
+ /// <ul>
+ /// <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4</li>
+ /// <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</li>
+ /// </ul>
+ /// </p>
+ /// <p>
+ /// Therefore the index created by CJKTokenizer is much larger.
+ /// </p>
+ /// <p>
+ /// The problem is that when searching for C1, C1C2, C1C3,
+ /// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
+ /// CJKTokenizer will not work.
+ /// </p>
+ /// </summary>
+ public sealed class ChineseTokenizer : Tokenizer
+ {
+     /// <summary>
+     /// Construct a tokenizer reading from the given input.
+     /// </summary>
+     /// <param name="_in">reader supplying the text</param>
+     public ChineseTokenizer(TextReader _in)
+         : base(_in)
+     {
+         Init();
+     }
+
+     /// <summary>
+     /// Construct a tokenizer reading from the given input, passing the
+     /// given attribute source on to the base tokenizer.
+     /// </summary>
+     /// <param name="source">attribute source handed to the base constructor</param>
+     /// <param name="_in">reader supplying the text</param>
+     public ChineseTokenizer(AttributeSource source, TextReader _in)
+         : base(source, _in)
+     {
+         Init();
+     }
+
+     /// <summary>
+     /// Construct a tokenizer reading from the given input, passing the
+     /// given attribute factory on to the base tokenizer.
+     /// </summary>
+     /// <param name="factory">attribute factory handed to the base constructor</param>
+     /// <param name="_in">reader supplying the text</param>
+     public ChineseTokenizer(AttributeFactory factory, TextReader _in)
+         : base(factory, _in)
+     {
+         Init();
+     }
+
+     /// <summary>
+     /// Registers the term and offset attributes this tokenizer fills in.
+     /// </summary>
+     private void Init()
+     {
+         termAtt = AddAttribute<TermAttribute>();
+         offsetAtt = AddAttribute<OffsetAttribute>();
+     }
+
+     // offset: position in the input; bufferIndex: next unread slot of ioBuffer;
+     // dataLen: number of chars returned by the last read.
+     private int offset = 0, bufferIndex = 0, dataLen = 0;
+     private static readonly int MAX_WORD_LEN = 255;
+     private static readonly int IO_BUFFER_SIZE = 1024;
+     private readonly char[] buffer = new char[MAX_WORD_LEN];
+     private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+     // length/start describe the token currently being accumulated in buffer.
+     private int length;
+     private int start;
+
+     private TermAttribute termAtt;
+     private OffsetAttribute offsetAtt;
+
+     /// <summary>
+     /// Appends the lower-cased character to the pending token, recording the
+     /// token start when it is the first character.
+     /// </summary>
+     private void Push(char c)
+     {
+         if (length == 0)
+         {
+             start = offset - 1; // start of token
+         }
+
+         // Invariant casing keeps the produced terms independent of the current thread culture.
+         buffer[length++] = Char.ToLowerInvariant(c); // buffer it
+     }
+
+     /// <summary>
+     /// Publishes the pending token to the term and offset attributes.
+     /// </summary>
+     /// <returns>true when a token was published, false when nothing was pending.</returns>
+     private bool Flush()
+     {
+         if (length > 0)
+         {
+             termAtt.SetTermBuffer(buffer, 0, length);
+             offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
+             return true;
+         }
+
+         return false;
+     }
+
+     /// <summary>
+     /// Advances to the next token: a run of digits and upper/lower-case letters,
+     /// or a single "other letter" character.
+     /// </summary>
+     /// <returns>true when a token was produced, false at end of input.</returns>
+     public override bool IncrementToken()
+     {
+         ClearAttributes();
+
+         length = 0;
+         start = offset;
+
+         while (true)
+         {
+             offset++;
+
+             if (bufferIndex >= dataLen)
+             {
+                 dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+                 bufferIndex = 0;
+             }
+
+             if (dataLen == 0)
+             {
+                 // nothing more to read: undo the offset step and emit whatever is pending
+                 offset--;
+                 return Flush();
+             }
+
+             char c = ioBuffer[bufferIndex++];
+
+             switch (char.GetUnicodeCategory(c))
+             {
+                 case UnicodeCategory.DecimalDigitNumber:
+                 case UnicodeCategory.LowercaseLetter:
+                 case UnicodeCategory.UppercaseLetter:
+                     Push(c);
+                     if (length == MAX_WORD_LEN)
+                     {
+                         return Flush();
+                     }
+                     break;
+
+                 case UnicodeCategory.OtherLetter:
+                     if (length > 0)
+                     {
+                         // emit the pending run first; step back so this char is re-read next call
+                         bufferIndex--;
+                         offset--;
+                         return Flush();
+                     }
+                     Push(c);
+                     return Flush();
+
+                 default:
+                     if (length > 0)
+                     {
+                         return Flush();
+                     }
+                     break;
+             }
+         }
+     }
+
+     /// <summary>
+     /// Sets the final offset (start and end both at the corrected current offset).
+     /// </summary>
+     public override sealed void End()
+     {
+         // set final offset
+         int finalOffset = CorrectOffset(offset);
+         this.offsetAtt.SetOffset(finalOffset, finalOffset);
+     }
+
+     /// <summary>
+     /// Resets the base stream and clears this tokenizer's read position.
+     /// </summary>
+     public override void Reset()
+     {
+         base.Reset();
+         offset = bufferIndex = dataLen = 0;
+     }
+
+     /// <summary>
+     /// Switches to a new reader and then resets the tokenizer state.
+     /// </summary>
+     /// <param name="input">the new input</param>
+     public override void Reset(TextReader input)
+     {
+         base.Reset(input);
+         Reset();
+     }
+ }
}
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj Tue Feb 28 22:43:08 2012
@@ -19,7 +19,6 @@
under the License.
-->
-
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
@@ -29,7 +28,7 @@
<ProjectGuid>{4286E961-9143-4821-B46D-3D39D3736386}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
- <RootNamespace>Lucene.Net.Analyzers</RootNamespace>
+ <RootNamespace>Lucene.Net.Analysis</RootNamespace>
<AssemblyName>Lucene.Net.Contrib.Analyzers</AssemblyName>
<TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
@@ -84,16 +83,34 @@
<Compile Include="Cn\ChineseAnalyzer.cs" />
<Compile Include="Cn\ChineseFilter.cs" />
<Compile Include="Cn\ChineseTokenizer.cs" />
+ <Compile Include="Compound\CompoundWordTokenFilterBase.cs" />
+ <Compile Include="Compound\DictionaryCompoundWordTokenFilter.cs" />
+ <Compile Include="Compound\HyphenationCompoundWordTokenFilter.cs" />
+ <Compile Include="Compound\Hyphenation\ByteVector.cs" />
+ <Compile Include="Compound\Hyphenation\CharVector.cs" />
+ <Compile Include="Compound\Hyphenation\Hyphen.cs" />
+ <Compile Include="Compound\Hyphenation\Hyphenation.cs" />
+ <Compile Include="Compound\Hyphenation\HyphenationException.cs" />
+ <Compile Include="Compound\Hyphenation\HyphenationTree.cs" />
+ <Compile Include="Compound\Hyphenation\PatternConsumer.cs" />
+ <Compile Include="Compound\Hyphenation\PatternParser.cs" />
+ <Compile Include="Compound\Hyphenation\TernaryTree.cs" />
<Compile Include="Cz\CzechAnalyzer.cs" />
<Compile Include="De\GermanAnalyzer.cs" />
<Compile Include="De\GermanStemFilter.cs" />
<Compile Include="De\GermanStemmer.cs" />
- <Compile Include="De\WordlistLoader.cs" />
+ <Compile Include="El\GreekAnalyzer.cs" />
+ <Compile Include="El\GreekLowerCaseFilter.cs" />
+ <Compile Include="Fa\PersianAnalyzer.cs" />
+ <Compile Include="Fa\PersianNormalizationFilter.cs" />
+ <Compile Include="Fa\PersianNormalizer.cs" />
+ <Compile Include="Fr\ElisionFilter.cs" />
<Compile Include="Fr\FrenchAnalyzer.cs" />
<Compile Include="Fr\FrenchStemFilter.cs" />
<Compile Include="Fr\FrenchStemmer.cs" />
<Compile Include="Miscellaneous\EmptyTokenStream.cs" />
<Compile Include="Miscellaneous\InjectablePrefixAwareTokenFilter.cs" />
+ <Compile Include="Miscellaneous\PatternAnalyzer.cs" />
<Compile Include="Miscellaneous\PrefixAndSuffixAwareTokenFilter.cs" />
<Compile Include="Miscellaneous\PrefixAwareTokenStream.cs" />
<Compile Include="Miscellaneous\SingleTokenTokenStream.cs" />
@@ -104,10 +121,20 @@
<Compile Include="Nl\DutchAnalyzer.cs" />
<Compile Include="Nl\DutchStemFilter.cs" />
<Compile Include="Nl\DutchStemmer.cs" />
- <Compile Include="Nl\WordlistLoader.cs" />
+ <Compile Include="Payloads\AbstractEncoder.cs" />
+ <Compile Include="Payloads\DelimitedPayloadTokenFilter.cs" />
+ <Compile Include="Payloads\FloatEncoder.cs" />
+ <Compile Include="Payloads\IdentityEncoder.cs" />
+ <Compile Include="Payloads\IntegerEncoder.cs" />
+ <Compile Include="Payloads\NumericPayloadTokenFilter.cs" />
+ <Compile Include="Payloads\PayloadEncoder.cs" />
<Compile Include="Payloads\PayloadHelper.cs" />
+ <Compile Include="Payloads\TokenOffsetPayloadTokenFilter.cs" />
+ <Compile Include="Payloads\TypeAsPayloadTokenFilter.cs" />
+ <Compile Include="Position\PositionFilter.cs" />
+ <Compile Include="Query\QueryAutoStopWordAnalyzer.cs" />
+ <Compile Include="Reverse\ReverseStringFilter.cs" />
<Compile Include="Ru\RussianAnalyzer.cs" />
- <Compile Include="Ru\RussianCharsets.cs" />
<Compile Include="Ru\RussianLetterTokenizer.cs" />
<Compile Include="Ru\RussianLowerCaseFilter.cs" />
<Compile Include="Ru\RussianStemFilter.cs" />
@@ -125,6 +152,11 @@
<Compile Include="Shingle\Codec\SimpleThreeDimensionalTokenSettingsCodec.cs" />
<Compile Include="Shingle\Codec\TokenSettingsCodec.cs" />
<Compile Include="Shingle\Codec\TwoDimensionalNonWeightedSynonymTokenSettingsCodec.cs" />
+ <Compile Include="Sinks\DateRecognizerSinkFilter.cs" />
+ <Compile Include="Sinks\TokenRangeSinkFilter.cs" />
+ <Compile Include="Sinks\TokenTypeSinkFilter.cs" />
+ <Compile Include="Th\ThaiAnalyzer.cs" />
+ <Compile Include="Th\ThaiWordFilter.cs" />
<Compile Include="WordlistLoader.cs" />
</ItemGroup>
<ItemGroup>
@@ -137,8 +169,12 @@
</ProjectReference>
</ItemGroup>
<ItemGroup>
+ <None Include="Compound\Hyphenation\hyphenation.dtd" />
<None Include="Lucene.Net.snk" />
</ItemGroup>
+ <ItemGroup>
+ <Content Include="FileDiffs.txt" />
+ </ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.