You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/02/03 17:51:14 UTC
[08/11] lucenenet git commit: Lucene.Net.Analysis.Ngram refactor:
member accessibility and documentation comments
Lucene.Net.Analysis.Ngram refactor: member accessibility and documentation comments
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/269da1ef
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/269da1ef
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/269da1ef
Branch: refs/heads/api-work
Commit: 269da1ef4ecb679c0e13c914fab3f60c175d9466
Parents: d4b9c00
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Sat Feb 4 00:01:15 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Sat Feb 4 00:01:15 2017 +0700
----------------------------------------------------------------------
.../Analysis/Ngram/EdgeNGramFilterFactory.cs | 4 +-
.../Analysis/Ngram/EdgeNGramTokenFilter.cs | 29 ++---
.../Analysis/Ngram/EdgeNGramTokenizer.cs | 31 +++---
.../Analysis/Ngram/EdgeNGramTokenizerFactory.cs | 4 +-
.../Ngram/Lucene43EdgeNGramTokenizer.cs | 47 ++++----
.../Analysis/Ngram/Lucene43NGramTokenizer.cs | 12 +--
.../Analysis/Ngram/NGramFilterFactory.cs | 4 +-
.../Analysis/Ngram/NGramTokenFilter.cs | 33 +++---
.../Analysis/Ngram/NGramTokenizer.cs | 107 +++++++++++++------
.../Analysis/Ngram/NGramTokenizerFactory.cs | 4 +-
.../Analysis/Ngram/NGramTokenizerTest.cs | 2 +-
11 files changed, 161 insertions(+), 116 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
index 2efb5fc..2e3e0ed 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
@@ -1,7 +1,7 @@
\ufeffusing Lucene.Net.Analysis.Util;
using System.Collections.Generic;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -37,7 +37,7 @@ namespace Lucene.Net.Analysis.Ngram
private readonly string side;
/// <summary>
- /// Creates a new EdgeNGramFilterFactory </summary>
+ /// Creates a new <see cref="EdgeNGramFilterFactory"/> </summary>
public EdgeNGramFilterFactory(IDictionary<string, string> args)
: base(args)
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
index 01677cf..4c1fff1 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
@@ -1,9 +1,9 @@
-\ufeffusing System;
-using Lucene.Net.Analysis.TokenAttributes;
+\ufeffusing Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
+using System;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -27,8 +27,8 @@ namespace Lucene.Net.Analysis.Ngram
/// <para>
/// This <see cref="TokenFilter"/> create n-grams from the beginning edge or ending edge of an input token.
/// </para>
- /// <para><a name="version"/>As of Lucene 4.4, this filter does not support
- /// <see cref="Side#BACK"/> (you can use <see cref="ReverseStringFilter"/> up-front and
+ /// <para>As of Lucene 4.4, this filter does not support
+ /// <see cref="Side.BACK"/> (you can use <see cref="Reverse.ReverseStringFilter"/> up-front and
/// afterward to get the same behavior), handles supplementary characters
/// correctly and does not update offsets anymore.
/// </para>
@@ -43,7 +43,6 @@ namespace Lucene.Net.Analysis.Ngram
/// Specifies which side of the input the n-gram should be generated from </summary>
public enum Side
{
-
/// <summary>
/// Get the n-gram from the front of the input </summary>
FRONT,
@@ -54,7 +53,9 @@ namespace Lucene.Net.Analysis.Ngram
BACK,
}
- // Get the appropriate Side from a string
+ /// <summary>
+ /// Get the appropriate <see cref="Side"/> from a string
+ /// </summary>
public static Side GetSide(string sideName)
{
Side result;
@@ -86,9 +87,9 @@ namespace Lucene.Net.Analysis.Ngram
private readonly IPositionLengthAttribute posLenAtt;
/// <summary>
- /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
@@ -136,9 +137,9 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
@@ -150,9 +151,9 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
@@ -163,7 +164,7 @@ namespace Lucene.Net.Analysis.Ngram
{
}
- public override bool IncrementToken()
+ public override sealed bool IncrementToken()
{
while (true)
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
index 09ad7f8..9eba29f 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
@@ -1,7 +1,7 @@
\ufeffusing Lucene.Net.Util;
using System.IO;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -25,14 +25,15 @@ namespace Lucene.Net.Analysis.Ngram
/// <para>
/// This <see cref="Tokenizer"/> creates n-grams from the beginning edge or ending edge of an input token.
/// </para>
- /// <para><a name="version" /> As of Lucene 4.4, this tokenizer<ul>
- /// <li>can handle <code>maxGram</code> larger than 1024 chars, but beware that this will result in increased memory usage
- /// <li>doesn't trim the input,
- /// <li>sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones
- /// <li>doesn't support backward n-grams anymore.
- /// <li>supports <see cref="#isTokenChar(int) pre-tokenization"/>,
- /// <li>correctly handles supplementary characters.
- /// </ul>
+ /// <para>As of Lucene 4.4, this tokenizer
+ /// <list type="bullet">
+ /// <item>can handle <code>maxGram</code> larger than 1024 chars, but beware that this will result in increased memory usage</item>
+ /// <item>doesn't trim the input,</item>
+ /// <item>sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones</item>
+ /// <item>doesn't support backward n-grams anymore.</item>
+ /// <item>supports <see cref="Util.CharTokenizer.IsTokenChar(int)"/> pre-tokenization,</item>
+ /// <item>correctly handles supplementary characters.</item>
+ /// </list>
/// </para>
/// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
/// to use the old behavior through <see cref="Lucene43EdgeNGramTokenizer"/>.
@@ -44,10 +45,10 @@ namespace Lucene.Net.Analysis.Ngram
public const int DEFAULT_MIN_GRAM_SIZE = 1;
/// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
@@ -58,9 +59,9 @@ namespace Lucene.Net.Analysis.Ngram
/// <summary>
/// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <see cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public EdgeNGramTokenizer(LuceneVersion version, AttributeSource.AttributeFactory factory, TextReader input, int minGram, int maxGram)
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
index 5273ae4..d3f2bb6 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
@@ -4,7 +4,7 @@ using System;
using System.Collections.Generic;
using System.IO;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -39,7 +39,7 @@ namespace Lucene.Net.Analysis.Ngram
private readonly string side;
/// <summary>
- /// Creates a new EdgeNGramTokenizerFactory </summary>
+ /// Creates a new <see cref="EdgeNGramTokenizerFactory"/> </summary>
public EdgeNGramTokenizerFactory(IDictionary<string, string> args) : base(args)
{
minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
index 3ed7187..eb09a94 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
@@ -3,7 +3,7 @@ using Lucene.Net.Util;
using System;
using System.IO;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -41,7 +41,6 @@ namespace Lucene.Net.Analysis.Ngram
/// Specifies which side of the input the n-gram should be generated from </summary>
public enum Side
{
-
/// <summary>
/// Get the n-gram from the front of the input </summary>
FRONT,
@@ -52,7 +51,7 @@ namespace Lucene.Net.Analysis.Ngram
}
// Get the appropriate Side from a string
- internal static Side GetSide(string sideName)
+ public static Side GetSide(string sideName)
{
Side result;
if (!Enum.TryParse(sideName, true, out result))
@@ -73,10 +72,10 @@ namespace Lucene.Net.Analysis.Ngram
/// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
@@ -88,11 +87,11 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <see cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
@@ -104,10 +103,10 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
@@ -118,11 +117,11 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <see cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
@@ -133,10 +132,10 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
@@ -145,11 +144,11 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
/// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <see cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
index a0f210a..a79ffba 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
@@ -2,7 +2,7 @@
using System;
using System.IO;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -42,7 +42,7 @@ namespace Lucene.Net.Analysis.Ngram
private IOffsetAttribute offsetAtt;
/// <summary>
- /// Creates NGramTokenizer with given min and max n-grams. </summary>
+ /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
@@ -53,9 +53,9 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates NGramTokenizer with given min and max n-grams. </summary>
- /// <param name="factory"> <see cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
+ /// <param name="factory"> <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
@@ -65,7 +65,7 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates NGramTokenizer with default min and max n-grams. </summary>
+ /// Creates <see cref="Lucene43NGramTokenizer"/> with default min and max n-grams. </summary>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
public Lucene43NGramTokenizer(TextReader input)
: this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
index 3c9f738..8b9b726 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
@@ -1,7 +1,7 @@
\ufeffusing Lucene.Net.Analysis.Util;
using System.Collections.Generic;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -36,7 +36,7 @@ namespace Lucene.Net.Analysis.Ngram
private readonly int minGramSize;
/// <summary>
- /// Creates a new NGramFilterFactory </summary>
+ /// Creates a new <see cref="NGramFilterFactory"/> </summary>
public NGramFilterFactory(IDictionary<string, string> args)
: base(args)
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
index 561e575..26cc8d5 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
@@ -3,7 +3,7 @@ using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -24,25 +24,26 @@ namespace Lucene.Net.Analysis.Ngram
/// <summary>
/// Tokenizes the input into n-grams of the given size(s).
- /// <a name="version"/>
/// <para>You must specify the required <see cref="LuceneVersion"/> compatibility when
- /// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filters:<ul>
- /// <li>handles supplementary characters correctly,</li>
- /// <li>emits all n-grams for the same token at the same position,</li>
- /// <li>does not modify offsets,</li>
- /// <li>sorts n-grams by their offset in the original token first, then
- /// increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
- /// "c").</li></ul>
+ /// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filter:
+ /// <list type="bullet">
+ /// <item>handles supplementary characters correctly,</item>
+ /// <item>emits all n-grams for the same token at the same position,</item>
+ /// <item>does not modify offsets,</item>
+ /// <item>sorts n-grams by their offset in the original token first, then
+ /// increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
+ /// "c").</item>
+ /// </list>
/// </para>
/// <para>You can make this filter use the old behavior by providing a version <
- /// <see cref="Version#LUCENE_44"/> in the constructor but this is not recommended as
+ /// <see cref="LuceneVersion.LUCENE_44"/> in the constructor but this is not recommended as
/// it will lead to broken <see cref="TokenStream"/>s that will cause highlighting
/// bugs.
/// </para>
/// <para>If you were using this <see cref="TokenFilter"/> to perform partial highlighting,
/// this won't work anymore since this filter doesn't update offsets. You should
/// modify your analysis chain to use <see cref="NGramTokenizer"/>, and potentially
- /// override <see cref="NGramTokenizer#isTokenChar(int)"/> to perform pre-tokenization.
+ /// override <see cref="NGramTokenizer.IsTokenChar(int)"/> to perform pre-tokenization.
/// </para>
/// </summary>
public sealed class NGramTokenFilter : TokenFilter
@@ -70,9 +71,9 @@ namespace Lucene.Net.Analysis.Ngram
private readonly IOffsetAttribute offsetAtt;
/// <summary>
- /// Creates NGramTokenFilter with given min and max n-grams. </summary>
+ /// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams. </summary>
/// <param name="version"> Lucene version to enable correct position increments.
- /// See <a href="#version">above</a> for details. </param>
+ /// See <see cref="NGramTokenFilter"/> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
@@ -154,9 +155,9 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates NGramTokenFilter with default min and max n-grams. </summary>
+ /// Creates <see cref="NGramTokenFilter"/> with default min and max n-grams. </summary>
/// <param name="version"> Lucene version to enable correct position increments.
- /// See <a href="#version">above</a> for details. </param>
+ /// See <see cref="NGramTokenFilter"/> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
public NGramTokenFilter(LuceneVersion version, TokenStream input)
: this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
@@ -166,7 +167,7 @@ namespace Lucene.Net.Analysis.Ngram
/// <summary>
/// Returns the next token in the stream, or null at EOS.
/// </summary>
- public override bool IncrementToken()
+ public override sealed bool IncrementToken()
{
while (true)
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
index acc42c3..a6ce01d 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
@@ -6,7 +6,7 @@ using System;
using System.Diagnostics;
using System.IO;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -32,21 +32,66 @@ namespace Lucene.Net.Analysis.Ngram
/// the same as the term chars.
/// </para>
/// <para>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
- /// <table>
- /// <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
- /// <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
- /// <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
- /// <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
- /// </table>
- /// <a name="version"/>
+ /// <list type="table">
+ /// <listheader>
+ /// <term>Term</term>
+ /// <term>Position increment</term>
+ /// <term>Position length</term>
+ /// <term>Offsets</term>
+ /// </listheader>
+ /// <item>
+ /// <term>ab</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[0,2[</term>
+ /// </item>
+ /// <item>
+ /// <term>abc</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[0,3[</term>
+ /// </item>
+ /// <item>
+ /// <term>bc</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[1,3[</term>
+ /// </item>
+ /// <item>
+ /// <term>bcd</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[1,4[</term>
+ /// </item>
+ /// <item>
+ /// <term>cd</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[2,4[</term>
+ /// </item>
+ /// <item>
+ /// <term>cde</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[2,5[</term>
+ /// </item>
+ /// <item>
+ /// <term>de</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[3,5[</term>
+ /// </item>
+ /// </list>
/// </para>
- /// <para>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
- /// <li>tokenize in a streaming fashion to support streams which are larger
- /// than 1024 chars (limit of the previous version),
- /// <li>count grams based on unicode code points instead of java chars (and
- /// never split in the middle of surrogate pairs),
- /// <li>give the ability to <see cref="#isTokenChar(int) pre-tokenize"/> the stream
- /// before computing n-grams.</ul>
+ /// <para>This tokenizer changed a lot in Lucene 4.4 in order to:
+ /// <list type="bullet">
+ /// <item>tokenize in a streaming fashion to support streams which are larger
+ /// than 1024 chars (limit of the previous version),</item>
+ /// <item>count grams based on unicode code points instead of java chars (and
+ /// never split in the middle of surrogate pairs),</item>
+ /// <item>give the ability to pre-tokenize the stream (<see cref="IsTokenChar(int)"/>)
+ /// before computing n-grams.</item>
+ /// </list>
/// </para>
/// <para>Additionally, this class doesn't trim trailing whitespaces and emits
/// tokens in a different order, tokens are now emitted by increasing start
@@ -57,7 +102,7 @@ namespace Lucene.Net.Analysis.Ngram
/// to use the old behavior through <see cref="Lucene43NGramTokenizer"/>.
/// </para>
/// </summary>
- // non-final to allow for overriding isTokenChar, but all other methods should be final
+ // non-sealed to allow for overriding IsTokenChar, but all other methods should be sealed
public class NGramTokenizer : Tokenizer
{
public const int DEFAULT_MIN_NGRAM_SIZE = 1;
@@ -87,8 +132,8 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates NGramTokenizer with given min and max n-grams. </summary>
- /// <param name="version"> the lucene compatibility <a href="#version">version</a> </param>
+ /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
+ /// <param name="version"> the Lucene compatibility version </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
@@ -104,10 +149,10 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates NGramTokenizer with given min and max n-grams. </summary>
- /// <param name="version"> the lucene compatibility <a href="#version">version</a> </param>
- /// <param name="factory"> <see cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
+ /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
+ /// <param name="version"> the Lucene compatibility version </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
@@ -116,8 +161,8 @@ namespace Lucene.Net.Analysis.Ngram
}
/// <summary>
- /// Creates NGramTokenizer with default min and max n-grams. </summary>
- /// <param name="version"> the lucene compatibility <a href="#version">version</a> </param>
+ /// Creates <see cref="NGramTokenizer"/> with default min and max n-grams. </summary>
+ /// <param name="version"> the Lucene compatibility version </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
public NGramTokenizer(LuceneVersion version, TextReader input)
: this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
@@ -154,8 +199,6 @@ namespace Lucene.Net.Analysis.Ngram
charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
buffer = new int[charBuffer.Buffer.Length];
-
-
// Make the term att large enough
termAtt.ResizeBuffer(2 * maxGram);
}
@@ -191,7 +234,7 @@ namespace Lucene.Net.Analysis.Ngram
Debug.Assert(exhausted);
return false;
}
- consume();
+ Consume();
gramSize = minGram;
}
@@ -202,7 +245,7 @@ namespace Lucene.Net.Analysis.Ngram
bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar)
{
- consume();
+ Consume();
gramSize = minGram;
continue;
}
@@ -236,19 +279,19 @@ namespace Lucene.Net.Analysis.Ngram
/// <summary>
/// Consume one code point. </summary>
- private void consume()
+ private void Consume()
{
offset += Character.CharCount(buffer[bufferStart++]);
}
/// <summary>
/// Only collect characters which satisfy this condition. </summary>
- protected internal virtual bool IsTokenChar(int chr)
+ protected virtual bool IsTokenChar(int chr)
{
return true;
}
- public override void End()
+ public override sealed void End()
{
base.End();
Debug.Assert(bufferStart <= bufferEnd);
@@ -262,7 +305,7 @@ namespace Lucene.Net.Analysis.Ngram
offsetAtt.SetOffset(endOffset, endOffset);
}
- public override void Reset()
+ public override sealed void Reset()
{
base.Reset();
bufferStart = bufferEnd = buffer.Length;
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
index 73865fb..33a81b2 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
@@ -3,7 +3,7 @@ using Lucene.Net.Util;
using System.Collections.Generic;
using System.IO;
-namespace Lucene.Net.Analysis.Ngram
+namespace Lucene.Net.Analysis.Ngram // LUCENENET TODO: Change namespace, directory, and Git to NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -37,7 +37,7 @@ namespace Lucene.Net.Analysis.Ngram
private readonly int minGramSize;
/// <summary>
- /// Creates a new NGramTokenizerFactory </summary>
+ /// Creates a new <see cref="NGramTokenizerFactory"/> </summary>
public NGramTokenizerFactory(IDictionary<string, string> args)
: base(args)
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/269da1ef/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/NGramTokenizerTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/NGramTokenizerTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/NGramTokenizerTest.cs
index b4aac99..d72f4c5 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/NGramTokenizerTest.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/NGramTokenizerTest.cs
@@ -233,7 +233,7 @@ namespace Lucene.Net.Analysis.Ngram
this.nonTokenChars = nonTokenChars;
}
- protected internal override bool IsTokenChar(int chr)
+ protected override bool IsTokenChar(int chr)
{
return nonTokenChars.IndexOf((char)chr) < 0;
}