You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2012/02/28 23:43:28 UTC
[Lucene.Net] svn commit: r1294875 [7/45] - in /incubator/lucene.net/trunk: ./ build/
build/vs2010/contrib/ build/vs2010/test/ doc/ src/ src/contrib/Analyzers/
src/contrib/Analyzers/AR/ src/contrib/Analyzers/BR/
src/contrib/Analyzers/CJK/ src/contrib/Analyzers/Cn/ s...
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleFilter.cs Tue Feb 28 22:43:08 2012
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -17,368 +17,366 @@
using System;
using System.Collections.Generic;
-using System.IO;
+using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
namespace Lucene.Net.Analyzers.Shingle
{
- /// <summary>
- /// <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
- /// In other words, it creates combinations of tokens as a single token.</p>
- ///
- /// <p>For example, the sentence "please divide this sentence into shingles"
- /// might be tokenized into shingles "please divide", "divide this",
- /// "this sentence", "sentence into", and "into shingles".</p>
- ///
- /// <p>This filter handles position increments > 1 by inserting filler tokens
- /// (tokens with termtext "_"). It does not handle a position increment of 0. </p>
- /// </summary>
- public class ShingleFilter : TokenFilter
+ /**
+ * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
+ * In other words, it creates combinations of tokens as a single token.
+ *
+ * <p>For example, the sentence "please divide this sentence into shingles"
+ * might be tokenized into shingles "please divide", "divide this",
+ * "this sentence", "sentence into", and "into shingles".
+ *
+ * <p>This filter handles position increments > 1 by inserting filler tokens
+ * (tokens with termtext "_"). It does not handle a position increment of 0.
+ */
+ public sealed class ShingleFilter : TokenFilter
{
- /// <summary>
- /// Filler token for when positionIncrement is more than 1
- /// </summary>
- public static readonly char[] FillerToken = {'_'};
-
- /// <summary>
- /// Default maximum shingle size is 2.
- /// </summary>
- public static readonly int DefaultMaxShingleSize = 2;
-
- /// <summary>
- /// The string to use when joining adjacent tokens to form a shingle
- /// </summary>
- public static readonly string TokenSeparator = " ";
-
- private readonly OffsetAttribute _offsetAtt;
- private readonly PositionIncrementAttribute _posIncrAtt;
-
- private readonly LinkedList<State> _shingleBuf = new LinkedList<State>();
- private readonly TermAttribute _termAtt;
- private readonly TypeAttribute _typeAtt;
- private State _currentToken;
- private int[] _endOffsets;
- private bool _hasCurrentToken;
-
- /// <summary>
- /// Maximum shingle size (number of tokens)
- /// </summary>
- private int _maxShingleSize;
-
- private State _nextToken;
- private int _numFillerTokensToInsert;
-
- /// <summary>
- /// By default, we output unigrams (individual tokens) as well as shingles (token n-grams).
- /// </summary>
- private bool _outputUnigrams = true;
-
- private int _shingleBufferPosition;
- private StringBuilder[] _shingles;
- private String _tokenType = "shingle";
-
- /// <summary>
- /// Constructs a ShingleFilter with the specified single size from the TokenStream
- /// </summary>
- /// <param name="input">input token stream</param>
- /// <param name="maxShingleSize">maximum shingle size produced by the filter.</param>
- public ShingleFilter(TokenStream input, int maxShingleSize) : base(input)
- {
- SetMaxShingleSize(maxShingleSize);
-
- // ReSharper disable DoNotCallOverridableMethodsInConstructor
- _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
- _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
- _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
- _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
- // ReSharper restore DoNotCallOverridableMethodsInConstructor
- }
- /// <summary>
- /// Construct a ShingleFilter with default shingle size.
- /// </summary>
- /// <param name="input">input stream</param>
- public ShingleFilter(TokenStream input) :
- this(input, DefaultMaxShingleSize)
+ private LinkedList<State> shingleBuf = new LinkedList<State>();
+ private StringBuilder[] shingles;
+ private String tokenType = "shingle";
+
+ /**
+ * filler token for when positionIncrement is more than 1
+ */
+ public static readonly char[] FILLER_TOKEN = { '_' };
+
+
+ /**
+ * default maximum shingle size is 2.
+ */
+ public const int DEFAULT_MAX_SHINGLE_SIZE = 2;
+
+ /**
+ * The string to use when joining adjacent tokens to form a shingle
+ */
+ public const String TOKEN_SEPARATOR = " ";
+
+ /**
+ * By default, we output unigrams (individual tokens) as well as shingles
+ * (token n-grams).
+ */
+ private bool outputUnigrams = true;
+
+ /**
+ * maximum shingle size (number of tokens)
+ */
+ private int maxShingleSize;
+
+ /**
+ * Constructs a ShingleFilter with the specified shingle size from the
+ * {@link TokenStream} <code>input</code>
+ *
+ * @param input input stream
+ * @param maxShingleSize maximum shingle size produced by the filter.
+ */
+ public ShingleFilter(TokenStream input, int maxShingleSize)
+ : base(input)
{
- }
-
- /// <summary>
- /// Construct a ShingleFilter with the specified token type for shingle tokens.
- /// </summary>
- /// <param name="input">input stream</param>
- /// <param name="tokenType">token type for shingle tokens</param>
- public ShingleFilter(TokenStream input, String tokenType) :
- this(input, DefaultMaxShingleSize)
- {
- SetTokenType(tokenType);
- }
-
- /// <summary>
- /// Set the type of the shingle tokens produced by this filter. (default: "shingle")
- /// </summary>
- /// <param name="tokenType">token TokenType</param>
- public void SetTokenType(String tokenType)
+ SetMaxShingleSize(maxShingleSize);
+ this.termAtt = AddAttribute<TermAttribute>(); ;
+ this.offsetAtt = AddAttribute<OffsetAttribute>(); ;
+ this.posIncrAtt = AddAttribute<PositionIncrementAttribute>(); ;
+ this.typeAtt = AddAttribute<TypeAttribute>(); ;
+ }
+
+ /**
+ * Construct a ShingleFilter with default shingle size.
+ *
+ * @param input input stream
+ */
+ public ShingleFilter(TokenStream input)
+ : this(input, DEFAULT_MAX_SHINGLE_SIZE)
{
- _tokenType = tokenType;
}
- /// <summary>
- /// Shall the output stream contain the input tokens (unigrams) as well as shingles? (default: true.)
- /// </summary>
- /// <param name="outputUnigrams">Whether or not the output stream shall contain the input tokens (unigrams)</param>
+ /**
+ * Construct a ShingleFilter with the specified token type for shingle tokens.
+ *
+ * @param input input stream
+ * @param tokenType token type for shingle tokens
+ */
+ public ShingleFilter(TokenStream input, String tokenType)
+ : this(input, DEFAULT_MAX_SHINGLE_SIZE)
+ {
+ setTokenType(tokenType);
+ }
+
+ /**
+ * Set the type of the shingle tokens produced by this filter.
+ * (default: "shingle")
+ *
+ * @param tokenType the type to assign to shingle tokens
+ */
+ public void setTokenType(String tokenType)
+ {
+ this.tokenType = tokenType;
+ }
+
+ /**
+ * Shall the output stream contain the input tokens (unigrams) as well as
+ * shingles? (default: true.)
+ *
+ * @param outputUnigrams Whether or not the output stream shall contain
+ * the input tokens (unigrams)
+ */
public void SetOutputUnigrams(bool outputUnigrams)
{
- _outputUnigrams = outputUnigrams;
+ this.outputUnigrams = outputUnigrams;
}
- /// <summary>
- /// Set the max shingle size (default: 2)
- /// </summary>
- /// <param name="maxShingleSize">max size of output shingles</param>
+ /**
+ * Set the max shingle size (default: 2)
+ *
+ * @param maxShingleSize max size of output shingles
+ */
public void SetMaxShingleSize(int maxShingleSize)
{
if (maxShingleSize < 2)
- throw new ArgumentException("Max shingle size must be >= 2", "maxShingleSize");
-
- _shingles = new StringBuilder[maxShingleSize];
-
- for (int i = 0; i < _shingles.Length; i++)
{
- _shingles[i] = new StringBuilder();
+ throw new ArgumentException("Max shingle size must be >= 2");
}
-
- _maxShingleSize = maxShingleSize;
+ shingles = new StringBuilder[maxShingleSize];
+ for (int i = 0; i < shingles.Length; i++)
+ {
+ shingles[i] = new StringBuilder();
+ }
+ this.maxShingleSize = maxShingleSize;
}
- /// <summary>
- /// Clear the StringBuilders that are used for storing the output shingles.
- /// </summary>
+ /**
+ * Clear the StringBuilders that are used for storing the output shingles.
+ */
private void ClearShingles()
{
- foreach (StringBuilder t in _shingles)
+ for (int i = 0; i < shingles.Length; i++)
{
- t.Length = 0;
+ shingles[i].Clear();
}
}
- /// <summary>
- /// See Lucene.Net.Analysis.TokenStream.Next()
- /// </summary>
- /// <returns></returns>
- public override bool IncrementToken()
+ private AttributeSource.State nextToken;
+ private int shingleBufferPosition;
+ private int[] endOffsets;
+
+ /* (non-Javadoc)
+ * @see Lucene.Net.Analysis.TokenStream#IncrementToken()
+ */
+ public sealed override bool IncrementToken()
{
while (true)
{
- if (_nextToken == null)
+ if (nextToken == null)
{
if (!FillShingleBuffer())
+ {
return false;
+ }
}
- _nextToken = _shingleBuf.First.Value;
+ nextToken = shingleBuf.First.Value;
- if (_outputUnigrams)
+ if (outputUnigrams)
{
- if (_shingleBufferPosition == 0)
+ if (shingleBufferPosition == 0)
{
- RestoreState(_nextToken);
- _posIncrAtt.SetPositionIncrement(1);
- _shingleBufferPosition++;
+ RestoreState(nextToken);
+ posIncrAtt.SetPositionIncrement(1);
+ shingleBufferPosition++;
return true;
}
}
- else if (_shingleBufferPosition%_maxShingleSize == 0)
+ else if (shingleBufferPosition % this.maxShingleSize == 0)
{
- _shingleBufferPosition++;
+ shingleBufferPosition++;
}
- if (_shingleBufferPosition < _shingleBuf.Count)
+ if (shingleBufferPosition < shingleBuf.Count)
{
- RestoreState(_nextToken);
- _typeAtt.SetType(_tokenType);
- _offsetAtt.SetOffset(_offsetAtt.StartOffset(), _endOffsets[_shingleBufferPosition]);
- StringBuilder buf = _shingles[_shingleBufferPosition];
+ RestoreState(nextToken);
+ typeAtt.SetType(tokenType);
+ offsetAtt.SetOffset(offsetAtt.StartOffset(), endOffsets[shingleBufferPosition]);
+ StringBuilder buf = shingles[shingleBufferPosition];
int termLength = buf.Length;
- char[] termBuffer = _termAtt.TermBuffer();
- if (termBuffer.Length < termLength)
- termBuffer = _termAtt.ResizeTermBuffer(termLength);
- buf.CopyTo(0, termBuffer, 0, termLength);
- _termAtt.SetTermLength(termLength);
- if ((! _outputUnigrams) && _shingleBufferPosition%_maxShingleSize == 1)
+ char[] TermBuffer = termAtt.TermBuffer();
+ if (TermBuffer.Length < termLength)
+ TermBuffer = termAtt.ResizeTermBuffer(termLength);
+ buf.CopyTo(0, TermBuffer, 0, termLength);
+ termAtt.SetTermLength(termLength);
+ if ((!outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1)
{
- _posIncrAtt.SetPositionIncrement(1);
+ posIncrAtt.SetPositionIncrement(1);
}
else
{
- _posIncrAtt.SetPositionIncrement(0);
+ posIncrAtt.SetPositionIncrement(0);
}
- _shingleBufferPosition++;
- if (_shingleBufferPosition == _shingleBuf.Count)
+ shingleBufferPosition++;
+ if (shingleBufferPosition == shingleBuf.Count)
{
- _nextToken = null;
- _shingleBufferPosition = 0;
+ nextToken = null;
+ shingleBufferPosition = 0;
}
return true;
}
-
- _nextToken = null;
- _shingleBufferPosition = 0;
+ else
+ {
+ nextToken = null;
+ shingleBufferPosition = 0;
+ }
}
}
- /// <summary>
- /// <p>
- /// Get the next token from the input stream and push it on the token buffer.
- /// If we encounter a token with position increment > 1, we put filler tokens
- /// on the token buffer.
- /// </p>
- /// Returns null when the end of the input stream is reached.
- /// </summary>
- /// <returns>the next token, or null if at end of input stream</returns>
+ private int numFillerTokensToInsert;
+ private AttributeSource.State currentToken;
+ private bool hasCurrentToken;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private TypeAttribute typeAtt;
+
+ /**
+ * Get the next token from the input stream and push it on the token buffer.
+ * If we encounter a token with position increment > 1, we put filler tokens
+ * on the token buffer.
+ * <p/>
+ * Returns false when the end of the input stream is reached.
+ * @return true if a token (real or filler) was produced, false if at end of input stream
+ * @throws IOException if the input stream has a problem
+ */
private bool GetNextToken()
{
+
while (true)
{
- if (_numFillerTokensToInsert > 0)
+ if (numFillerTokensToInsert > 0)
{
- if (_currentToken == null)
+ if (currentToken == null)
{
- _currentToken = CaptureState();
+ currentToken = CaptureState();
}
else
{
- RestoreState(_currentToken);
+ RestoreState(currentToken);
}
- _numFillerTokensToInsert--;
+ numFillerTokensToInsert--;
// A filler token occupies no space
- _offsetAtt.SetOffset(_offsetAtt.StartOffset(), _offsetAtt.StartOffset());
- _termAtt.SetTermBuffer(FillerToken, 0, FillerToken.Length);
+ offsetAtt.SetOffset(offsetAtt.StartOffset(), offsetAtt.StartOffset());
+ termAtt.SetTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.Length);
return true;
}
- if (_hasCurrentToken)
+ if (hasCurrentToken)
{
- if (_currentToken != null)
+ if (currentToken != null)
{
- RestoreState(_currentToken);
- _currentToken = null;
+ RestoreState(currentToken);
+ currentToken = null;
}
- _hasCurrentToken = false;
+ hasCurrentToken = false;
return true;
}
- if (!input.IncrementToken())
- return false;
+ if (!input.IncrementToken()) return false;
+ hasCurrentToken = true;
- _hasCurrentToken = true;
-
- if (_posIncrAtt.GetPositionIncrement() > 1)
- _numFillerTokensToInsert = _posIncrAtt.GetPositionIncrement() - 1;
+ if (posIncrAtt.GetPositionIncrement() > 1)
+ {
+ numFillerTokensToInsert = posIncrAtt.GetPositionIncrement() - 1;
+ }
}
}
- /// <summary>
- /// Fill the output buffer with new shingles.
- /// </summary>
- /// <exception cref="IOException">throws IOException if there's a problem getting the next token</exception>
- /// <returns></returns>
+ /**
+ * Fill the output buffer with new shingles.
+ *
+ * @throws IOException if there's a problem getting the next token
+ */
private bool FillShingleBuffer()
{
bool addedToken = false;
-
- // Try to fill the shingle buffer.
-
+ /*
+ * Try to fill the shingle buffer.
+ */
do
{
- if (!GetNextToken())
+ if (GetNextToken())
+ {
+ shingleBuf.AddLast(CaptureState());
+ if (shingleBuf.Count > maxShingleSize)
+ {
+ shingleBuf.RemoveFirst();
+ }
+ addedToken = true;
+ }
+ else
+ {
break;
+ }
+ } while (shingleBuf.Count < maxShingleSize);
- _shingleBuf.AddLast(CaptureState());
-
- if (_shingleBuf.Count > _maxShingleSize)
- _shingleBuf.RemoveFirst();
-
- addedToken = true;
- } while (_shingleBuf.Count < _maxShingleSize);
-
- if (_shingleBuf.Count == 0)
+ if (shingleBuf.Count == 0)
+ {
return false;
+ }
+ /*
+ * If no new token could be added to the shingle buffer, we have reached
+ * the end of the input stream and have to discard the least recent token.
+ */
+ if (!addedToken)
+ {
+ shingleBuf.RemoveFirst();
+ }
- // If no new token could be added to the shingle buffer, we have reached
- // the end of the input stream and have to discard the least recent token.
-
- if (! addedToken)
- _shingleBuf.RemoveFirst();
-
- if (_shingleBuf.Count == 0)
+ if (shingleBuf.Count == 0)
+ {
return false;
+ }
ClearShingles();
- _endOffsets = new int[_shingleBuf.Count];
- for (int i = 0; i < _endOffsets.Length; i++)
- {
- _endOffsets[i] = 0;
- }
-
- int shingleIndex = 0;
+ endOffsets = new int[shingleBuf.Count];
+ // Set all offsets to 0
+ endOffsets.Initialize();
- foreach (State state in _shingleBuf)
+ int i = 0;
+ for (IEnumerator<State> it = shingleBuf.GetEnumerator(); it.MoveNext(); )
{
- RestoreState(state);
-
- for (int j = shingleIndex; j < _shingles.Length; j++)
+ RestoreState(it.Current);
+ for (int j = i; j < shingles.Length; j++)
{
- if (_shingles[j].Length != 0)
- _shingles[j].Append(TokenSeparator);
-
- _shingles[j].Append(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
+ if (shingles[j].Length != 0)
+ {
+ shingles[j].Append(TOKEN_SEPARATOR);
+ }
+ shingles[j].Append(termAtt.TermBuffer().Take(termAtt.TermLength()).ToArray());
}
- _endOffsets[shingleIndex] = _offsetAtt.EndOffset();
- shingleIndex++;
+ endOffsets[i] = offsetAtt.EndOffset();
+ i++;
}
return true;
}
- /// <summary>
- /// Deprecated: Will be removed in Lucene 3.0. This method is readonly, as it should not be overridden.
- /// Delegates to the backwards compatibility layer.
- /// </summary>
- /// <param name="reusableToken"></param>
- /// <returns></returns>
- [Obsolete("The new IncrementToken() and AttributeSource APIs should be used instead.")]
- public override sealed Token Next(Token reusableToken)
- {
- return base.Next(reusableToken);
- }
-
- /// <summary>
- /// Deprecated: Will be removed in Lucene 3.0. This method is readonly, as it should not be overridden.
- /// Delegates to the backwards compatibility layer.
- /// </summary>
- /// <returns></returns>
- [Obsolete("The returned Token is a \"full private copy\" (not re-used across calls to Next()) but will be slower than calling Next(Token) or using the new IncrementToken() method with the new AttributeSource API.")]
- public override sealed Token Next()
- {
- return base.Next();
- }
-
public override void Reset()
{
base.Reset();
-
- _nextToken = null;
- _shingleBufferPosition = 0;
- _shingleBuf.Clear();
- _numFillerTokensToInsert = 0;
- _currentToken = null;
- _hasCurrentToken = false;
+ nextToken = null;
+ shingleBufferPosition = 0;
+ shingleBuf.Clear();
+ numFillerTokensToInsert = 0;
+ currentToken = null;
+ hasCurrentToken = false;
}
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleMatrixFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleMatrixFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleMatrixFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleMatrixFilter.cs Tue Feb 28 22:43:08 2012
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -20,10 +20,11 @@ using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Shingle.Codec;
+using Lucene.Net.Analysis.Shingle.Matrix;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analyzers.Miscellaneous;
-using Lucene.Net.Analyzers.Shingle.Codec;
-using Lucene.Net.Analyzers.Shingle.Matrix;
+using Lucene.Net.Support;
using FlagsAttribute = Lucene.Net.Analysis.Tokenattributes.FlagsAttribute;
namespace Lucene.Net.Analyzers.Shingle
@@ -103,7 +104,7 @@ namespace Lucene.Net.Analyzers.Shingle
/// <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
/// the ones located in org.apache.lucene.analysis.tokenattributes.</p>
/// </summary>
- public class ShingleMatrixFilter : TokenStream
+ public sealed class ShingleMatrixFilter : TokenStream
{
public static Char DefaultSpacerCharacter = '_';
public static TokenSettingsCodec DefaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec();
@@ -135,8 +136,8 @@ namespace Lucene.Net.Analyzers.Shingle
/// to get the same behaviour.
/// </p>
/// </summary>
- private readonly HashSet<SupportClass.EquatableList<Token>> _shinglesSeen =
- new HashSet<SupportClass.EquatableList<Token>>();
+ private readonly HashSet<EquatableList<Token>> _shinglesSeen =
+ new HashSet<EquatableList<Token>>();
private readonly TermAttribute _termAtt;
private readonly TypeAttribute _typeAtt;
@@ -158,13 +159,13 @@ namespace Lucene.Net.Analyzers.Shingle
/// todo: don't touch the matrix! use a bool, set the input stream to null or something, and keep track of where in the matrix we are at.
///
/// </summary>
- /// <param name="matrix">the input based for creating shingles. Does not need to contain any information until ShingleMatrixFilter.Next(Token) is called the first time.</param>
+ /// <param name="matrix">the input basis for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
- public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
+ public ShingleMatrixFilter(Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
{
Matrix = matrix;
MinimumShingleSize = minimumShingleSize;
@@ -174,23 +175,23 @@ namespace Lucene.Net.Analyzers.Shingle
_settingsCodec = settingsCodec;
// ReSharper disable DoNotCallOverridableMethodsInConstructor
- _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
- _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
- _payloadAtt = (PayloadAttribute) AddAttribute(typeof (PayloadAttribute));
- _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
- _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
- _flagsAtt = (FlagsAttribute) AddAttribute(typeof (FlagsAttribute));
+ _termAtt = AddAttribute<TermAttribute>();
+ _posIncrAtt = AddAttribute<PositionIncrementAttribute>();
+ _payloadAtt = AddAttribute<PayloadAttribute>();
+ _offsetAtt = AddAttribute<OffsetAttribute>();
+ _typeAtt = AddAttribute<TypeAttribute>();
+ _flagsAtt = AddAttribute<FlagsAttribute>();
// ReSharper restore DoNotCallOverridableMethodsInConstructor
// set the input to be an empty token stream, we already have the data.
_input = new EmptyTokenStream();
- _inTermAtt = (TermAttribute) _input.AddAttribute(typeof (TermAttribute));
- _inPosIncrAtt = (PositionIncrementAttribute) _input.AddAttribute(typeof (PositionIncrementAttribute));
- _inPayloadAtt = (PayloadAttribute) _input.AddAttribute(typeof (PayloadAttribute));
- _inOffsetAtt = (OffsetAttribute) _input.AddAttribute(typeof (OffsetAttribute));
- _inTypeAtt = (TypeAttribute) _input.AddAttribute(typeof (TypeAttribute));
- _inFlagsAtt = (FlagsAttribute) _input.AddAttribute(typeof (FlagsAttribute));
+ _inTermAtt = _input.AddAttribute<TermAttribute>();
+ _inPosIncrAtt = _input.AddAttribute<PositionIncrementAttribute>();
+ _inPayloadAtt = _input.AddAttribute<PayloadAttribute>();
+ _inOffsetAtt = _input.AddAttribute<OffsetAttribute>();
+ _inTypeAtt = _input.AddAttribute<TypeAttribute>();
+ _inFlagsAtt = _input.AddAttribute<FlagsAttribute>();
}
/// <summary>
@@ -250,27 +251,27 @@ namespace Lucene.Net.Analyzers.Shingle
_settingsCodec = settingsCodec;
// ReSharper disable DoNotCallOverridableMethodsInConstructor
- _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
- _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
- _payloadAtt = (PayloadAttribute) AddAttribute(typeof (PayloadAttribute));
- _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
- _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
- _flagsAtt = (FlagsAttribute) AddAttribute(typeof (FlagsAttribute));
+ _termAtt = AddAttribute<TermAttribute>();
+ _posIncrAtt = AddAttribute<PositionIncrementAttribute>();
+ _payloadAtt = AddAttribute<PayloadAttribute>();
+ _offsetAtt = AddAttribute<OffsetAttribute>();
+ _typeAtt = AddAttribute<TypeAttribute>();
+ _flagsAtt = AddAttribute<FlagsAttribute>();
// ReSharper restore DoNotCallOverridableMethodsInConstructor
- _inTermAtt = (TermAttribute) input.AddAttribute(typeof (TermAttribute));
- _inPosIncrAtt = (PositionIncrementAttribute) input.AddAttribute(typeof (PositionIncrementAttribute));
- _inPayloadAtt = (PayloadAttribute) input.AddAttribute(typeof (PayloadAttribute));
- _inOffsetAtt = (OffsetAttribute) input.AddAttribute(typeof (OffsetAttribute));
- _inTypeAtt = (TypeAttribute) input.AddAttribute(typeof (TypeAttribute));
- _inFlagsAtt = (FlagsAttribute) input.AddAttribute(typeof (FlagsAttribute));
+ _inTermAtt = input.AddAttribute<TermAttribute>();
+ _inPosIncrAtt = input.AddAttribute<PositionIncrementAttribute>();
+ _inPayloadAtt = input.AddAttribute<PayloadAttribute>();
+ _inOffsetAtt = input.AddAttribute<OffsetAttribute>();
+ _inTypeAtt = input.AddAttribute<TypeAttribute>();
+ _inFlagsAtt = input.AddAttribute<FlagsAttribute>();
}
public int MinimumShingleSize { get; set; }
public int MaximumShingleSize { get; set; }
- public Matrix.Matrix Matrix { get; set; }
+ public Matrix Matrix { get; set; }
public Char? SpacerCharacter { get; set; }
@@ -283,11 +284,16 @@ namespace Lucene.Net.Analyzers.Shingle
_input.Reset();
}
+ protected override void Dispose(bool disposing)
+ {
+ // Do nothing
+ }
+
public override sealed bool IncrementToken()
{
if (Matrix == null)
{
- Matrix = new Matrix.Matrix();
+ Matrix = new Matrix();
// fill matrix with maximumShingleSize columns
while (Matrix.Columns.Count < MaximumShingleSize && ReadColumn())
@@ -333,25 +339,16 @@ namespace Lucene.Net.Analyzers.Shingle
return token;
}
- /// <summary>
- /// Deprecated: Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
- /// </summary>
- /// <param name="reusableToken"></param>
- /// <returns></returns>
- [Obsolete("The new IncrementToken() and AttributeSource APIs should be used instead.")]
- public override sealed Token Next(Token reusableToken)
+ private Token GetNextToken(Token token)
{
- return base.Next(reusableToken);
- }
-
- /// <summary>
- /// Deprecated: Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
- /// </summary>
- /// <returns></returns>
- [Obsolete("The returned Token is a \"full private copy\" (not re-used across calls to Next()) but will be slower than calling Next(Token) or using the new IncrementToken() method with the new AttributeSource API.")]
- public override sealed Token Next()
- {
- return base.Next();
+ if (!this.IncrementToken()) return null;
+ token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
+ token.SetPositionIncrement(_posIncrAtt.GetPositionIncrement());
+ token.SetFlags(_flagsAtt.GetFlags());
+ token.SetOffset(_offsetAtt.StartOffset(), _offsetAtt.EndOffset());
+ token.SetType(_typeAtt.Type());
+ token.SetPayload(_payloadAtt.GetPayload());
+ return token;
}
/// <summary>
@@ -377,12 +374,12 @@ namespace Lucene.Net.Analyzers.Shingle
_currentShingleLength == 1 &&
(_currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsFirst || _currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsLast))
{
- return Next();
+ return GetNextToken(reusableToken);
}
var termLength = 0;
- var shingle = new SupportClass.EquatableList<Token>();
+ var shingle = new EquatableList<Token>();
for (int i = 0; i < _currentShingleLength; i++)
{
@@ -521,7 +518,7 @@ namespace Lucene.Net.Analyzers.Shingle
}
/// <summary>
- /// Final touch of a shingle token before it is passed on to the consumer from method <see cref="Next(Token)"/>.
+ /// Final touch of a shingle token before it is passed on to the consumer from method <see cref="IncrementToken()"/>.
///
/// Calculates and sets type, flags, position increment, start/end offsets and weight.
/// </summary>
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/TokenPositioner.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/TokenPositioner.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/TokenPositioner.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/TokenPositioner.cs Tue Feb 28 22:43:08 2012
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/WordlistLoader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/WordlistLoader.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/WordlistLoader.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/WordlistLoader.cs Tue Feb 28 22:43:08 2012
@@ -1,125 +1,125 @@
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
-*/
-
-using System;
-using System.IO;
-using System.Collections;
-
-namespace Lucene.Net.Analysis
-{
- /// <summary>
- /// Loads a text file and adds every line as an entry to a Hashtable. Every line
- /// should contain only one word. If the file is not found or on any error, an
- /// empty table is returned.
- /// </summary>
- public class WordlistLoader
- {
- /// <summary>
- /// Load words table from the file
- /// </summary>
- /// <param name="path">Path to the wordlist</param>
- /// <param name="wordfile">Name of the wordlist</param>
- /// <returns></returns>
- public static Hashtable GetWordtable( String path, String wordfile )
- {
- if ( path == null || wordfile == null )
- {
- return new Hashtable();
- }
- return GetWordtable(new FileInfo(path + "\\" + wordfile));
- }
-
- /// <summary>
- /// Load words table from the file
- /// </summary>
- /// <param name="wordfile">Complete path to the wordlist</param>
- /// <returns></returns>
- public static Hashtable GetWordtable( String wordfile )
- {
- if ( wordfile == null )
- {
- return new Hashtable();
- }
- return GetWordtable( new FileInfo( wordfile ) );
- }
-
- /// <summary>
- /// Load words table from the file
- /// </summary>
- /// <param name="wordfile">File containing the wordlist</param>
- /// <returns></returns>
- public static Hashtable GetWordtable( FileInfo wordfile )
- {
- if ( wordfile == null )
- {
- return new Hashtable();
- }
- StreamReader lnr = new StreamReader(wordfile.FullName);
- return GetWordtable(lnr);
- }
-
- /// <summary>
- /// Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
- /// leading and trailing whitespace). Every line of the Reader should contain only
- /// one word. The words need to be in lowercase if you make use of an
- /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
- /// </summary>
- /// <param name="reader">Reader containing the wordlist</param>
- /// <returns>A Hashtable with the reader's words</returns>
- public static Hashtable GetWordtable(TextReader reader)
- {
- Hashtable result = new Hashtable();
- try
- {
- ArrayList stopWords = new ArrayList();
- String word = null;
- while ( ( word = reader.ReadLine() ) != null )
- {
- stopWords.Add(word.Trim());
- }
- result = MakeWordTable( (String[])stopWords.ToArray(typeof(string)), stopWords.Count);
- }
- // On error, use an empty table
- catch (IOException)
- {
- result = new Hashtable();
- }
- return result;
- }
-
-
- /// <summary>
- /// Builds the wordlist table.
- /// </summary>
- /// <param name="words">Word that where read</param>
- /// <param name="length">Amount of words that where read into <tt>words</tt></param>
- /// <returns></returns>
- private static Hashtable MakeWordTable( String[] words, int length )
- {
- Hashtable table = new Hashtable( length );
- for ( int i = 0; i < length; i++ )
- {
- table.Add(words[i], words[i]);
- }
- return table;
- }
- }
-}
\ No newline at end of file
+///*
+// *
+// * Licensed to the Apache Software Foundation (ASF) under one
+// * or more contributor license agreements. See the NOTICE file
+// * distributed with this work for additional information
+// * regarding copyright ownership. The ASF licenses this file
+// * to you under the Apache License, Version 2.0 (the
+// * "License"); you may not use this file except in compliance
+// * with the License. You may obtain a copy of the License at
+// *
+// * http://www.apache.org/licenses/LICENSE-2.0
+// *
+// * Unless required by applicable law or agreed to in writing,
+// * software distributed under the License is distributed on an
+// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// * KIND, either express or implied. See the License for the
+// * specific language governing permissions and limitations
+// * under the License.
+// *
+//*/
+
+//using System;
+//using System.IO;
+//using System.Collections;
+
+//namespace Lucene.Net.Analysis
+//{
+// /// <summary>
+// /// Loads a text file and adds every line as an entry to a Hashtable. Every line
+// /// should contain only one word. If the file is not found or on any error, an
+// /// empty table is returned.
+// /// </summary>
+// public class WordlistLoader
+// {
+// /// <summary>
+// /// Load words table from the file
+// /// </summary>
+// /// <param name="path">Path to the wordlist</param>
+// /// <param name="wordfile">Name of the wordlist</param>
+// /// <returns></returns>
+// public static Hashtable GetWordSet( String path, String wordfile )
+// {
+// if ( path == null || wordfile == null )
+// {
+// return new Hashtable();
+// }
+// return GetWordSet(new FileInfo(path + "\\" + wordfile));
+// }
+
+// /// <summary>
+// /// Load words table from the file
+// /// </summary>
+// /// <param name="wordfile">Complete path to the wordlist</param>
+// /// <returns></returns>
+// public static Hashtable GetWordSet( String wordfile )
+// {
+// if ( wordfile == null )
+// {
+// return new Hashtable();
+// }
+// return GetWordSet( new FileInfo( wordfile ) );
+// }
+
+// /// <summary>
+// /// Load words table from the file
+// /// </summary>
+// /// <param name="wordfile">File containing the wordlist</param>
+// /// <returns></returns>
+// public static Hashtable GetWordSet( FileInfo wordfile )
+// {
+// if ( wordfile == null )
+// {
+// return new Hashtable();
+// }
+// StreamReader lnr = new StreamReader(wordfile.FullName);
+// return GetWordSet(lnr);
+// }
+
+// /// <summary>
+// /// Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
+// /// leading and trailing whitespace). Every line of the Reader should contain only
+// /// one word. The words need to be in lowercase if you make use of an
+// /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+// /// </summary>
+// /// <param name="reader">Reader containing the wordlist</param>
+// /// <returns>A Hashtable with the reader's words</returns>
+// public static Hashtable GetWordSet(TextReader reader)
+// {
+// Hashtable result = new Hashtable();
+// try
+// {
+// ArrayList stopWords = new ArrayList();
+// String word = null;
+// while ( ( word = reader.ReadLine() ) != null )
+// {
+// stopWords.Add(word.Trim());
+// }
+// result = MakeWordTable( (String[])stopWords.ToArray(typeof(string)), stopWords.Count);
+// }
+// // On error, use an empty table
+// catch (IOException)
+// {
+// result = new Hashtable();
+// }
+// return result;
+// }
+
+
+// /// <summary>
+// /// Builds the wordlist table.
+// /// </summary>
+// /// <param name="words">Word that where read</param>
+// /// <param name="length">Amount of words that where read into <tt>words</tt></param>
+// /// <returns></returns>
+// private static Hashtable MakeWordTable( String[] words, int length )
+// {
+// Hashtable table = new Hashtable( length );
+// for ( int i = 0; i < length; i++ )
+// {
+// table.Add(words[i], words[i]);
+// }
+// return table;
+// }
+// }
+//}
Modified: incubator/lucene.net/trunk/src/contrib/Core/Analysis/Ext/Analysis.Ext.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Core/Analysis/Ext/Analysis.Ext.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
Binary files - no diff available.
Modified: incubator/lucene.net/trunk/src/contrib/FastVectorHighlighter/FieldQuery.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/FastVectorHighlighter/FieldQuery.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/FastVectorHighlighter/FieldQuery.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/FastVectorHighlighter/FieldQuery.cs Tue Feb 28 22:43:08 2012
@@ -78,10 +78,8 @@ namespace Lucene.Net.Search.Vectorhighli
else if (sourceQuery is DisjunctionMaxQuery)
{
DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
- System.Collections.IEnumerator en = dmq.Iterator();
- while (en.MoveNext())
+ foreach(Query query in dmq)
{
- Query query = (Query)en.Current;
flatten(query, flatQueries);
}
}
Modified: incubator/lucene.net/trunk/src/contrib/FastVectorHighlighter/FieldTermStack.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/FastVectorHighlighter/FieldTermStack.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/FastVectorHighlighter/FieldTermStack.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/FastVectorHighlighter/FieldTermStack.cs Tue Feb 28 22:43:08 2012
@@ -42,7 +42,7 @@ namespace Lucene.Net.Search.Vectorhighli
public static void Main(String[] args)
{
Analyzer analyzer = new WhitespaceAnalyzer();
- QueryParser parser = new QueryParser("f", analyzer);
+ QueryParser parser = new QueryParser(Util.Version.LUCENE_CURRENT, "f", analyzer);
Query query = parser.Parse("a x:b");
FieldQuery fieldQuery = new FieldQuery(query, true, false);
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/Highlighter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/Highlighter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/Highlighter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/Highlighter.cs Tue Feb 28 22:43:08 2012
@@ -16,10 +16,10 @@
*/
using System;
+using Lucene.Net.Util;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
-using PriorityQueue = Lucene.Net.Util.PriorityQueue;
namespace Lucene.Net.Highlight
{
@@ -198,151 +198,152 @@ namespace Lucene.Net.Highlight
/// <throws> IOException </throws>
public TextFragment[] GetBestTextFragments(TokenStream tokenStream, System.String text, bool mergeContiguousFragments, int maxNumFragments)
{
- System.Collections.ArrayList docFrags = new System.Collections.ArrayList();
- System.Text.StringBuilder newText = new System.Text.StringBuilder();
+ //System.Collections.ArrayList docFrags = new System.Collections.ArrayList();
+ //System.Text.StringBuilder newText = new System.Text.StringBuilder();
- TextFragment currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
- fragmentScorer.StartFragment(currentFrag);
- docFrags.Add(currentFrag);
+ //TextFragment currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
+ //fragmentScorer.StartFragment(currentFrag);
+ //docFrags.Add(currentFrag);
- FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
+ //FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
- try
- {
- Lucene.Net.Analysis.Token token;
- System.String tokenText;
- int startOffset;
- int endOffset;
- int lastEndOffset = 0;
- textFragmenter.Start(text);
+ //try
+ //{
+ // Lucene.Net.Analysis.Token token;
+ // System.String tokenText;
+ // int startOffset;
+ // int endOffset;
+ // int lastEndOffset = 0;
+ // textFragmenter.Start(text);
- TokenGroup tokenGroup = new TokenGroup();
- token = tokenStream.Next();
- while ((token != null) && (token.StartOffset() < maxDocBytesToAnalyze))
- {
- if ((tokenGroup.numTokens > 0) && (tokenGroup.IsDistinct(token)))
- {
- //the current token is distinct from previous tokens -
- // markup the cached token group info
- startOffset = tokenGroup.matchStartOffset;
- endOffset = tokenGroup.matchEndOffset;
- tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
- System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
- //store any whitespace etc from between this and last group
- if (startOffset > lastEndOffset)
- newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
- newText.Append(markedUpText);
- lastEndOffset = System.Math.Max(endOffset, lastEndOffset);
- tokenGroup.Clear();
+ // TokenGroup tokenGroup = new TokenGroup();
+ // token = tokenStream.Next();
+ // while ((token != null) && (token.StartOffset() < maxDocBytesToAnalyze))
+ // {
+ // if ((tokenGroup.numTokens > 0) && (tokenGroup.IsDistinct(token)))
+ // {
+ // //the current token is distinct from previous tokens -
+ // // markup the cached token group info
+ // startOffset = tokenGroup.matchStartOffset;
+ // endOffset = tokenGroup.matchEndOffset;
+ // tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
+ // System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
+ // //store any whitespace etc from between this and last group
+ // if (startOffset > lastEndOffset)
+ // newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
+ // newText.Append(markedUpText);
+ // lastEndOffset = System.Math.Max(endOffset, lastEndOffset);
+ // tokenGroup.Clear();
- //check if current token marks the start of a new fragment
- if (textFragmenter.IsNewFragment(token))
- {
- currentFrag.SetScore(fragmentScorer.GetFragmentScore());
- //record stats for a new fragment
- currentFrag.textEndPos = newText.Length;
- currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
- fragmentScorer.StartFragment(currentFrag);
- docFrags.Add(currentFrag);
- }
- }
+ // //check if current token marks the start of a new fragment
+ // if (textFragmenter.IsNewFragment(token))
+ // {
+ // currentFrag.SetScore(fragmentScorer.GetFragmentScore());
+ // //record stats for a new fragment
+ // currentFrag.textEndPos = newText.Length;
+ // currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
+ // fragmentScorer.StartFragment(currentFrag);
+ // docFrags.Add(currentFrag);
+ // }
+ // }
- tokenGroup.AddToken(token, fragmentScorer.GetTokenScore(token));
+ // tokenGroup.AddToken(token, fragmentScorer.GetTokenScore(token));
- // if(lastEndOffset>maxDocBytesToAnalyze)
- // {
- // break;
- // }
- token = tokenStream.Next();
- }
- currentFrag.SetScore(fragmentScorer.GetFragmentScore());
+ // // if(lastEndOffset>maxDocBytesToAnalyze)
+ // // {
+ // // break;
+ // // }
+ // token = tokenStream.Next();
+ // }
+ // currentFrag.SetScore(fragmentScorer.GetFragmentScore());
- if (tokenGroup.numTokens > 0)
- {
- //flush the accumulated text (same code as in above loop)
- startOffset = tokenGroup.matchStartOffset;
- endOffset = tokenGroup.matchEndOffset;
- tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
- System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
- //store any whitespace etc from between this and last group
- if (startOffset > lastEndOffset)
- newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
- newText.Append(markedUpText);
- lastEndOffset = System.Math.Max(lastEndOffset, endOffset);
- }
+ // if (tokenGroup.numTokens > 0)
+ // {
+ // //flush the accumulated text (same code as in above loop)
+ // startOffset = tokenGroup.matchStartOffset;
+ // endOffset = tokenGroup.matchEndOffset;
+ // tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
+ // System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
+ // //store any whitespace etc from between this and last group
+ // if (startOffset > lastEndOffset)
+ // newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
+ // newText.Append(markedUpText);
+ // lastEndOffset = System.Math.Max(lastEndOffset, endOffset);
+ // }
- //Test what remains of the original text beyond the point where we stopped analyzing
- if ((lastEndOffset < text.Length) && (text.Length < maxDocBytesToAnalyze))
- {
- //append it to the last fragment
- newText.Append(encoder.EncodeText(text.Substring(lastEndOffset)));
- }
+ // //Test what remains of the original text beyond the point where we stopped analyzing
+ // if ((lastEndOffset < text.Length) && (text.Length < maxDocBytesToAnalyze))
+ // {
+ // //append it to the last fragment
+ // newText.Append(encoder.EncodeText(text.Substring(lastEndOffset)));
+ // }
- currentFrag.textEndPos = newText.Length;
+ // currentFrag.textEndPos = newText.Length;
- //sort the most relevant sections of the text
- for (System.Collections.IEnumerator i = docFrags.GetEnumerator(); i.MoveNext(); )
- {
- currentFrag = (TextFragment) i.Current;
+ // //sort the most relevant sections of the text
+ // for (System.Collections.IEnumerator i = docFrags.GetEnumerator(); i.MoveNext(); )
+ // {
+ // currentFrag = (TextFragment) i.Current;
- //If you are running with a version of Lucene before 11th Sept 03
- // you do not have PriorityQueue.insert() - so uncomment the code below
- /*
- if (currentFrag.getScore() >= minScore)
- {
- fragQueue.put(currentFrag);
- if (fragQueue.size() > maxNumFragments)
- { // if hit queue overfull
- fragQueue.pop(); // remove lowest in hit queue
- minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
- }
+ // //If you are running with a version of Lucene before 11th Sept 03
+ // // you do not have PriorityQueue.insert() - so uncomment the code below
+ // /*
+ // if (currentFrag.getScore() >= minScore)
+ // {
+ // fragQueue.put(currentFrag);
+ // if (fragQueue.size() > maxNumFragments)
+ // { // if hit queue overfull
+ // fragQueue.pop(); // remove lowest in hit queue
+ // minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
+ // }
- }
- */
- //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
- //fix to PriorityQueue. The correct method to use here is the new "insert" method
- // USE ABOVE CODE IF THIS DOES NOT COMPILE!
- fragQueue.Insert(currentFrag);
- }
+ // }
+ // */
+ // //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
+ // //fix to PriorityQueue. The correct method to use here is the new "insert" method
+ // // USE ABOVE CODE IF THIS DOES NOT COMPILE!
+ // fragQueue.Insert(currentFrag);
+ // }
- //return the most relevant fragments
- TextFragment[] frag = new TextFragment[fragQueue.Size()];
- for (int i = frag.Length - 1; i >= 0; i--)
- {
- frag[i] = (TextFragment) fragQueue.Pop();
- }
+ // //return the most relevant fragments
+ // TextFragment[] frag = new TextFragment[fragQueue.Size()];
+ // for (int i = frag.Length - 1; i >= 0; i--)
+ // {
+ // frag[i] = (TextFragment) fragQueue.Pop();
+ // }
- //merge any contiguous fragments to improve readability
- if (mergeContiguousFragments)
- {
- MergeContiguousFragments(frag);
- System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
- for (int i = 0; i < frag.Length; i++)
- {
- if ((frag[i] != null) && (frag[i].GetScore() > 0))
- {
- fragTexts.Add(frag[i]);
- }
- }
- frag = (TextFragment[]) fragTexts.ToArray(typeof(TextFragment));
- }
+ // //merge any contiguous fragments to improve readability
+ // if (mergeContiguousFragments)
+ // {
+ // MergeContiguousFragments(frag);
+ // System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
+ // for (int i = 0; i < frag.Length; i++)
+ // {
+ // if ((frag[i] != null) && (frag[i].GetScore() > 0))
+ // {
+ // fragTexts.Add(frag[i]);
+ // }
+ // }
+ // frag = (TextFragment[]) fragTexts.ToArray(typeof(TextFragment));
+ // }
- return frag;
- }
- finally
- {
- if (tokenStream != null)
- {
- try
- {
- tokenStream.Close();
- }
- catch (System.Exception e)
- {
- }
- }
- }
+ // return frag;
+ //}
+ //finally
+ //{
+ // if (tokenStream != null)
+ // {
+ // try
+ // {
+ // tokenStream.Close();
+ // }
+ // catch (System.Exception e)
+ // {
+ // }
+ // }
+ //}
+ throw new NotImplementedException("Not yet ported to 3.0.3");
}
@@ -507,21 +508,19 @@ namespace Lucene.Net.Highlight
}
}
- class FragmentQueue : PriorityQueue
+ class FragmentQueue : PriorityQueue<TextFragment>
{
public FragmentQueue(int size)
{
Initialize(size);
}
-
- public override bool LessThan(System.Object a, System.Object b)
+
+ public override bool LessThan(TextFragment a, TextFragment b)
{
- TextFragment fragA = (TextFragment) a;
- TextFragment fragB = (TextFragment) b;
- if (fragA.GetScore() == fragB.GetScore())
- return fragA.fragNum > fragB.fragNum;
+ if (a.GetScore() == b.GetScore())
+ return a.fragNum > b.fragNum;
else
- return fragA.GetScore() < fragB.GetScore();
+ return a.GetScore() < b.GetScore();
}
}
}
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/QueryScorer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/QueryScorer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/QueryScorer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/QueryScorer.cs Tue Feb 28 22:43:08 2012
@@ -102,7 +102,7 @@ namespace Lucene.Net.Highlight
*/
public virtual float GetTokenScore(Token token)
{
- System.String termText = token.TermText();
+ System.String termText = token.Term();
WeightedTerm queryTerm = (WeightedTerm) termsToFind[termText];
if (queryTerm == null)
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermExtractor.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermExtractor.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermExtractor.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermExtractor.cs Tue Feb 28 22:43:08 2012
@@ -16,6 +16,7 @@
*/
using System;
+using System.Collections.Generic;
using IndexReader = Lucene.Net.Index.IndexReader;
using Term = Lucene.Net.Index.Term;
using BooleanClause = Lucene.Net.Search.BooleanClause;
@@ -134,16 +135,14 @@ namespace Lucene.Net.Highlight
GetTermsFromFilteredQuery((FilteredQuery) query, terms, prohibited, fieldName);
else
{
- System.Collections.Hashtable nonWeightedTerms = new System.Collections.Hashtable();
+ var nonWeightedTerms = new HashSet<Term>();
query.ExtractTerms(nonWeightedTerms);
- System.Collections.IDictionaryEnumerator iter = nonWeightedTerms.GetEnumerator();
- while (iter.MoveNext())
+ foreach (var term in nonWeightedTerms)
{
- Term term = (Term)iter.Value;
if ((fieldName == null) || (term.Field() == fieldName))
{
- WeightedTerm temp = new WeightedTerm(query.GetBoost(), term.Text());
+ var temp = new WeightedTerm(query.GetBoost(), term.Text());
terms.Add(temp, temp);
}
}
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/TokenSources.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/TokenSources.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/TokenSources.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/TokenSources.cs Tue Feb 28 22:43:08 2012
@@ -19,6 +19,7 @@
* Created on 28-Oct-2004
*/
using System;
+using Lucene.Net.Analysis.Tokenattributes;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
@@ -43,18 +44,32 @@ namespace Lucene.Net.Highlight
{
internal Token[] tokens;
internal int currentToken = 0;
+ TermAttribute termAtt;
+ OffsetAttribute offsetAtt;
+
internal StoredTokenStream(Token[] tokens)
{
this.tokens = tokens;
+ termAtt = AddAttribute<TermAttribute>();
+ offsetAtt = AddAttribute<OffsetAttribute>();
}
- public override Token Next()
+ public override bool IncrementToken()
{
- if (currentToken >= tokens.Length)
- {
- return null;
- }
- return tokens[currentToken++];
+ if (currentToken >= tokens.Length)
+ {
+ return false;
+ }
+ ClearAttributes();
+ Token token = tokens[currentToken++];
+ termAtt.SetTermBuffer(token.Term());
+ offsetAtt.SetOffset(token.StartOffset(), token.EndOffset());
+ return true;
}
+
+ protected override void Dispose(bool disposing)
+ {
+ // do nothing
+ }
}
private class AnonymousClassComparator : System.Collections.IComparer
{
Modified: incubator/lucene.net/trunk/src/contrib/Queries/BooleanFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Queries/BooleanFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Queries/BooleanFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Queries/BooleanFilter.cs Tue Feb 28 22:43:08 2012
@@ -21,6 +21,7 @@ using System.Linq;
using System.Text;
using Lucene.Net.Index;
+using Lucene.Net.Support;
using Lucene.Net.Util;
namespace Lucene.Net.Search
@@ -51,7 +52,7 @@ namespace Lucene.Net.Search
/// <returns></returns>
private DocIdSetIterator GetDISI(List<Filter> filters, int index, IndexReader reader)
{
- return ((Filter)filters[index]).GetDocIdSet(reader).Iterator();
+ return filters[index].GetDocIdSet(reader).Iterator();
}
/// <summary>
@@ -73,7 +74,7 @@ namespace Lucene.Net.Search
}
else
{
- DocIdSet dis = ((Filter)shouldFilters[i]).GetDocIdSet(reader);
+ DocIdSet dis = shouldFilters[i].GetDocIdSet(reader);
if (dis is OpenBitSet)
{
// optimized case for OpenBitSets
@@ -98,7 +99,7 @@ namespace Lucene.Net.Search
}
else
{
- DocIdSet dis = ((Filter)notFilters[i]).GetDocIdSet(reader);
+ DocIdSet dis = notFilters[i].GetDocIdSet(reader);
if (dis is OpenBitSet)
{
// optimized case for OpenBitSets
@@ -122,7 +123,7 @@ namespace Lucene.Net.Search
}
else
{
- DocIdSet dis = ((Filter)mustFilters[i]).GetDocIdSet(reader);
+ DocIdSet dis = mustFilters[i].GetDocIdSet(reader);
if (dis is OpenBitSet)
{
// optimized case for OpenBitSets
@@ -139,58 +140,52 @@ namespace Lucene.Net.Search
if (res != null)
return FinalResult(res, reader.MaxDoc());
- else
- {
- //TODO: 2.- change return DocIdSet.EMPTY_DOCIDSET;
- return null;
- }
+ return DocIdSet.EMPTY_DOCIDSET;
+ }
+
+ /** Returns the given result unchanged. Earlier versions provided a SortedVIntList
+ * when it was definitely smaller than an OpenBitSet; that is no longer done.
+ * @deprecated Either use CachingWrapperFilter, or
+ * switch to a different DocIdSet implementation yourself.
+ * This method will be removed in Lucene 4.0
+ */
+ protected DocIdSet FinalResult(OpenBitSetDISI result, int maxDocs)
+ {
+ return result;
}
/// <summary>
/// Add a filter clause.
/// </summary>
/// <param name="filterClause">The clause to add.</param>
- public void Add(BooleanFilterClause filterClause)
+ public void Add(FilterClause filterClause)
{
- if (filterClause.Occur == BooleanClause.Occur.MUST)
+ if (filterClause.GetOccur() == BooleanClause.Occur.MUST)
{
if (mustFilters == null)
{
- mustFilters = new List<Filter>();
+ mustFilters = new EquatableList<Filter>();
}
- mustFilters.Add(filterClause.Filter);
+ mustFilters.Add(filterClause.GetFilter());
}
- if (filterClause.Occur == BooleanClause.Occur.SHOULD)
+ if (filterClause.GetOccur() == BooleanClause.Occur.SHOULD)
{
if (shouldFilters == null)
{
- shouldFilters = new List<Filter>();
+ shouldFilters = new EquatableList<Filter>();
}
- shouldFilters.Add(filterClause.Filter);
+ shouldFilters.Add(filterClause.GetFilter());
}
- if (filterClause.Occur == BooleanClause.Occur.MUST_NOT)
+ if (filterClause.GetOccur() == BooleanClause.Occur.MUST_NOT)
{
if (notFilters == null)
{
- notFilters = new List<Filter>();
+ notFilters = new EquatableList<Filter>();
}
- notFilters.Add(filterClause.Filter);
+ notFilters.Add(filterClause.GetFilter());
}
}
- // TODO: in 3.0, instead of removing this deprecated
- // method, make it a no-op and mark it final
- /** Provide a SortedVIntList when it is definitely smaller
- * than an OpenBitSet.
- * @deprecated Either use CachingWrapperFilter, or
- * switch to a different DocIdSet implementation yourself. */
- protected DocIdSet FinalResult(OpenBitSetDISI result, int maxDocs)
- {
- return (result.Cardinality() < (maxDocs / 9))
- ? (DocIdSet)new SortedVIntList(result)
- : (DocIdSet)result;
- }
-
/// <summary>
/// Determine equality between two lists.
/// </summary>
@@ -283,38 +278,4 @@ namespace Lucene.Net.Search
}
}
}
-
- /// <summary>
- /// A spefic clause that makes up a part of the BooleanFilter
- /// </summary>
- public class BooleanFilterClause
- {
- /// <summary>
- /// Create a new BooleanFilterClause
- /// </summary>
- /// <param name="filter">A Filter object</param>
- /// <param name="occur">A parameter implementation indicating SHOULD, MUST or MUST NOT</param>
- public BooleanFilterClause(Filter filter, BooleanClause.Occur occur)
- {
- this.Occur = occur;
- this.Filter = filter;
- }
-
- /// <summary>
- /// The underlying filter for the clause.
- /// </summary>
- public Filter Filter
- {
- get;
- private set;
- }
- /// <summary>
- /// The occurrence of this clause.
- /// </summary>
- public BooleanClause.Occur Occur
- {
- get;
- private set;
- }
- }
}
Modified: incubator/lucene.net/trunk/src/contrib/Queries/Contrib.Queries.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Queries/Contrib.Queries.csproj?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Queries/Contrib.Queries.csproj (original)
+++ incubator/lucene.net/trunk/src/contrib/Queries/Contrib.Queries.csproj Tue Feb 28 22:43:08 2012
@@ -19,7 +19,6 @@
under the License.
-->
-
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
@@ -92,7 +91,6 @@
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Similar\MoreLikeThisQuery.cs" />
<Compile Include="Similar\SimilarityQueries.cs" />
- <Compile Include="Support.cs" />
<Compile Include="TermsFilter.cs" />
</ItemGroup>
<ItemGroup>
@@ -126,6 +124,9 @@
<ItemGroup>
<None Include="Lucene.Net.snk" />
</ItemGroup>
+ <ItemGroup>
+ <Content Include="FileDiffs.txt" />
+ </ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
Modified: incubator/lucene.net/trunk/src/contrib/Queries/DuplicateFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Queries/DuplicateFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Queries/DuplicateFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Queries/DuplicateFilter.cs Tue Feb 28 22:43:08 2012
@@ -28,7 +28,6 @@ namespace Lucene.Net.Search
{
public class DuplicateFilter : Filter
{
-
String fieldName;
/**
@@ -82,7 +81,6 @@ namespace Lucene.Net.Search
private OpenBitSet CorrectBits(IndexReader reader)
{
-
OpenBitSet bits = new OpenBitSet(reader.MaxDoc()); //assume all are INvalid
Term startTerm = new Term(fieldName);
TermEnum te = reader.Terms(startTerm);
@@ -121,7 +119,6 @@ namespace Lucene.Net.Search
private OpenBitSet FastBits(IndexReader reader)
{
-
OpenBitSet bits = new OpenBitSet(reader.MaxDoc());
bits.Set(0, reader.MaxDoc()); //assume all are valid
Term startTerm = new Term(fieldName);
@@ -163,28 +160,6 @@ namespace Lucene.Net.Search
return bits;
}
- // /**
- // * <param name="args"></param>
- // * @throws IOException
- // * @throws Exception
- // */
- // public static void main(String[] args)
- // {
- // IndexReader r=IndexReader.open("/indexes/personCentricAnon");
- //// IndexReader r=IndexReader.open("/indexes/enron");
- // long start=System.currentTimeMillis();
- //// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
- //// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
- // DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
- //// DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
- //// df.setProcessingMode(PM_SLOW_VALIDATION);
- // BitSet b = df.bits(r);
- // long end=System.currentTimeMillis()-start;
- // System.out.println(b.cardinality()+" in "+end+" ms ");
-
- // }
-
-
public String GetFieldName()
{
return fieldName;
Modified: incubator/lucene.net/trunk/src/contrib/Queries/FilterClause.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Queries/FilterClause.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Queries/FilterClause.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Queries/FilterClause.cs Tue Feb 28 22:43:08 2012
@@ -22,11 +22,45 @@ using System.Text;
namespace Lucene.Net.Search
{
- class FilterClause
+ /**
+ * A Filter that is wrapped with an indication of how that filter
+ * is used when composed with another filter.
+ * (Follows the boolean logic in BooleanClause for composition
+ * of queries.)
+ */
+ [Serializable]
+ public class FilterClause
{
- public FilterClause()
+ BooleanClause.Occur occur;
+ Filter filter;
+
+ /**
+ * Create a new FilterClause
+ * @param filter A Filter object containing a BitSet
+ * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
+ */
+ public FilterClause(Filter filter, BooleanClause.Occur occur)
+ {
+ this.occur = occur;
+ this.filter = filter;
+ }
+
+ /**
+ * Returns this FilterClause's filter
+ * @return A Filter object
+ */
+ public Filter GetFilter()
+ {
+ return filter;
+ }
+
+ /**
+ * Returns this FilterClause's occur parameter
+ * @return An Occur object
+ */
+ public BooleanClause.Occur GetOccur()
{
- throw new NotImplementedException("Not implemented yet.");
+ return occur;
}
}
-}
+}
\ No newline at end of file