You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by mh...@apache.org on 2011/09/05 20:54:42 UTC
[Lucene.Net] svn commit: r1165386 - in
/incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net:
Analysis/Analyzer.cs Analysis/ReusableAnalyzerBase.cs Analysis/Tokenizer.cs
Lucene.Net.csproj
Author: mherndon
Date: Mon Sep 5 18:54:42 2011
New Revision: 1165386
URL: http://svn.apache.org/viewvc?rev=1165386&view=rev
Log:
Adding the first passes of the ReusableAnalyzerBase & Tokenizer classes. Will add tests once I have time to implement some of the analyzers from the contrib/module projects.
Added:
incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs
incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs
Modified:
incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs
incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj
Modified: incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs?rev=1165386&r1=1165385&r2=1165386&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs Mon Sep 5 18:54:42 2011
@@ -32,7 +32,7 @@ namespace Lucene.Net.Analysis
/// </summary>
public abstract class Analyzer : IDisposable
{
- private ThreadLocal<TokenStream> threadLocalTokenStream;
+ private ThreadLocal<object> threadLocalTokenStream;
private bool disposed = false;
/// <summary>
@@ -59,13 +59,22 @@ namespace Lucene.Net.Analysis
/// <summary>
- /// Gets or sets the previous token stream.
+ /// Gets or sets the previous token stream or token stream storage object.
/// </summary>
+ /// <remarks>
+ /// <para>
+ /// This can be used to store the previous token stream directly or it can
+ /// use a custom storage mechanism like <see cref="ReusableAnalyzerBase.TokenStreamComponents"/>.
+ /// </para>
+ /// <para>
+ /// The property name deviates from the Java version because the name is misleading.
+ /// </para>
+ /// </remarks>
/// <value>The previous token stream. Returns null if the value has not been set.</value>
/// <exception cref="ObjectDisposedException">
/// Thrown when <see cref="Analyzer"/> is already disposed.
/// </exception>
- protected TokenStream PreviousTokenStream
+ protected object PreviousTokenStreamOrStorage
{
get
{
@@ -109,24 +118,38 @@ namespace Lucene.Net.Analysis
}
/// <summary>
- /// Tokens the stream.
+ /// Creates a <see cref="TokenStream"/> using the specified <see cref="StreamReader"/>.
/// </summary>
+ /// <remarks>
+ /// <para>
+ /// Subclasses that implement this method should always be able to handle null
+ /// values for the field name for backwards compatibility.
+ /// </para>
+ /// </remarks>
/// <param name="fieldName">Name of the field.</param>
/// <param name="reader">The reader.</param>
/// <returns>
/// An instance of <see cref="TokenStream"/>.
/// </returns>
- public abstract TokenStream TokenStream(string fieldName, TextReader reader);
+ public abstract TokenStream TokenStream(string fieldName, StreamReader reader);
/// <summary>
- /// Reusable the token stream.
+ /// Finds or creates a <see cref="TokenStream"/> in a way that permits the <see cref="TokenStream"/>
+ /// to be re-used on the same thread.
/// </summary>
+ /// <remarks>
+ /// <para>
+ /// Any class that manages the current <see cref="Analyzer"/> and does not need to use more
+ /// than one <see cref="TokenStream"/> at the same time should use this method for
+ /// better performance.
+ /// </para>
+ /// </remarks>
/// <param name="fieldName">Name of the field.</param>
/// <param name="reader">The reader.</param>
/// <returns>
/// An instance of <see cref="TokenStream"/>.
/// </returns>
- public TokenStream ReusableTokenStream(string fieldName, TextReader reader)
+ public virtual TokenStream ReusableTokenStream(string fieldName, StreamReader reader)
{
return this.TokenStream(fieldName, reader);
}
Added: incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs?rev=1165386&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs Mon Sep 5 18:54:42 2011
@@ -0,0 +1,179 @@
+// -----------------------------------------------------------------------
+// <copyright company="Apache" file="ReusableAnalyzerBase.cs">
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// </copyright>
+// -----------------------------------------------------------------------
+
+namespace Lucene.Net.Analysis
+{
+ using System;
+ using System.Collections.Generic;
+ using System.IO;
+ using System.Linq;
+ using System.Text;
+
+ /// <summary>
+ /// A subclass of <see cref="Analyzer"/> for the purpose of making it easier to implement
+ /// <see cref="TokenStream"/> re-use.
+ /// </summary>
+ /// <remarks>
+ /// <para>
+ /// <see cref="ReusableAnalyzerBase"/> is meant to support easy re-use of <see cref="TokenStream"/>
+ /// for the most common use-cases. Analyzers like <c>PerFieldAnalyzerWrapper</c> that behave
+ /// differently depending upon the field name may need to subclass <see cref="Analyzer"/>
+ /// directly.
+ /// </para>
+ /// <note>Subclasses must implement <see cref="CreateComponents(string, StreamReader)"/>.</note>
+ /// <para>
+ /// For consistency, this class does not allow subclasses to extend
+ /// <see cref="ReusableTokenStream(string, StreamReader)"/> or <see cref="TokenStream(string, StreamReader)"/>
+ /// directly.
+ /// </para>
+ /// </remarks>
+ public abstract class ReusableAnalyzerBase : Analyzer
+ {
+ /// <summary>
+ /// Finds or creates a <see cref="TokenStream"/> in a way that permits the <see cref="TokenStream"/>
+ /// to be re-used on the same thread.
+ /// </summary>
+ /// <remarks>
+ /// <para>
+ /// The overridden behavior of this method is to check <see cref="Analyzer.PreviousTokenStreamOrStorage"/>
+ /// to see if a <see cref="TokenStreamComponents"/> object is already stored there. If one is found, it is
+ /// reset with the initialized reader and re-used. If none is stored, or the reset reports <c>false</c>, a
+ /// new instance of <see cref="TokenStreamComponents"/> is created and stored in <see cref="Analyzer.PreviousTokenStreamOrStorage"/>.
+ /// The <see cref="TokenStream" /> held inside the current <see cref="TokenStreamComponents"/>
+ /// instance is then returned.
+ /// </para>
+ /// </remarks>
+ /// <param name="fieldName">Name of the field.</param>
+ /// <param name="reader">The reader; it is passed through <see cref="InitializeReader"/> before use.</param>
+ /// <returns>
+ /// An instance of <see cref="TokenStream"/>.
+ /// </returns>
+ public sealed override TokenStream ReusableTokenStream(string fieldName, StreamReader reader)
+ {
+ //// A stored value of any other type (or no stored value) yields null here.
+ var components = this.PreviousTokenStreamOrStorage as TokenStreamComponents;
+ var initializedReader = this.InitializeReader(reader);
+
+ //// Re-use the stored components when they accept the new reader; otherwise build and store new ones.
+ if (components == null || !components.Reset(initializedReader))
+ {
+ components = this.CreateComponents(fieldName, initializedReader);
+ this.PreviousTokenStreamOrStorage = components;
+ }
+
+ return components.TokenStream;
+ }
+
+ /// <summary>
+ /// Creates a new <see cref="TokenStream"/> for the specified <see cref="StreamReader"/> on every call.
+ /// </summary>
+ /// <param name="fieldName">Name of the field.</param>
+ /// <param name="reader">The reader; it is passed through <see cref="InitializeReader"/> before use.</param>
+ /// <returns>
+ /// The <see cref="TokenStream"/> of the newly created <see cref="TokenStreamComponents"/>.
+ /// </returns>
+ /// <remarks>
+ /// Subclasses that implement <see cref="CreateComponents(string, StreamReader)"/> should always be able
+ /// to handle null values for the field name for backwards compatibility.
+ /// </remarks>
+ public sealed override TokenStream TokenStream(string fieldName, StreamReader reader)
+ {
+ var initializedReader = this.InitializeReader(reader);
+ //// Nothing is cached here: the components are created fresh and are not stored for re-use.
+ var components = this.CreateComponents(fieldName, initializedReader);
+
+ return components.TokenStream;
+ }
+
+ /// <summary>
+ /// Creates a new instance of <see cref="TokenStreamComponents"/>.
+ /// </summary>
+ /// <param name="fieldName">Name of the field.</param>
+ /// <param name="reader">The reader, as returned by <see cref="InitializeReader"/>.</param>
+ /// <returns>
+ /// An instance of <see cref="TokenStreamComponents"/>.
+ /// </returns>
+ protected abstract TokenStreamComponents CreateComponents(string fieldName, StreamReader reader);
+
+ /// <summary>
+ /// Initializes the <paramref name="reader"/>. The default implementation returns it unchanged.
+ /// </summary>
+ /// <param name="reader">The reader.</param>
+ /// <returns>
+ /// An instance of <see cref="StreamReader"/>; by default the same instance that was passed in.
+ /// </returns>
+ protected virtual StreamReader InitializeReader(StreamReader reader)
+ {
+ return reader;
+ }
+
+ /// <summary>
+ /// The components of a <see cref="TokenStream"/>. This class
+ /// provides access to the <see cref="Tokenizer"/> source and the outer end.
+ /// The outer end is an instance of <c>TokenFilter</c> which is also a <see cref="TokenStream"/>.
+ /// </summary>
+ protected internal class TokenStreamComponents
+ {
+ private readonly Tokenizer tokenizer;
+ private readonly TokenStream tokenStream;
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="TokenStreamComponents"/> class
+ /// in which the tokenizer is also used as the token stream.
+ /// </summary>
+ /// <param name="tokenizer">The tokenizer.</param>
+ public TokenStreamComponents(Tokenizer tokenizer)
+ : this(tokenizer, tokenizer)
+ {
+ }
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="TokenStreamComponents"/> class.
+ /// </summary>
+ /// <param name="tokenizer">The tokenizer; it receives the new reader in <see cref="Reset"/>.</param>
+ /// <param name="tokenStream">The token stream exposed through <see cref="TokenStream"/>.</param>
+ public TokenStreamComponents(Tokenizer tokenizer, TokenStream tokenStream)
+ {
+ this.tokenizer = tokenizer;
+ this.tokenStream = tokenStream;
+ }
+
+ /// <summary>
+ /// Gets the token stream.
+ /// </summary>
+ /// <value>The token stream.</value>
+ protected internal TokenStream TokenStream
+ {
+ get { return this.tokenStream; }
+ }
+
+ /// <summary>
+ /// Resets the components with the specified <paramref name="reader"/>.
+ /// </summary>
+ /// <param name="reader">The reader handed to the internal <see cref="Tokenizer"/>.</param>
+ /// <returns><c>true</c> if the internal components were reset, otherwise <c>false</c>. This implementation always returns <c>true</c>.</returns>
+ /// <exception cref="IOException">
+ /// Thrown when the internal <see cref="Tokenizer"/> throws an
+ /// <see cref="IOException"/>.
+ /// </exception>
+ protected internal bool Reset(StreamReader reader)
+ {
+ this.tokenizer.Reset(reader);
+ return true;
+ }
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs?rev=1165386&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs Mon Sep 5 18:54:42 2011
@@ -0,0 +1,121 @@
+// -----------------------------------------------------------------------
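+// <copyright company="Apache" file="Tokenizer.cs">
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// </copyright>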
+// -----------------------------------------------------------------------
+
+namespace Lucene.Net.Analysis
+{
+ using System;
+ using System.Collections.Generic;
+ using System.IO;
+ using System.Linq;
+ using System.Text;
+ using Lucene.Net.Util;
+
+ /// <summary>
+ /// The abstract class which performs lexical analysis, transforming a sequence of
+ /// characters into a sequence of tokens.
+ /// </summary>
+ /// <remarks>
+ /// <para>
+ /// The abstract <c>Tokenizer</c> class in Lucene.Net is essentially a <see cref="TokenStream"/>
+ /// that has an internal <see cref="StreamReader"/>.
+ /// </para>
+ /// <note>
+ /// Subclasses must override <see cref="TokenStream.IncrementToken"/> and
+ /// <see cref="TokenStream.IncrementToken"/> must call <see cref="AttributeSource.ClearAttributes"/>
+ /// before setting attributes.
+ /// </note>
+ /// </remarks>
+ public abstract class Tokenizer : TokenStream
+ {
+ /// <summary>
+ /// Initializes a new instance of the <see cref="Tokenizer"/> class without assigning a reader.
+ /// </summary>
+ protected Tokenizer()
+ {
+ }
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="Tokenizer"/> class with the specified reader.
+ /// </summary>
+ /// <param name="reader">The reader; it is passed through <c>CharReader.CastOrCreate</c> before being stored in <see cref="Reader"/>.</param>
+ protected Tokenizer(StreamReader reader)
+ {
+ this.Reader = CharReader.CastOrCreate(reader);
+ }
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="Tokenizer"/> class without assigning a reader.
+ /// </summary>
+ /// <param name="factory">The factory, forwarded to the <see cref="TokenStream"/> base constructor.</param>
+ protected Tokenizer(AttributeFactory factory)
+ : base(factory)
+ {
+ }
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="Tokenizer"/> class with the specified factory and reader.
+ /// </summary>
+ /// <param name="factory">The factory, forwarded to the <see cref="TokenStream"/> base constructor.</param>
+ /// <param name="reader">The reader; it is passed through <c>CharReader.CastOrCreate</c> before being stored in <see cref="Reader"/>.</param>
+ protected Tokenizer(AttributeFactory factory, StreamReader reader)
+ : base(factory)
+ {
+ this.Reader = CharReader.CastOrCreate(reader);
+ }
+
+ /// <summary>
+ /// Gets or sets the reader. It is assigned by the reader-taking constructors and by <see cref="Reset"/>.
+ /// </summary>
+ /// <value>The reader, or <c>null</c> when none has been assigned or after <see cref="Dispose"/> has cleared it.</value>
+ protected TextReader Reader { get; set; }
+
+ /// <summary>
+ /// Replaces the current <see cref="Reader"/> with the specified reader.
+ /// </summary>
+ /// <param name="reader">The new reader; it is stored as-is.</param>
+ public void Reset(StreamReader reader)
+ {
+ //// NOTE(review): unlike the constructors, the reader is not passed through
+ //// CharReader.CastOrCreate here - confirm this difference is intended.
+ this.Reader = reader;
+ }
+
+ /// <summary>
+ /// Corrects and returns the corrected offset.
+ /// </summary>
+ /// <remarks>
+ /// <para>
+ /// If the <see cref="Reader"/> of the Tokenizer is an instance of <see cref="CharStream"/>
+ /// then this method will call <see cref="CharStream.CorrectOffset"/>; otherwise (including
+ /// when the reader is <c>null</c>) it will return the value of the parameter <paramref name="offset"/>.
+ /// </para>
+ /// </remarks>
+ /// <param name="offset">The offset as seen in the output.</param>
+ /// <returns>The corrected offset based on the input.</returns>
+ protected int CorrectOffset(int offset)
+ {
+ var charStream = this.Reader as CharStream;
+
+ if (charStream != null)
+ return charStream.CorrectOffset(offset);
+
+ //// Not a CharStream: hand the offset back unchanged.
+ return offset;
+ }
+
+ /// <summary>
+ /// Disposes and clears the <see cref="Reader"/> held by this tokenizer.
+ /// </summary>
+ /// <remarks>
+ /// The reader is a managed resource, so it is only disposed when <paramref name="release"/>
+ /// is <c>true</c>; when it is <c>false</c> this method does nothing. Calling it again after
+ /// the reader has been cleared is a no-op.
+ /// </remarks>
+ /// <param name="release"><c>true</c> to release both managed and unmanaged resources; <c>false</c> to release only unmanaged resources.</param>
+ protected override void Dispose(bool release)
+ {
+ //// Managed objects must only be touched on an explicit dispose (release == true);
+ //// this class owns no unmanaged resources, so there is nothing to do otherwise.
+ //// NOTE(review): the base implementation is not invoked here - confirm whether
+ //// TokenStream.Dispose(bool) has work of its own that should run.
+ if (!release || this.Reader == null)
+ {
+ return;
+ }
+
+ //// LUCENE-2387: don't hold onto Reader after close, so
+ //// GC can reclaim
+ this.Reader.Dispose();
+ this.Reader = null;
+ }
+ }
+}
Modified: incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj?rev=1165386&r1=1165385&r2=1165386&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj (original)
+++ incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj Mon Sep 5 18:54:42 2011
@@ -54,6 +54,7 @@
<Compile Include="Analysis\Analyzer.cs" />
<Compile Include="Analysis\CharReader.cs" />
<Compile Include="Analysis\CharStream.cs" />
+ <Compile Include="Analysis\ReusableAnalyzerBase.cs" />
<Compile Include="Analysis\Token.cs" />
<Compile Include="Analysis\TokenAttributes\CharTermAttribute.cs" />
<Compile Include="Analysis\TokenAttributes\ICharTermAttribute.cs" />