You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by mh...@apache.org on 2011/09/05 20:54:42 UTC

[Lucene.Net] svn commit: r1165386 - in /incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net: Analysis/Analyzer.cs Analysis/ReusableAnalyzerBase.cs Analysis/Tokenizer.cs Lucene.Net.csproj

Author: mherndon
Date: Mon Sep  5 18:54:42 2011
New Revision: 1165386

URL: http://svn.apache.org/viewvc?rev=1165386&view=rev
Log:
Adding the first passes of the ReusableAnalyzerBase & Tokenizer classes. Will add tests once I have time to implement some of the analyzers from the contrib/module projects.

Added:
    incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs
    incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs
Modified:
    incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs
    incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj

Modified: incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs?rev=1165386&r1=1165385&r2=1165386&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Analyzer.cs Mon Sep  5 18:54:42 2011
@@ -32,7 +32,7 @@ namespace Lucene.Net.Analysis
     /// </summary>
     public abstract class Analyzer : IDisposable
     {
-        private ThreadLocal<TokenStream> threadLocalTokenStream;
+        private ThreadLocal<object> threadLocalTokenStream;
         private bool disposed = false;
 
         /// <summary>
@@ -59,13 +59,22 @@ namespace Lucene.Net.Analysis
 
 
         /// <summary>
-        /// Gets or sets the previous token stream.
+        /// Gets or sets the previous token stream or token stream storage object.
         /// </summary>
+        /// <remarks>
+        ///     <para>
+        ///     This can be used to store the previous token stream directly or it can 
+        ///     use a custom storage mechanism like <see cref="ReusableAnalyzerBase.TokenStreamComponents"/>.
+        ///     </para>
+        ///     <para>
+        ///     The property name deviates from the Java version because the original name is misleading: the stored value is not always a token stream.
+        ///     </para>
+        /// </remarks>
         /// <value>The previous token stream. Returns null if the value has not been set.</value>
         /// <exception cref="ObjectDisposedException">
         ///     Thrown when <see cref="Analyzer"/> is already disposed.
         /// </exception>
-        protected TokenStream PreviousTokenStream
+        protected object PreviousTokenStreamOrStorage
         {
             get
             {
@@ -109,24 +118,38 @@ namespace Lucene.Net.Analysis
         }
 
         /// <summary>
-        /// Tokens the stream.
+        /// Creates a <see cref="TokenStream"/> using the specified <see cref="StreamReader"/>.
         /// </summary>
+        /// <remarks>
+        ///     <para>
+        ///     Subclasses that implement this method should always be able to handle null 
+        ///     values for the field name for backwards compatibility.
+        ///     </para>
+        /// </remarks>
         /// <param name="fieldName">Name of the field.</param>
         /// <param name="reader">The reader.</param>
         /// <returns>
         /// An instance of <see cref="TokenStream"/>.
         /// </returns>
-        public abstract TokenStream TokenStream(string fieldName, TextReader reader);
+        public abstract TokenStream TokenStream(string fieldName, StreamReader reader);
 
         /// <summary>
-        /// Reusable the token stream.
+        /// Finds or creates a <see cref="TokenStream"/> in a way that permits the <see cref="TokenStream"/>
+        /// to be re-used on the same thread.
         /// </summary>
+        /// <remarks>
+        ///     <para>
+        ///     Any class that manages the current <see cref="Analyzer"/> and does not need to use more
+        ///     than one <see cref="TokenStream"/> at the same time should use this method for 
+        ///     better performance. 
+        ///     </para>
+        /// </remarks>
         /// <param name="fieldName">Name of the field.</param>
         /// <param name="reader">The reader.</param>
         /// <returns>
         /// An instance of <see cref="TokenStream"/>.
         /// </returns>
-        public TokenStream ReusableTokenStream(string fieldName, TextReader reader)
+        public virtual TokenStream ReusableTokenStream(string fieldName, StreamReader reader)
         {
             return this.TokenStream(fieldName, reader);
         }

Added: incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs?rev=1165386&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/ReusableAnalyzerBase.cs Mon Sep  5 18:54:42 2011
@@ -0,0 +1,179 @@
+// -----------------------------------------------------------------------
+// <copyright company="Apache" file="ReusableAnalyzerBase.cs">
+//
+//      Licensed to the Apache Software Foundation (ASF) under one or more
+//      contributor license agreements.  See the NOTICE file distributed with
+//      this work for additional information regarding copyright ownership.
+//      The ASF licenses this file to You under the Apache License, Version 2.0
+//      (the "License"); you may not use this file except in compliance with
+//      the License.  You may obtain a copy of the License at
+// 
+//      http://www.apache.org/licenses/LICENSE-2.0
+// 
+//      Unless required by applicable law or agreed to in writing, software
+//      distributed under the License is distributed on an "AS IS" BASIS,
+//      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//      See the License for the specific language governing permissions and
+//      limitations under the License.
+//
+// </copyright>
+// -----------------------------------------------------------------------
+
+namespace Lucene.Net.Analysis
+{
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+    using System.Linq;
+    using System.Text;
+
+    /// <summary>
+    /// A subclass of <see cref="Analyzer"/> for the purpose of making it easier to implement
+    /// <see cref="TokenStream"/> re-use.
+    /// </summary>
+    /// <remarks>
+    ///     <para>
+    ///         <see cref="ReusableAnalyzerBase"/> is meant to support easy re-use of <see cref="TokenStream"/>
+    ///         for the most common use-cases.  Analyzers like <c>PerFieldAnalyzerWrapper</c> that behave
+    ///         differently depending upon the field name may need to subclass <see cref="Analyzer"/>
+    ///         directly. 
+    ///     </para>
+    ///     <note>Subclasses must implement <see cref="CreateComponents(string, StreamReader)"/>.</note>
+    ///     <para>
+    ///         For consistency, this class does not allow subclasses to override
+    ///         <see cref="ReusableTokenStream(string, StreamReader)"/> or <see cref="TokenStream(string, StreamReader)"/>
+    ///         directly. 
+    ///     </para>
+    /// </remarks>
+    public abstract class ReusableAnalyzerBase : Analyzer
+    {
+        /// <summary>
+        /// Finds or creates a <see cref="TokenStream"/> in a way that permits the <see cref="TokenStream"/>
+        /// to be re-used on the same thread.
+        /// </summary>
+        /// <remarks>
+        ///     <para>
+        ///         The overridden behavior of this method is to check <see cref="Analyzer.PreviousTokenStreamOrStorage"/>
+        ///         to see if a <see cref="TokenStreamComponents"/> object is already stored there. If not, it creates
+        ///         a new instance of <see cref="TokenStreamComponents"/> and stores it in <see cref="Analyzer.PreviousTokenStreamOrStorage"/>.
+        ///         The <see cref="TokenStream" /> held inside the current <see cref="TokenStreamComponents"/>
+        ///         instance is then returned.
+        ///     </para>
+        /// </remarks>
+        /// <param name="fieldName">Name of the field.</param>
+        /// <param name="reader">The reader.</param>
+        /// <returns>
+        /// An instance of <see cref="TokenStream"/>.
+        /// </returns>
+        public sealed override TokenStream ReusableTokenStream(string fieldName, StreamReader reader)
+        {
+            var components = this.PreviousTokenStreamOrStorage as TokenStreamComponents;
+            var initializedReader = this.InitializeReader(reader);
+            
+            if (components == null || !components.Reset(initializedReader))
+            {
+                components = this.CreateComponents(fieldName, initializedReader);
+                this.PreviousTokenStreamOrStorage = components;
+            }
+
+            return components.TokenStream;
+        }
+
+        /// <summary>
+        /// Creates a <see cref="TokenStream"/> using the specified <see cref="StreamReader"/>.
+        /// </summary>
+        /// <param name="fieldName">Name of the field.</param>
+        /// <param name="reader">The reader.</param>
+        /// <returns>
+        /// An instance of <see cref="TokenStream"/>.
+        /// </returns>
+        /// <remarks>
+        /// Subclasses that implement this method should always be able to handle null
+        /// values for the field name for backwards compatibility.
+        /// </remarks>
+        public sealed override TokenStream TokenStream(string fieldName, StreamReader reader)
+        {
+            var initializedReader = this.InitializeReader(reader);
+            var components = this.CreateComponents(fieldName, initializedReader);
+
+            return components.TokenStream;
+        }
+
+        /// <summary>
+        /// Creates a new instance of <see cref="TokenStreamComponents"/>.
+        /// </summary>
+        /// <param name="fieldName">Name of the field.</param>
+        /// <param name="reader">The reader.</param>
+        /// <returns>
+        /// An instance of <see cref="TokenStreamComponents"/>.
+        /// </returns>
+        protected abstract TokenStreamComponents CreateComponents(string fieldName, StreamReader reader);
+
+        /// <summary>
+        /// Initializes the <paramref name="reader"/>.
+        /// </summary>
+        /// <param name="reader">The reader.</param>
+        /// <returns>
+        /// An instance of <see cref="StreamReader"/>.
+        /// </returns>
+        protected virtual StreamReader InitializeReader(StreamReader reader)
+        {
+            return reader;
+        }
+
+        /// <summary>
+        /// The components of a <see cref="TokenStream"/>. This class
+        /// provides access to the <see cref="Tokenizer"/> source and the outer end.
+        /// The outer end is an instance of <c>TokenFilter</c> which is also a <see cref="TokenStream"/>.
+        /// </summary>
+        protected internal class TokenStreamComponents
+        {
+            private readonly Tokenizer tokenizer;
+            private readonly TokenStream tokenStream;
+
+            /// <summary>
+            /// Initializes a new instance of the <see cref="TokenStreamComponents"/> class.
+            /// </summary>
+            /// <param name="tokenizer">The tokenizer.</param>
+            public TokenStreamComponents(Tokenizer tokenizer)
+                : this(tokenizer, tokenizer)
+            {
+            }
+
+            /// <summary>
+            /// Initializes a new instance of the <see cref="TokenStreamComponents"/> class.
+            /// </summary>
+            /// <param name="tokenizer">The tokenizer.</param>
+            /// <param name="tokenStream">The token stream.</param>
+            public TokenStreamComponents(Tokenizer tokenizer, TokenStream tokenStream)
+            {
+                this.tokenizer = tokenizer;
+                this.tokenStream = tokenStream;
+            }
+
+            /// <summary>
+            /// Gets the token stream.
+            /// </summary>
+            /// <value>The token stream.</value>
+            protected internal TokenStream TokenStream
+            {
+                get { return this.tokenStream; }
+            }
+
+            /// <summary>
+            /// Resets the components with the specified <paramref name="reader"/>.
+            /// </summary>
+            /// <param name="reader">The reader.</param>
+            /// <returns><c>true</c> if the internal components were reset, otherwise <c>false</c>.</returns>
+            /// <exception cref="IOException">
+            ///     Thrown when the internal <see cref="Tokenizer"/> throws an
+            ///     <see cref="IOException"/>.
+            /// </exception>
+            protected internal bool Reset(StreamReader reader)
+            {
+                this.tokenizer.Reset(reader);
+                return true;
+            }
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs?rev=1165386&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Analysis/Tokenizer.cs Mon Sep  5 18:54:42 2011
@@ -0,0 +1,121 @@
+// -----------------------------------------------------------------------
+// <copyright file="Tokenizer.cs" company="Microsoft">
+// TODO: Update copyright text.
+// </copyright>
+// -----------------------------------------------------------------------
+
+namespace Lucene.Net.Analysis
+{
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+    using System.Linq;
+    using System.Text;
+    using Lucene.Net.Util;
+
+    /// <summary>
+    /// An abstract class that performs lexical analysis, transforming a sequence of
+    /// characters into a sequence of tokens.
+    /// </summary>
+    /// <remarks>
+    ///     <para>
+    ///         The abstract <c>Tokenizer</c> class in Lucene.Net is essentially a <see cref="TokenStream"/>
+    ///         that has an internal <see cref="StreamReader"/>.
+    ///     </para>
+    ///     <note>
+    ///         Subclasses must override <see cref="TokenStream.IncrementToken"/> and 
+    ///         <see cref="TokenStream.IncrementToken"/> must call <see cref="AttributeSource.ClearAttributes"/>
+    ///         before setting attributes.
+    ///     </note>
+    /// </remarks>
+    public abstract class Tokenizer : TokenStream
+    {
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Tokenizer"/> class.
+        /// </summary>
+        protected Tokenizer()
+        {
+        }
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Tokenizer"/> class.
+        /// </summary>
+        /// <param name="reader">The reader.</param>
+        protected Tokenizer(StreamReader reader)
+        {
+            this.Reader = CharReader.CastOrCreate(reader);
+        }
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Tokenizer"/> class.
+        /// </summary>
+        /// <param name="factory">The factory.</param>
+        protected Tokenizer(AttributeFactory factory)
+            : base(factory)
+        {
+        }
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Tokenizer"/> class.
+        /// </summary>
+        /// <param name="factory">The factory.</param>
+        /// <param name="reader">The reader.</param>
+        protected Tokenizer(AttributeFactory factory, StreamReader reader)
+            : base(factory)
+        {
+            this.Reader = CharReader.CastOrCreate(reader);
+        }
+
+        /// <summary>
+        /// Gets or sets the reader.
+        /// </summary>
+        /// <value>The reader.</value>
+        protected TextReader Reader { get; set; }
+
+        /// <summary>
+        /// Replaces the reader used by this tokenizer with the specified reader.
+        /// </summary>
+        /// <param name="reader">The reader.</param>
+        public void Reset(StreamReader reader)
+        {
+            this.Reader = reader;
+        }
+
+        /// <summary>
+        /// Corrects and returns the corrected offset.
+        /// </summary>
+        /// <remarks>
+        ///     <para>
+        ///     If the <see cref="Reader"/> of the Tokenizer is an instance of <see cref="CharStream"/>
+        ///     then this method will call <see cref="CharStream.CorrectOffset"/> otherwise it will
+        ///     return the value of the parameter <paramref name="offset"/>.
+        ///     </para>
+        /// </remarks>
+        /// <param name="offset">The offset as seen in the output.</param>
+        /// <returns>The corrected offset based on the input.</returns>
+        protected int CorrectOffset(int offset)
+        {
+            var charStream = this.Reader as CharStream;
+
+            if (charStream != null)
+                return charStream.CorrectOffset(offset);
+
+            return offset;
+        }
+
+        /// <summary>
+        /// Releases unmanaged and - optionally - managed resources
+        /// </summary>
+        /// <param name="release"><c>true</c> to release both managed and unmanaged resources; <c>false</c> to release only unmanaged resources.</param>
+        protected override void Dispose(bool release)
+        {
+            //// LUCENE-2387: don't hold onto Reader after close, so
+            //// GC can reclaim
+            if (this.Reader != null)
+            {
+                this.Reader.Dispose();
+                this.Reader = null;
+            }
+        }
+    }
+}

Modified: incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj?rev=1165386&r1=1165385&r2=1165386&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj (original)
+++ incubator/lucene.net/branches/Lucene.Net_4e/src/Lucene.Net/Lucene.Net.csproj Mon Sep  5 18:54:42 2011
@@ -54,6 +54,7 @@
     <Compile Include="Analysis\Analyzer.cs" />
     <Compile Include="Analysis\CharReader.cs" />
     <Compile Include="Analysis\CharStream.cs" />
+    <Compile Include="Analysis\ReusableAnalyzerBase.cs" />
     <Compile Include="Analysis\Token.cs" />
     <Compile Include="Analysis\TokenAttributes\CharTermAttribute.cs" />
     <Compile Include="Analysis\TokenAttributes\ICharTermAttribute.cs" />