You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/12/22 14:10:01 UTC

[1/3] lucenenet git commit: port of lucene-solr/lucene/classification w/o tests

Repository: lucenenet
Updated Branches:
  refs/heads/master 2d7533d4e -> c0c101953


port of lucene-solr/lucene/classification w/o tests


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/aba955ce
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/aba955ce
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/aba955ce

Branch: refs/heads/master
Commit: aba955ce29c81a2e943246acb1df7940fe3d7483
Parents: 2d7533d
Author: Laimonas Simutis <la...@gmail.com>
Authored: Sat Dec 6 22:23:30 2014 -0500
Committer: Laimonas Simutis <la...@gmail.com>
Committed: Sat Dec 6 22:23:30 2014 -0500

----------------------------------------------------------------------
 .../ClassificationResult.cs                     |  64 ++++++
 src/Lucene.Net.Classification/Classifier.cs     |  65 ++++++
 .../KNearesteighborClassifier.cs                | 150 ++++++++++++++
 .../Lucene.Net.Classification.csproj            |  66 ++++++
 .../Lucene.Net.Classification.sln               |  54 +++++
 .../Properties/AssemblyInfo.cs                  |  36 ++++
 .../SimpleNaiveBayesClassifier.cs               | 205 +++++++++++++++++++
 7 files changed, 640 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/ClassificationResult.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/ClassificationResult.cs b/src/Lucene.Net.Classification/ClassificationResult.cs
new file mode 100644
index 0000000..356ec7e
--- /dev/null
+++ b/src/Lucene.Net.Classification/ClassificationResult.cs
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    /// <summary>
+    /// The result of a call to <see cref="Classifier{T}.AssignClass(string)"/> holding an assigned class of type <code>T</code> and a score.
+    /// @lucene.experimental
+    /// </summary>
+    public class ClassificationResult<T> {
+
+        private readonly T _assignedClass;
+        private readonly double _score;
+
+        /// <summary>
+        /// Constructor; stores both arguments unchanged in read-only fields.
+        /// </summary>
+        /// <param name="assignedClass">the class <code>T</code> assigned by a <see cref="Classifier{T}"/></param>
+        /// <param name="score">the score for the assignedClass as a <code>double</code></param>
+        public ClassificationResult(T assignedClass, double score) 
+        {
+            this._assignedClass = assignedClass;
+            this._score = score;
+        }
+
+        /// <summary>
+        /// retrieve the result class
+        /// </summary>
+        /// <value>a <code>T</code> representing an assigned class</value>
+        public T AssignedClass 
+        {
+            get
+            {
+                return _assignedClass;
+            }
+        }
+
+        /// <summary>
+        /// retrieve the result score
+        /// </summary>
+        /// <value>a <code>double</code> representing a result score</value>
+        public double Score
+        {
+            get
+            {
+                return _score;
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Classifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Classifier.cs b/src/Lucene.Net.Classification/Classifier.cs
new file mode 100644
index 0000000..6ffca79
--- /dev/null
+++ b/src/Lucene.Net.Classification/Classifier.cs
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    using Lucene.Net.Analysis;
+    using Lucene.Net.Index;
+    using Lucene.Net.Search;
+    using System;
+
+    /// <summary>
+    /// A classifier, see <code>http://en.wikipedia.org/wiki/Classifier_(mathematics)</code>, which assigns classes of type
+    /// <code>T</code>
+    /// @lucene.experimental
+    /// </summary>
+    public interface Classifier<T> 
+    {
+        /// <summary>
+        /// Assign a class (with score) to the given text String
+        /// </summary>
+        /// <param name="text">a String containing text to be classified</param>
+        /// <returns>a <see cref="ClassificationResult{T}"/> holding assigned class of type <code>T</code> and score</returns>
+        ClassificationResult<T> AssignClass(String text);
+
+        /// <summary>
+        /// Train the classifier using the underlying Lucene index
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer);
+
+        /// <summary>Train the classifier using the underlying Lucene index</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query used to filter which documents are used for training</param>
+        void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query);
+
+        /// <summary>Train the classifier using the underlying Lucene index</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query used to filter which documents are used for training</param>
+        void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer,
+                   Query query);
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/KNearesteighborClassifier.cs b/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
new file mode 100644
index 0000000..c83301e
--- /dev/null
+++ b/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    using Lucene.Net.Analysis;
+    using Lucene.Net.Index;
+    using Lucene.Net.Queries.Mlt;
+    using Lucene.Net.Search;
+    using Lucene.Net.Util;
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+
+    /// <summary>
+    /// A k-Nearest Neighbor classifier (see <code>http://en.wikipedia.org/wiki/K-nearest_neighbors</code>) based
+    /// on <see cref="MoreLikeThis"/>: the class seen most often among the top k hits of a "more like this" query wins.
+    /// 
+    /// @lucene.experimental
+    /// </summary>
+    public class KNearestNeighborClassifier : Classifier<BytesRef>
+    {
+        private readonly int _k;
+        private readonly int _minDocsFreq;
+        private readonly int _minTermFreq;
+
+        // Assigned by Train(...); AssignClass treats a null _mlt as "not trained yet".
+        private MoreLikeThis _mlt;
+        private IndexSearcher _indexSearcher;
+        private String[] _textFieldNames;
+        private String _classFieldName;
+        private Query _query;
+
+        /// <summary>Create a <see cref="Classifier{T}"/> using kNN algorithm</summary>
+        /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
+        public KNearestNeighborClassifier(int k)
+        {
+            _k = k;
+        }
+
+        /// <summary>Create a <see cref="Classifier{T}"/> using kNN algorithm</summary>
+        /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
+        /// <param name="minDocsFreq">the minimum document frequency for MLT; applied to <see cref="MoreLikeThis.MinDocFreq"/> during training only when greater than zero</param>
+        /// <param name="minTermFreq">the minimum term frequency for MLT; applied to <see cref="MoreLikeThis.MinTermFreq"/> during training only when greater than zero</param>
+        public KNearestNeighborClassifier(int k, int minDocsFreq, int minTermFreq)
+        {
+            _k = k;
+            _minDocsFreq = minDocsFreq;
+            _minTermFreq = minTermFreq;
+        }
+
+        /// <summary>Assign to <paramref name="text"/> the class most frequent among its k nearest neighbors: documents similar to the text in any text field that carry the class field and match the training query (if any).</summary>
+        /// <exception cref="IOException">if no <c>Train</c> overload has been called yet</exception>
+        public ClassificationResult<BytesRef> AssignClass(String text)
+        {
+            if (_mlt == null)
+            {
+                throw new IOException("You must first call Classifier#train");
+            }
+            BooleanQuery mltQuery = new BooleanQuery();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                mltQuery.Add(new BooleanClause(_mlt.Like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
+            }
+            mltQuery.Add(new BooleanClause(new WildcardQuery(new Term(_classFieldName, "*")), BooleanClause.Occur.MUST));
+            if (_query != null)
+            {
+                mltQuery.Add(_query, BooleanClause.Occur.MUST);
+            }
+            return SelectClassFromNeighbors(_indexSearcher.Search(mltQuery, _k));
+        }
+
+        /// <summary>Pick the class that occurs most often among the retrieved neighbors.</summary>
+        /// <param name="topDocs">the neighbors returned by the MLT search</param>
+        /// <returns>the most frequent class (an empty <see cref="BytesRef"/> when there are no hits), scored as its count divided by k</returns>
+        private ClassificationResult<BytesRef> SelectClassFromNeighbors(TopDocs topDocs)
+        {
+            // TODO : improve the nearest neighbor selection
+            Dictionary<BytesRef, int> classCounts = new Dictionary<BytesRef, int>();
+            foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
+            {
+                BytesRef cl = new BytesRef(_indexSearcher.Doc(scoreDoc.Doc).GetField(_classFieldName).StringValue);
+                // The indexer getter throws KeyNotFoundException for a class not counted yet, so look the
+                // count up with TryGetValue, which leaves it at 0 for a first occurrence.
+                int count;
+                classCounts.TryGetValue(cl, out count);
+                classCounts[cl] = count + 1;
+            }
+
+            int max = 0;
+            BytesRef assignedClass = new BytesRef();
+            foreach (KeyValuePair<BytesRef, int> entry in classCounts)
+            {
+                if (entry.Value > max)
+                {
+                    max = entry.Value;
+                    assignedClass = (BytesRef)entry.Key.Clone();
+                }
+            }
+            double score = max / (double) _k;
+            return new ClassificationResult<BytesRef>(assignedClass, score);
+        }
+
+        /// <summary>Train on a single text field, without a filtering query.</summary>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
+        {
+            Train(atomicReader, new[] { textFieldName }, classFieldName, analyzer, null);
+        }
+
+        /// <summary>Train on a single text field; <paramref name="query"/> may be null.</summary>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
+        {
+            Train(atomicReader, new[] { textFieldName }, classFieldName, analyzer, query);
+        }
+
+        /// <summary>Remember the training parameters and set up the searcher and the MoreLikeThis generator over the reader.</summary>
+        public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+        {
+            _textFieldNames = textFieldNames;
+            _classFieldName = classFieldName;
+            _query = query;
+            _indexSearcher = new IndexSearcher(atomicReader);
+            _mlt = new MoreLikeThis(atomicReader);
+            _mlt.Analyzer = analyzer;
+            _mlt.FieldNames = _textFieldNames;
+            if (_minDocsFreq > 0)
+            {
+                _mlt.MinDocFreq = _minDocsFreq;
+            }
+            if (_minTermFreq > 0)
+            {
+                _mlt.MinTermFreq = _minTermFreq;
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
new file mode 100644
index 0000000..e0bf2e9
--- /dev/null
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProjectGuid>{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net.Classification</RootNamespace>
+    <AssemblyName>Lucene.Net.Classification</AssemblyName>
+    <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="System" />
+    <Reference Include="System.Core" />
+    <Reference Include="System.Xml.Linq" />
+    <Reference Include="System.Data.DataSetExtensions" />
+    <Reference Include="Microsoft.CSharp" />
+    <Reference Include="System.Data" />
+    <Reference Include="System.Xml" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="ClassificationResult.cs" />
+    <Compile Include="Classifier.cs" />
+    <Compile Include="KNearesteighborClassifier.cs" />
+    <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="SimpleNaiveBayesClassifier.cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\Lucene.Net.Core\Lucene.Net.csproj">
+      <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project>
+      <Name>Lucene.Net</Name>
+    </ProjectReference>
+    <ProjectReference Include="..\Lucene.Net.Queries\Lucene.Net.Queries.csproj">
+      <Project>{69D7956C-C2CC-4708-B399-A188FEC384C4}</Project>
+      <Name>Lucene.Net.Queries</Name>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Lucene.Net.Classification.sln
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.sln b/src/Lucene.Net.Classification/Lucene.Net.Classification.sln
new file mode 100644
index 0000000..9965049
--- /dev/null
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.sln
@@ -0,0 +1,54 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Classification", "Lucene.Net.Classification.csproj", "{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net", "..\Lucene.Net.Core\Lucene.Net.csproj", "{5D4AD9BE-1FFB-41AB-9943-25737971BF57}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Queries", "..\Lucene.Net.Queries\Lucene.Net.Queries.csproj", "{69D7956C-C2CC-4708-B399-A188FEC384C4}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Debug|Mixed Platforms = Debug|Mixed Platforms
+		Debug|x86 = Debug|x86
+		Release|Any CPU = Release|Any CPU
+		Release|Mixed Platforms = Release|Mixed Platforms
+		Release|x86 = Release|x86
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Any CPU.Build.0 = Release|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|x86.ActiveCfg = Release|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.ActiveCfg = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Mixed Platforms.ActiveCfg = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Mixed Platforms.Build.0 = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|x86.ActiveCfg = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|x86.Build.0 = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.ActiveCfg = Release|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Mixed Platforms.ActiveCfg = Release|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Mixed Platforms.Build.0 = Release|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|x86.ActiveCfg = Release|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|x86.Build.0 = Release|x86
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Any CPU.Build.0 = Release|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|x86.ActiveCfg = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs b/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..ede45e1
--- /dev/null
+++ b/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following 
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Lucene.Net.Classification")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("Lucene.Net.Classification")]
+[assembly: AssemblyCopyright("Copyright ©  2014")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible 
+// to COM components.  If you need to access a type in this assembly from 
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("ff6180c7-579d-4557-bf6a-ddd139fad2e4")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version 
+//      Build Number
+//      Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers 
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
new file mode 100644
index 0000000..0980d58
--- /dev/null
+++ b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    using Lucene.Net.Analysis;
+    using Lucene.Net.Analysis.Tokenattributes;
+    using Lucene.Net.Index;
+    using Lucene.Net.Search;
+    using Lucene.Net.Util;
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+
+    /// <summary>
+    /// A simplistic Lucene based NaiveBayes classifier, see <code>http://en.wikipedia.org/wiki/Naive_Bayes_classifier</code>
+    /// Every term of the class field is a candidate class; all statistics are read from the index when classifying.
+    /// @lucene.experimental
+    /// </summary>
+    public class SimpleNaiveBayesClassifier : Classifier<BytesRef>
+    {
+        private AtomicReader _atomicReader;
+        private IndexSearcher _indexSearcher;
+        private Analyzer _analyzer;
+        private Query _query;
+        private String[] _textFieldNames;
+        private String _classFieldName;
+        private int _docsWithClassSize;
+
+        /// <summary>Creates an untrained classifier; call one of the <c>Train</c> overloads before <c>AssignClass</c>.</summary>
+        public SimpleNaiveBayesClassifier()
+        {
+        }
+
+        /// <summary>Train on a single text field, without a filtering query.</summary>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
+        {
+            Train(atomicReader, new[] { textFieldName }, classFieldName, analyzer, null);
+        }
+
+        /// <summary>Train on a single text field; <paramref name="query"/> may be null.</summary>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
+        {
+            Train(atomicReader, new[] { textFieldName }, classFieldName, analyzer, query);
+        }
+
+        /// <summary>Remember the training parameters, open a searcher on the reader and count the documents that have a class.</summary>
+        public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+        {
+            _atomicReader = atomicReader;
+            _indexSearcher = new IndexSearcher(_atomicReader);
+            _textFieldNames = textFieldNames;
+            _classFieldName = classFieldName;
+            _analyzer = analyzer;
+            _query = query;
+            _docsWithClassSize = CountDocsWithClass();
+        }
+
+        /// <summary>Number of documents having a value in the class field: the terms' DocCount, or a wildcard search when that is -1.</summary>
+        private int CountDocsWithClass()
+        {
+            int docsInIndexStats = MultiFields.GetTerms(_atomicReader, _classFieldName).DocCount;
+            if (docsInIndexStats != -1)
+            {
+                return docsInIndexStats;
+            }
+            // in case codec doesn't support getDocCount: count the hits of "class field has any value" (AND the training query, if any)
+            BooleanQuery anyClass = new BooleanQuery();
+            anyClass.Add(new BooleanClause(new WildcardQuery(new Term(_classFieldName, WildcardQuery.WILDCARD_STRING.ToString())), BooleanClause.Occur.MUST));
+            if (_query != null)
+            {
+                anyClass.Add(_query, BooleanClause.Occur.MUST);
+            }
+            TotalHitCountCollector hitCounter = new TotalHitCountCollector();
+            _indexSearcher.Search(anyClass, hitCounter);
+            return hitCounter.TotalHits;
+        }
+
+        /// <summary>Run the analyzer over <paramref name="doc"/> once per text field and return all produced terms, in order.</summary>
+        private String[] TokenizeDoc(String doc)
+        {
+            List<String> tokens = new List<String>();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                TokenStream tokenStream = _analyzer.TokenStream(textFieldName, new StringReader(doc));
+                try
+                {
+                    CharTermAttribute termAttribute = tokenStream.AddAttribute<CharTermAttribute>();
+                    tokenStream.Reset();
+                    while (tokenStream.IncrementToken())
+                    {
+                        tokens.Add(termAttribute.ToString());
+                    }
+                    tokenStream.End();
+                }
+                finally
+                {
+                    IOUtils.CloseWhileHandlingException(tokenStream);
+                }
+            }
+            return tokens.ToArray();
+        }
+
+        /// <summary>Score every term of the class field as log prior + log likelihood and return the best one, scored as 10 / |best log value|.</summary>
+        /// <exception cref="IOException">if no <c>Train</c> overload has been called yet</exception>
+        public ClassificationResult<BytesRef> AssignClass(String inputDocument)
+        {
+            if (_atomicReader == null)
+            {
+                throw new IOException("You must first call Classifier#train");
+            }
+            BytesRef bestClass = new BytesRef();
+            double bestValue = Double.MinValue;
+            TermsEnum classEnum = MultiFields.GetTerms(_atomicReader, _classFieldName).Iterator(null);
+            String[] tokenizedDoc = TokenizeDoc(inputDocument);
+            for (BytesRef candidate = classEnum.Next(); candidate != null; candidate = classEnum.Next())
+            {
+                double logPosterior = CalculateLogPrior(candidate) + CalculateLogLikelihood(tokenizedDoc, candidate);
+                if (logPosterior > bestValue)
+                {
+                    bestValue = logPosterior;
+                    bestClass = BytesRef.DeepCopyOf(candidate);
+                }
+            }
+            return new ClassificationResult<BytesRef>(bestClass, 10 / Math.Abs(bestValue));
+        }
+
+        /// <summary>log(P(d|c)) = log(P(w1|c)) + ... + log(P(wn|c)), each word probability using add 1 smoothing.</summary>
+        private double CalculateLogLikelihood(String[] tokenizedDoc, BytesRef c)
+        {
+            double logLikelihood = 0d;
+            foreach (String word in tokenizedDoc)
+            {
+                // num : hits of "text:word AND class:c", +1 because of add 1 smoothing
+                double num = GetWordFreqForClass(word, c) + 1;
+
+                // den : estimated term count in documents of class c, plus the number of docs with a class (stands in for |V|)
+                double den = GetTextTermFreqForClass(c) + _docsWithClassSize;
+                // P(w|c) = num/den
+                logLikelihood += Math.Log(num / den);
+            }
+            return logLikelihood;
+        }
+
+        /// <summary>Estimated term count for class c: the per-field average of unique terms per doc, summed over the text fields, times the docs with c.</summary>
+        private double GetTextTermFreqForClass(BytesRef c)
+        {
+            double avgUniqueTermsPerDoc = 0;
+            foreach (String textFieldName in _textFieldNames)
+            {
+                Terms fieldTerms = MultiFields.GetTerms(_atomicReader, textFieldName);
+                // SumDocFreq is the number of term/doc pairs, so dividing by DocCount gives the avg # of unique terms per doc
+                avgUniqueTermsPerDoc += fieldTerms.SumDocFreq / (double) fieldTerms.DocCount;
+            }
+            // avg # of unique terms in text fields per doc * # docs with c
+            return avgUniqueTermsPerDoc * _atomicReader.DocFreq(new Term(_classFieldName, c));
+        }
+
+        /// <summary>Number of hits for: word in at least one text field AND class field equals c AND the training query (if any).</summary>
+        private int GetWordFreqForClass(String word, BytesRef c)
+        {
+            BooleanQuery anyTextField = new BooleanQuery();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                anyTextField.Add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.SHOULD));
+            }
+            BooleanQuery wordInClass = new BooleanQuery();
+            wordInClass.Add(new BooleanClause(anyTextField, BooleanClause.Occur.MUST));
+            wordInClass.Add(new BooleanClause(new TermQuery(new Term(_classFieldName, c)), BooleanClause.Occur.MUST));
+            if (_query != null)
+            {
+                wordInClass.Add(_query, BooleanClause.Occur.MUST);
+            }
+            TotalHitCountCollector hitCounter = new TotalHitCountCollector();
+            _indexSearcher.Search(wordInClass, hitCounter);
+            return hitCounter.TotalHits;
+        }
+
+        /// <summary>log(P(c)) = log(# docs with class c) - log(# docs with any class).</summary>
+        private double CalculateLogPrior(BytesRef currentClass)
+        {
+            return Math.Log((double) DocCount(currentClass)) - Math.Log(_docsWithClassSize);
+        }
+
+        /// <summary>Document frequency of <paramref name="countedClass"/> as a term of the class field.</summary>
+        private int DocCount(BytesRef countedClass)
+        {
+            return _atomicReader.DocFreq(new Term(_classFieldName, countedClass));
+        }
+    }
+}
\ No newline at end of file


[2/3] lucenenet git commit: fixes based on feedback (rename classes, fix indents, remove 'this.')

Posted by sy...@apache.org.
fixes based on feedback (rename classes, fix indents, remove 'this.')


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/583627a1
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/583627a1
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/583627a1

Branch: refs/heads/master
Commit: 583627a12799609a4fd1ceed5487805ebd7dc8b6
Parents: aba955c
Author: Laimonas Simutis <la...@gmail.com>
Authored: Sun Dec 7 07:41:02 2014 -0500
Committer: Laimonas Simutis <la...@gmail.com>
Committed: Sun Dec 7 07:41:02 2014 -0500

----------------------------------------------------------------------
 .../ClassificationResult.cs                     |   7 +-
 src/Lucene.Net.Classification/Classifier.cs     |  65 --------
 src/Lucene.Net.Classification/IClassifier.cs    |  65 ++++++++
 .../KNearestNeighborClassifier.cs               | 151 +++++++++++++++++++
 .../KNearesteighborClassifier.cs                | 150 ------------------
 .../Lucene.Net.Classification.csproj            |   4 +-
 .../SimpleNaiveBayesClassifier.cs               |  18 +--
 7 files changed, 231 insertions(+), 229 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/ClassificationResult.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/ClassificationResult.cs b/src/Lucene.Net.Classification/ClassificationResult.cs
index 356ec7e..1db3ed0 100644
--- a/src/Lucene.Net.Classification/ClassificationResult.cs
+++ b/src/Lucene.Net.Classification/ClassificationResult.cs
@@ -21,7 +21,8 @@ namespace Lucene.Net.Classification
     /// The result of a call to {@link Classifier#assignClass(String)} holding an assigned class of type <code>T</code> and a score.
     /// @lucene.experimental
     /// </summary>
-    public class ClassificationResult<T> {
+    public class ClassificationResult<T>
+    {
 
         private readonly T _assignedClass;
         private readonly double _score;
@@ -33,8 +34,8 @@ namespace Lucene.Net.Classification
         /// </summary>
         public ClassificationResult(T assignedClass, double score) 
         {
-            this._assignedClass = assignedClass;
-            this._score = score;
+            _assignedClass = assignedClass;
+            _score = score;
         }
 
         /// <summary>

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/Classifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Classifier.cs b/src/Lucene.Net.Classification/Classifier.cs
deleted file mode 100644
index 6ffca79..0000000
--- a/src/Lucene.Net.Classification/Classifier.cs
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace Lucene.Net.Classification
-{
-    using Lucene.Net.Analysis;
-    using Lucene.Net.Index;
-    using Lucene.Net.Search;
-    using System;
-
-    /// <summary>
-    /// A classifier, see <code>http://en.wikipedia.org/wiki/Classifier_(mathematics)</code>, which assign classes of type
-    /// <code>T</code>
-    /// @lucene.experimental
-    /// </summary>
-    public interface Classifier<T> 
-    {
-        /// <summary>
-        /// Assign a class (with score) to the given text String
-        /// </summary>
-        /// <param name="text">a String containing text to be classified</param>
-        /// <returns>a {ClassificationResult} holding assigned class of type <code>T</code> and score</returns>
-        ClassificationResult<T> AssignClass(String text);
-
-        /// <summary>
-        /// * Train the classifier using the underlying Lucene index
-        /// </summary>
-        /// <param name="analyzer"> the analyzer used to tokenize / filter the unseen text</param>
-        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
-        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
-        /// <param name="textFieldName">the name of the field used to compare documents</param>
-        void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer);
-
-        /// <summary>Train the classifier using the underlying Lucene index</summary>
-        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
-        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
-        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
-        /// <param name="query">the query to filter which documents use for training</param>
-        /// <param name="textFieldName">the name of the field used to compare documents</param>
-        void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query);
-
-        /// <summary>Train the classifier using the underlying Lucene index</summary>
-        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
-        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
-        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
-        /// <param name="query">the query to filter which documents use for training</param>
-        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
-        void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer,
-                   Query query);
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/IClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/IClassifier.cs b/src/Lucene.Net.Classification/IClassifier.cs
new file mode 100644
index 0000000..2e05173
--- /dev/null
+++ b/src/Lucene.Net.Classification/IClassifier.cs
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    using Lucene.Net.Analysis;
+    using Lucene.Net.Index;
+    using Lucene.Net.Search;
+    using System;
+
+    /// <summary>
+    /// A classifier, see <code>http://en.wikipedia.org/wiki/Classifier_(mathematics)</code>, which assigns classes of type
+    /// <code>T</code>
+    /// @lucene.experimental
+    /// </summary>
+    public interface IClassifier<T>
+    {
+        /// <summary>
+        /// Assign a class (with score) to the given text String
+        /// </summary>
+        /// <param name="text">a String containing text to be classified</param>
+        /// <returns>a <see cref="ClassificationResult{T}"/> holding the assigned class of type <code>T</code> and its score</returns>
+        ClassificationResult<T> AssignClass(String text);
+
+        /// <summary>
+        /// Train the classifier using the underlying Lucene index
+        /// </summary>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer);
+
+        /// <summary>Train the classifier using the underlying Lucene index</summary>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="query">the query used to filter which documents are used for training</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query);
+
+        /// <summary>Train the classifier using the underlying Lucene index</summary>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="query">the query used to filter which documents are used for training</param>
+        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+        void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer,
+                   Query query);
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/KNearestNeighborClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/KNearestNeighborClassifier.cs b/src/Lucene.Net.Classification/KNearestNeighborClassifier.cs
new file mode 100644
index 0000000..e400254
--- /dev/null
+++ b/src/Lucene.Net.Classification/KNearestNeighborClassifier.cs
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    using Lucene.Net.Analysis;
+    using Lucene.Net.Index;
+    using Lucene.Net.Queries.Mlt;
+    using Lucene.Net.Search;
+    using Lucene.Net.Util;
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+
+    /// <summary>
+    /// A k-Nearest Neighbor classifier (see <code>http://en.wikipedia.org/wiki/K-nearest_neighbors</code>) based
+    /// on {@link MoreLikeThis}
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public class KNearestNeighborClassifier : IClassifier<BytesRef>
+    {
+
+        private MoreLikeThis _mlt;
+        private String[] _textFieldNames;
+        private String _classFieldName;
+        private IndexSearcher _indexSearcher;
+        private readonly int _k;
+        private Query _query;
+
+        private int _minDocsFreq;
+        private int _minTermFreq;
+
+        /// <summary>Create a {@link Classifier} using kNN algorithm</summary>
+        /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
+        public KNearestNeighborClassifier(int k)
+        {
+            _k = k;
+        }
+
+        /// <summary>Create a {@link Classifier} using kNN algorithm</summary>
+        /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
+        /// <param name="minDocsFreq">the minimum number of docs frequency for MLT to be set with {@link MoreLikeThis#setMinDocFreq(int)}</param>
+        /// <param name="minTermFreq">the minimum number of term frequency for MLT to be set with {@link MoreLikeThis#setMinTermFreq(int)}</param>
+        public KNearestNeighborClassifier(int k, int minDocsFreq, int minTermFreq)
+        {
+            _k = k;
+            _minDocsFreq = minDocsFreq;
+            _minTermFreq = minTermFreq;
+        }
+
+        public ClassificationResult<BytesRef> AssignClass(String text)
+        {
+            if (_mlt == null)
+            {
+                throw new IOException("You must first call Classifier#train");
+            }
+
+            BooleanQuery mltQuery = new BooleanQuery();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                mltQuery.Add(new BooleanClause(_mlt.Like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
+            }
+            Query classFieldQuery = new WildcardQuery(new Term(_classFieldName, "*"));
+            mltQuery.Add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
+            if (_query != null)
+            {
+                mltQuery.Add(_query, BooleanClause.Occur.MUST);
+            }
+            TopDocs topDocs = _indexSearcher.Search(mltQuery, _k);
+            return SelectClassFromNeighbors(topDocs);
+        }
+
+        /// <summary>
+        /// Picks the class that occurs most often among the neighbors in <paramref name="topDocs"/>.
+        /// The score is that class's neighbor count divided by <c>k</c>.
+        /// </summary>
+        private ClassificationResult<BytesRef> SelectClassFromNeighbors(TopDocs topDocs)
+        {
+            // TODO : improve the nearest neighbor selection
+            Dictionary<BytesRef, int> classCounts = new Dictionary<BytesRef, int>();
+
+            foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
+            {
+                BytesRef cl = new BytesRef(_indexSearcher.Doc(scoreDoc.Doc).GetField(_classFieldName).StringValue);
+                // TryGetValue leaves count at 0 for a class not seen yet; reading the
+                // indexer first would throw KeyNotFoundException on the first neighbor.
+                int count;
+                classCounts.TryGetValue(cl, out count);
+                classCounts[cl] = count + 1;
+            }
+            double max = 0;
+            BytesRef assignedClass = new BytesRef();
+            foreach (KeyValuePair<BytesRef, int> entry in classCounts)
+            {
+                int count = entry.Value;
+                if (count > max)
+                {
+                    max = count;
+                    assignedClass = (BytesRef)entry.Key.Clone();
+                }
+            }
+            double score = max / (double) _k;
+            return new ClassificationResult<BytesRef>(assignedClass, score);
+        }
+
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
+        {
+            Train(atomicReader, textFieldName, classFieldName, analyzer, null);
+        }
+
+
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
+        {
+            Train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
+        }
+
+        public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+        {
+            _textFieldNames = textFieldNames;
+            _classFieldName = classFieldName;
+            _mlt = new MoreLikeThis(atomicReader);
+            _mlt.Analyzer = analyzer;
+            _mlt.FieldNames = _textFieldNames;
+            _indexSearcher = new IndexSearcher(atomicReader);
+            if (_minDocsFreq > 0)
+            {
+                _mlt.MinDocFreq = _minDocsFreq;
+            }
+            if (_minTermFreq > 0)
+            {
+                _mlt.MinTermFreq = _minTermFreq;
+            }
+            _query = query;
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/KNearesteighborClassifier.cs b/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
deleted file mode 100644
index c83301e..0000000
--- a/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace Lucene.Net.Classification
-{
-    using Lucene.Net.Analysis;
-    using Lucene.Net.Index;
-    using Lucene.Net.Queries.Mlt;
-    using Lucene.Net.Search;
-    using Lucene.Net.Util;
-    using System;
-    using System.Collections.Generic;
-    using System.IO;
-
-    /// <summary>
-    /// A k-Nearest Neighbor classifier (see <code>http://en.wikipedia.org/wiki/K-nearest_neighbors</code>) based
-    /// on {@link MoreLikeThis}
-    /// 
-    /// @lucene.experimental
-    /// </summary>
-    public class KNearestNeighborClassifier : Classifier<BytesRef> 
-    {
-
-        private MoreLikeThis _mlt;
-        private String[] _textFieldNames;
-        private String _classFieldName;
-        private IndexSearcher _indexSearcher;
-        private readonly int _k;
-        private Query _query;
-
-        private int _minDocsFreq;
-        private int _minTermFreq;
-
-        /// <summary>Create a {@link Classifier} using kNN algorithm</summary>
-        /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
-        public KNearestNeighborClassifier(int k) 
-        {
-        this._k = k;
-        }
-
-        /// <summary>Create a {@link Classifier} using kNN algorithm</summary>
-        /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
-        /// <param name="minDocsFreq">the minimum number of docs frequency for MLT to be set with {@link MoreLikeThis#setMinDocFreq(int)}</param>
-        /// <param name="minTermFreq">the minimum number of term frequency for MLT to be set with {@link MoreLikeThis#setMinTermFreq(int)}</param>
-        public KNearestNeighborClassifier(int k, int minDocsFreq, int minTermFreq) 
-        {
-        this._k = k;
-        this._minDocsFreq = minDocsFreq;
-        this._minTermFreq = minTermFreq;
-        }
-
-        public ClassificationResult<BytesRef> AssignClass(String text)
-        {
-            if (_mlt == null) 
-            {
-                throw new IOException("You must first call Classifier#train");
-            }
-
-            BooleanQuery mltQuery = new BooleanQuery();
-            foreach (String textFieldName in _textFieldNames) 
-            {
-                mltQuery.Add(new BooleanClause(_mlt.Like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
-            }
-            Query classFieldQuery = new WildcardQuery(new Term(_classFieldName, "*"));
-            mltQuery.Add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
-            if (_query != null) {
-                mltQuery.Add(_query, BooleanClause.Occur.MUST);
-            }
-            TopDocs topDocs = _indexSearcher.Search(mltQuery, _k);
-            return SelectClassFromNeighbors(topDocs);
-        }
-
-        private ClassificationResult<BytesRef> SelectClassFromNeighbors(TopDocs topDocs) 
-        {
-            // TODO : improve the nearest neighbor selection
-            Dictionary<BytesRef, int> classCounts = new Dictionary<BytesRef, int>();
-
-            foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs) 
-            {
-                BytesRef cl = new BytesRef(_indexSearcher.Doc(scoreDoc.Doc).GetField(_classFieldName).StringValue);
-                int count = classCounts[cl];
-                if (classCounts.ContainsKey(cl))
-                {
-                    classCounts[cl] = count + 1;
-                } 
-                else 
-                {
-                    classCounts.Add(cl, 1);
-                }
-            }
-            double max = 0;
-            BytesRef assignedClass = new BytesRef();
-            foreach (KeyValuePair<BytesRef, int> entry in classCounts) 
-            {
-                int count = entry.Value;
-                if (count > max) 
-                {
-                    max = count;
-                    assignedClass = (BytesRef)entry.Key.Clone();
-                }
-            }
-            double score = max / (double) _k;
-            return new ClassificationResult<BytesRef>(assignedClass, score);
-        }
-
-        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) 
-        {
-            Train(atomicReader, textFieldName, classFieldName, analyzer, null);
-        }
-
-
-        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query) 
-        {
-            Train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
-        }
-
-        public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
-        {
-            this._textFieldNames = textFieldNames;
-            this._classFieldName = classFieldName;
-            _mlt = new MoreLikeThis(atomicReader);
-            _mlt.Analyzer = analyzer;
-            _mlt.FieldNames = _textFieldNames;
-            _indexSearcher = new IndexSearcher(atomicReader);
-            if (_minDocsFreq > 0) 
-            {
-                _mlt.MinDocFreq = _minDocsFreq;
-            }
-            if (_minTermFreq > 0) 
-            {
-                _mlt.MinTermFreq = _minTermFreq;
-            }
-            this._query = query;
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
index e0bf2e9..8d31ed5 100644
--- a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
@@ -40,8 +40,8 @@
   </ItemGroup>
   <ItemGroup>
     <Compile Include="ClassificationResult.cs" />
-    <Compile Include="Classifier.cs" />
-    <Compile Include="KNearesteighborClassifier.cs" />
+    <Compile Include="IClassifier.cs" />
+    <Compile Include="KNearestNeighborClassifier.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
     <Compile Include="SimpleNaiveBayesClassifier.cs" />
   </ItemGroup>

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
index 0980d58..a045c80 100644
--- a/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
+++ b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
@@ -31,7 +31,7 @@ namespace Lucene.Net.Classification
     ///
     /// @lucene.experimental
     /// </summary>
-    public class SimpleNaiveBayesClassifier : Classifier<BytesRef> 
+    public class SimpleNaiveBayesClassifier : IClassifier<BytesRef>
     {
         private AtomicReader _atomicReader;
         private String[] _textFieldNames;
@@ -57,18 +57,18 @@ namespace Lucene.Net.Classification
 
         public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
         {
-            this._atomicReader = atomicReader;
-            this._indexSearcher = new IndexSearcher(this._atomicReader);
-            this._textFieldNames = textFieldNames;
-            this._classFieldName = classFieldName;
-            this._analyzer = analyzer;
-            this._query = query;
-            this._docsWithClassSize = CountDocsWithClass();
+            _atomicReader = atomicReader;
+            _indexSearcher = new IndexSearcher(_atomicReader);
+            _textFieldNames = textFieldNames;
+            _classFieldName = classFieldName;
+            _analyzer = analyzer;
+            _query = query;
+            _docsWithClassSize = CountDocsWithClass();
         }
 
         private int CountDocsWithClass() 
         {
-            int docCount = MultiFields.GetTerms(this._atomicReader, this._classFieldName).DocCount;
+            int docCount = MultiFields.GetTerms(_atomicReader, _classFieldName).DocCount;
             if (docCount == -1) 
             { // in case codec doesn't support getDocCount
                 TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();


[3/3] lucenenet git commit: Bring Query.Equals() to be back in par with the Java impl

Posted by sy...@apache.org.
Bring Query.Equals() to be back in par with the Java impl


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/c0c10195
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/c0c10195
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/c0c10195

Branch: refs/heads/master
Commit: c0c101953e14398a8e3f01f185a3ecfcecbc1609
Parents: 583627a
Author: Itamar Syn-Hershko <it...@code972.com>
Authored: Mon Dec 22 15:09:35 2014 +0200
Committer: Itamar Syn-Hershko <it...@code972.com>
Committed: Mon Dec 22 15:09:35 2014 +0200

----------------------------------------------------------------------
 src/Lucene.Net.Core/Search/Query.cs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/c0c10195/src/Lucene.Net.Core/Search/Query.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Search/Query.cs b/src/Lucene.Net.Core/Search/Query.cs
index d553af6..1448093 100644
--- a/src/Lucene.Net.Core/Search/Query.cs
+++ b/src/Lucene.Net.Core/Search/Query.cs
@@ -134,12 +134,17 @@ namespace Lucene.Net.Search
                 return true;
             }
 
+            if (GetType() != obj.GetType())
+            {
+                return false;
+            }
+
             var other = obj as Query;
             if (other == null)
             {
                 return false;
             }
-           
+
             if (Number.FloatToIntBits(Boost) != Number.FloatToIntBits(other.Boost))
             {
                 return false;