You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/12/22 14:10:01 UTC
[1/3] lucenenet git commit: port of lucene-solr/lucene/classification
w/o tests
Repository: lucenenet
Updated Branches:
refs/heads/master 2d7533d4e -> c0c101953
port of lucene-solr/lucene/classification w/o tests
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/aba955ce
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/aba955ce
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/aba955ce
Branch: refs/heads/master
Commit: aba955ce29c81a2e943246acb1df7940fe3d7483
Parents: 2d7533d
Author: Laimonas Simutis <la...@gmail.com>
Authored: Sat Dec 6 22:23:30 2014 -0500
Committer: Laimonas Simutis <la...@gmail.com>
Committed: Sat Dec 6 22:23:30 2014 -0500
----------------------------------------------------------------------
.../ClassificationResult.cs | 64 ++++++
src/Lucene.Net.Classification/Classifier.cs | 65 ++++++
.../KNearesteighborClassifier.cs | 150 ++++++++++++++
.../Lucene.Net.Classification.csproj | 66 ++++++
.../Lucene.Net.Classification.sln | 54 +++++
.../Properties/AssemblyInfo.cs | 36 ++++
.../SimpleNaiveBayesClassifier.cs | 205 +++++++++++++++++++
7 files changed, 640 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/ClassificationResult.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/ClassificationResult.cs b/src/Lucene.Net.Classification/ClassificationResult.cs
new file mode 100644
index 0000000..356ec7e
--- /dev/null
+++ b/src/Lucene.Net.Classification/ClassificationResult.cs
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+ /// <summary>
+ /// The result of a call to <see cref="Classifier{T}.AssignClass(string)"/> holding an assigned class of type <code>T</code> and a score.
+ /// @lucene.experimental
+ /// </summary>
+    public class ClassificationResult<T>
+    {
+        /// <summary>
+        /// Creates a result that pairs an assigned class with its score.
+        /// </summary>
+        /// <param name="assignedClass">the class of type <typeparamref name="T"/> assigned by a classifier</param>
+        /// <param name="score">the score for <paramref name="assignedClass"/> as a <code>double</code></param>
+        public ClassificationResult(T assignedClass, double score)
+        {
+            AssignedClass = assignedClass;
+            Score = score;
+        }
+
+        /// <summary>
+        /// Gets the assigned class. The value is set once by the constructor.
+        /// </summary>
+        public T AssignedClass { get; private set; }
+
+        /// <summary>
+        /// Gets the score of the assigned class. The value is set once by the constructor.
+        /// </summary>
+        public double Score { get; private set; }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Classifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Classifier.cs b/src/Lucene.Net.Classification/Classifier.cs
new file mode 100644
index 0000000..6ffca79
--- /dev/null
+++ b/src/Lucene.Net.Classification/Classifier.cs
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+ using Lucene.Net.Analysis;
+ using Lucene.Net.Index;
+ using Lucene.Net.Search;
+ using System;
+
+ /// <summary>
+ /// A classifier, see <code>http://en.wikipedia.org/wiki/Classifier_(mathematics)</code>, which assigns classes of type
+ /// <code>T</code>
+ /// @lucene.experimental
+ /// </summary>
+    public interface Classifier<T>
+    {
+        /// <summary>
+        /// Assigns a class (with a score) to the given text.
+        /// </summary>
+        /// <param name="text">the text to be classified</param>
+        /// <returns>a <see cref="ClassificationResult{T}"/> holding the assigned class of type <typeparamref name="T"/> and its score</returns>
+        ClassificationResult<T> AssignClass(string text);
+
+        /// <summary>
+        /// Trains the classifier using the underlying Lucene index.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        void Train(AtomicReader atomicReader, string textFieldName, string classFieldName, Analyzer analyzer);
+
+        /// <summary>
+        /// Trains the classifier using the underlying Lucene index, with a query that
+        /// filters which documents are used for training.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query to filter which documents to use for training</param>
+        void Train(AtomicReader atomicReader, string textFieldName, string classFieldName, Analyzer analyzer, Query query);
+
+        /// <summary>
+        /// Trains the classifier using the underlying Lucene index, comparing documents on
+        /// several text fields, with a query that filters which documents are used for training.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query to filter which documents to use for training</param>
+        void Train(AtomicReader atomicReader, string[] textFieldNames, string classFieldName, Analyzer analyzer, Query query);
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/KNearesteighborClassifier.cs b/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
new file mode 100644
index 0000000..c83301e
--- /dev/null
+++ b/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+ using Lucene.Net.Analysis;
+ using Lucene.Net.Index;
+ using Lucene.Net.Queries.Mlt;
+ using Lucene.Net.Search;
+ using Lucene.Net.Util;
+ using System;
+ using System.Collections.Generic;
+ using System.IO;
+
+ /// <summary>
+ /// A k-Nearest Neighbor classifier (see <code>http://en.wikipedia.org/wiki/K-nearest_neighbors</code>) based
+ /// on {@link MoreLikeThis}
+ ///
+ /// @lucene.experimental
+ /// </summary>
+    public class KNearestNeighborClassifier : Classifier<BytesRef>
+    {
+        // State populated by Train; _mlt doubles as the "has been trained" marker.
+        private MoreLikeThis _mlt;
+        private String[] _textFieldNames;
+        private String _classFieldName;
+        private IndexSearcher _indexSearcher;
+        private Query _query;
+
+        // Configuration fixed at construction time.
+        private readonly int _k;
+        private readonly int _minDocsFreq;
+        private readonly int _minTermFreq;
+
+        /// <summary>Creates a classifier using the kNN algorithm.</summary>
+        /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
+        public KNearestNeighborClassifier(int k)
+        {
+            _k = k;
+        }
+
+        /// <summary>Creates a classifier using the kNN algorithm.</summary>
+        /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
+        /// <param name="minDocsFreq">minimum document frequency applied to MoreLikeThis.MinDocFreq during training; only applied when greater than zero</param>
+        /// <param name="minTermFreq">minimum term frequency applied to MoreLikeThis.MinTermFreq during training; only applied when greater than zero</param>
+        public KNearestNeighborClassifier(int k, int minDocsFreq, int minTermFreq)
+        {
+            _k = k;
+            _minDocsFreq = minDocsFreq;
+            _minTermFreq = minTermFreq;
+        }
+
+        /// <summary>
+        /// Assigns a class to the given text by searching for the <code>k</code> most similar
+        /// documents and picking the class that occurs most often among them.
+        /// </summary>
+        /// <param name="text">the text to be classified</param>
+        /// <returns>the most frequent class among the neighbors, scored as its count divided by <code>k</code></returns>
+        /// <exception cref="IOException">if none of the Train overloads has been called yet</exception>
+        public ClassificationResult<BytesRef> AssignClass(String text)
+        {
+            if (_mlt == null)
+            {
+                throw new IOException("You must first call Classifier#train");
+            }
+
+            BooleanQuery mltQuery = new BooleanQuery();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                // One optional "more like this" clause per text field.
+                mltQuery.Add(new BooleanClause(_mlt.Like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
+            }
+
+            // Only documents that carry some value in the class field may be neighbors.
+            Query classFieldQuery = new WildcardQuery(new Term(_classFieldName, "*"));
+            mltQuery.Add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
+            if (_query != null)
+            {
+                mltQuery.Add(_query, BooleanClause.Occur.MUST);
+            }
+
+            TopDocs topDocs = _indexSearcher.Search(mltQuery, _k);
+            return SelectClassFromNeighbors(topDocs);
+        }
+
+        /// <summary>
+        /// Counts the classes of the given neighbor documents and returns the most frequent one.
+        /// </summary>
+        /// <param name="topDocs">the neighbor documents found by the similarity search</param>
+        /// <returns>
+        /// the most frequent class with score <code>count / k</code>; an empty <code>BytesRef</code>
+        /// with score 0 when there are no neighbors
+        /// </returns>
+        private ClassificationResult<BytesRef> SelectClassFromNeighbors(TopDocs topDocs)
+        {
+            // TODO : improve the nearest neighbor selection
+            Dictionary<BytesRef, int> classCounts = new Dictionary<BytesRef, int>();
+
+            foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
+            {
+                BytesRef cl = new BytesRef(_indexSearcher.Doc(scoreDoc.Doc).GetField(_classFieldName).StringValue);
+
+                // The Dictionary indexer throws KeyNotFoundException for a missing key, so it
+                // must not be read before the class has been seen. TryGetValue leaves count
+                // at 0 for a new class, so a single lookup covers both cases.
+                int count;
+                classCounts.TryGetValue(cl, out count);
+                classCounts[cl] = count + 1;
+            }
+
+            double max = 0;
+            BytesRef assignedClass = new BytesRef();
+            foreach (KeyValuePair<BytesRef, int> entry in classCounts)
+            {
+                int count = entry.Value;
+                if (count > max)
+                {
+                    max = count;
+                    assignedClass = (BytesRef)entry.Key.Clone();
+                }
+            }
+
+            double score = max / (double) _k;
+            return new ClassificationResult<BytesRef>(assignedClass, score);
+        }
+
+        /// <summary>Trains the classifier on a single text field without a filter query.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer handed to MoreLikeThis</param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
+        {
+            Train(atomicReader, textFieldName, classFieldName, analyzer, null);
+        }
+
+        /// <summary>Trains the classifier on a single text field.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer handed to MoreLikeThis</param>
+        /// <param name="query">optional query every neighbor must match; may be <code>null</code></param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
+        {
+            Train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
+        }
+
+        /// <summary>
+        /// Trains the classifier: stores the field names and query, and builds the
+        /// MoreLikeThis instance and index searcher used by <code>AssignClass</code>.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer handed to MoreLikeThis</param>
+        /// <param name="query">optional query every neighbor must match; may be <code>null</code></param>
+        public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+        {
+            _textFieldNames = textFieldNames;
+            _classFieldName = classFieldName;
+            _mlt = new MoreLikeThis(atomicReader);
+            _mlt.Analyzer = analyzer;
+            _mlt.FieldNames = _textFieldNames;
+            _indexSearcher = new IndexSearcher(atomicReader);
+
+            // Thresholds of zero or less keep the MoreLikeThis defaults.
+            if (_minDocsFreq > 0)
+            {
+                _mlt.MinDocFreq = _minDocsFreq;
+            }
+            if (_minTermFreq > 0)
+            {
+                _mlt.MinTermFreq = _minTermFreq;
+            }
+            _query = query;
+        }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
new file mode 100644
index 0000000..e0bf2e9
--- /dev/null
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <ProjectGuid>{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}</ProjectGuid>
+ <OutputType>Library</OutputType>
+ <AppDesignerFolder>Properties</AppDesignerFolder>
+ <RootNamespace>Lucene.Net.Classification</RootNamespace>
+ <AssemblyName>Lucene.Net.Classification</AssemblyName>
+ <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
+ <FileAlignment>512</FileAlignment>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+ <DebugSymbols>true</DebugSymbols>
+ <DebugType>full</DebugType>
+ <Optimize>false</Optimize>
+ <OutputPath>bin\Debug\</OutputPath>
+ <DefineConstants>DEBUG;TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+ <DebugType>pdbonly</DebugType>
+ <Optimize>true</Optimize>
+ <OutputPath>bin\Release\</OutputPath>
+ <DefineConstants>TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <ItemGroup>
+ <Reference Include="System" />
+ <Reference Include="System.Core" />
+ <Reference Include="System.Xml.Linq" />
+ <Reference Include="System.Data.DataSetExtensions" />
+ <Reference Include="Microsoft.CSharp" />
+ <Reference Include="System.Data" />
+ <Reference Include="System.Xml" />
+ </ItemGroup>
+ <ItemGroup>
+ <Compile Include="ClassificationResult.cs" />
+ <Compile Include="Classifier.cs" />
+ <Compile Include="KNearesteighborClassifier.cs" />
+ <Compile Include="Properties\AssemblyInfo.cs" />
+ <Compile Include="SimpleNaiveBayesClassifier.cs" />
+ </ItemGroup>
+ <ItemGroup>
+ <ProjectReference Include="..\Lucene.Net.Core\Lucene.Net.csproj">
+ <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project>
+ <Name>Lucene.Net</Name>
+ </ProjectReference>
+ <ProjectReference Include="..\Lucene.Net.Queries\Lucene.Net.Queries.csproj">
+ <Project>{69D7956C-C2CC-4708-B399-A188FEC384C4}</Project>
+ <Name>Lucene.Net.Queries</Name>
+ </ProjectReference>
+ </ItemGroup>
+ <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+ <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
+ Other similar extension points exist, see Microsoft.Common.targets.
+ <Target Name="BeforeBuild">
+ </Target>
+ <Target Name="AfterBuild">
+ </Target>
+ -->
+</Project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Lucene.Net.Classification.sln
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.sln b/src/Lucene.Net.Classification/Lucene.Net.Classification.sln
new file mode 100644
index 0000000..9965049
--- /dev/null
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.sln
@@ -0,0 +1,54 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Classification", "Lucene.Net.Classification.csproj", "{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net", "..\Lucene.Net.Core\Lucene.Net.csproj", "{5D4AD9BE-1FFB-41AB-9943-25737971BF57}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Queries", "..\Lucene.Net.Queries\Lucene.Net.Queries.csproj", "{69D7956C-C2CC-4708-B399-A188FEC384C4}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Debug|Mixed Platforms = Debug|Mixed Platforms
+ Debug|x86 = Debug|x86
+ Release|Any CPU = Release|Any CPU
+ Release|Mixed Platforms = Release|Mixed Platforms
+ Release|x86 = Release|x86
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Any CPU.Build.0 = Release|Any CPU
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+ {E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|x86.ActiveCfg = Release|Any CPU
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.ActiveCfg = Debug|x86
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Mixed Platforms.ActiveCfg = Debug|x86
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Mixed Platforms.Build.0 = Debug|x86
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|x86.ActiveCfg = Debug|x86
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|x86.Build.0 = Debug|x86
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.ActiveCfg = Release|x86
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Mixed Platforms.ActiveCfg = Release|x86
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Mixed Platforms.Build.0 = Release|x86
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|x86.ActiveCfg = Release|x86
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|x86.Build.0 = Release|x86
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Any CPU.Build.0 = Release|Any CPU
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+ {69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|x86.ActiveCfg = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs b/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..ede45e1
--- /dev/null
+++ b/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Lucene.Net.Classification")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("Lucene.Net.Classification")]
+[assembly: AssemblyCopyright("Copyright © 2014")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("ff6180c7-579d-4557-bf6a-ddd139fad2e4")]
+
+// Version information for an assembly consists of the following four values:
+//
+// Major Version
+// Minor Version
+// Build Number
+// Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
new file mode 100644
index 0000000..0980d58
--- /dev/null
+++ b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+ using Lucene.Net.Analysis;
+ using Lucene.Net.Analysis.Tokenattributes;
+ using Lucene.Net.Index;
+ using Lucene.Net.Search;
+ using Lucene.Net.Util;
+ using System;
+ using System.Collections.Generic;
+ using System.IO;
+
+ /// <summary>
+ /// A simplistic Lucene based NaiveBayes classifier, see <code>http://en.wikipedia.org/wiki/Naive_Bayes_classifier</code>
+ ///
+ /// @lucene.experimental
+ /// </summary>
+    public class SimpleNaiveBayesClassifier : Classifier<BytesRef>
+    {
+        // State populated by Train; _atomicReader doubles as the "has been trained" marker.
+        private AtomicReader _atomicReader;
+        private String[] _textFieldNames;
+        private String _classFieldName;
+        private int _docsWithClassSize;
+        private Analyzer _analyzer;
+        private IndexSearcher _indexSearcher;
+        private Query _query;
+
+        /// <summary>
+        /// Creates an untrained classifier; one of the Train overloads must be called
+        /// before <code>AssignClass</code>.
+        /// </summary>
+        public SimpleNaiveBayesClassifier()
+        {
+        }
+
+        /// <summary>Trains the classifier on a single text field without a filter query.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize the text to classify</param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
+        {
+            Train(atomicReader, textFieldName, classFieldName, analyzer, null);
+        }
+
+        /// <summary>Trains the classifier on a single text field.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize the text to classify</param>
+        /// <param name="query">optional query added as a required clause to the searches; may be <code>null</code></param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
+        {
+            Train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
+        }
+
+        /// <summary>
+        /// Trains the classifier: stores the reader, field names, analyzer and query, creates
+        /// the index searcher and counts the documents that have a class.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize the text to classify</param>
+        /// <param name="query">optional query added as a required clause to the searches; may be <code>null</code></param>
+        public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+        {
+            _atomicReader = atomicReader;
+            _indexSearcher = new IndexSearcher(_atomicReader);
+            _textFieldNames = textFieldNames;
+            _classFieldName = classFieldName;
+            _analyzer = analyzer;
+            _query = query;
+            // Must run last: the count reads the reader, class field, query and searcher set above.
+            _docsWithClassSize = CountDocsWithClass();
+        }
+
+        /// <summary>
+        /// Returns the number of documents that have a value in the class field.
+        /// </summary>
+        private int CountDocsWithClass()
+        {
+            int docCount = MultiFields.GetTerms(_atomicReader, _classFieldName).DocCount;
+            if (docCount == -1)
+            { // in case codec doesn't support getDocCount
+                // Fall back to counting the hits of a wildcard search on the class field.
+                TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
+                BooleanQuery q = new BooleanQuery();
+                q.Add(new BooleanClause(new WildcardQuery(new Term(_classFieldName, WildcardQuery.WILDCARD_STRING.ToString())), BooleanClause.Occur.MUST));
+                if (_query != null)
+                {
+                    q.Add(_query, BooleanClause.Occur.MUST);
+                }
+                _indexSearcher.Search(q, totalHitCountCollector);
+                docCount = totalHitCountCollector.TotalHits;
+            }
+            return docCount;
+        }
+
+        /// <summary>
+        /// Runs the analyzer over the text once per configured text field and returns all
+        /// produced terms, in order.
+        /// </summary>
+        /// <param name="doc">the text to tokenize</param>
+        private String[] TokenizeDoc(String doc)
+        {
+            List<string> result = new List<string>();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                TokenStream tokenStream = _analyzer.TokenStream(textFieldName, new StringReader(doc));
+                try
+                {
+                    CharTermAttribute charTermAttribute = tokenStream.AddAttribute<CharTermAttribute>();
+                    tokenStream.Reset();
+                    while (tokenStream.IncrementToken())
+                    {
+                        result.Add(charTermAttribute.ToString());
+                    }
+                    tokenStream.End();
+                }
+                finally
+                {
+                    IOUtils.CloseWhileHandlingException(tokenStream);
+                }
+            }
+            return result.ToArray();
+        }
+
+        /// <summary>
+        /// Assigns to the given text the class with the highest
+        /// <code>log prior + log likelihood</code> among all terms of the class field.
+        /// </summary>
+        /// <param name="inputDocument">the text to be classified</param>
+        /// <returns>the best class found, scored as <code>10 / |best log value|</code></returns>
+        /// <exception cref="IOException">if none of the Train overloads has been called yet</exception>
+        public ClassificationResult<BytesRef> AssignClass(String inputDocument)
+        {
+            if (_atomicReader == null)
+            {
+                throw new IOException("You must first call Classifier#train");
+            }
+            double max = -Double.MaxValue;
+            BytesRef foundClass = new BytesRef();
+
+            Terms terms = MultiFields.GetTerms(_atomicReader, _classFieldName);
+            TermsEnum termsEnum = terms.Iterator(null);
+            BytesRef next;
+            String[] tokenizedDoc = TokenizeDoc(inputDocument);
+            while ((next = termsEnum.Next()) != null)
+            {
+                double clVal = CalculateLogPrior(next) + CalculateLogLikelihood(tokenizedDoc, next);
+                if (clVal > max)
+                {
+                    max = clVal;
+                    // Copy the term before the enumeration moves on to the next one.
+                    foundClass = BytesRef.DeepCopyOf(next);
+                }
+            }
+            double score = 10 / Math.Abs(max);
+            return new ClassificationResult<BytesRef>(foundClass, score);
+        }
+
+        /// <summary>
+        /// Computes <code>log(P(d|c)) = log(P(w1|c)) + ... + log(P(wn|c))</code> for the
+        /// tokenized document and class <paramref name="c"/>.
+        /// </summary>
+        /// <param name="tokenizedDoc">the terms of the document to classify</param>
+        /// <param name="c">the class to evaluate</param>
+        private double CalculateLogLikelihood(String[] tokenizedDoc, BytesRef c)
+        {
+            // No words, no terms to sum; return before touching the index.
+            if (tokenizedDoc.Length == 0)
+            {
+                return 0d;
+            }
+
+            // den : estimated term count for documents of class c plus the number of documents
+            // with a class. It depends only on c, so it is computed once rather than per word.
+            double den = GetTextTermFreqForClass(c) + _docsWithClassSize;
+
+            double result = 0d;
+            foreach (String word in tokenizedDoc)
+            {
+                // search with text:word AND class:c
+                int hits = GetWordFreqForClass(word, c);
+
+                // num : number of documents of class c containing the word, +1 for add 1 smoothing
+                double num = hits + 1;
+
+                // P(w|c) = num/den
+                double wordProbability = num / den;
+                result += Math.Log(wordProbability);
+            }
+            return result;
+        }
+
+        /// <summary>
+        /// Estimates the number of terms in documents of class <paramref name="c"/> as the
+        /// average number of unique terms per document (summed over the text fields)
+        /// multiplied by the number of documents with that class.
+        /// </summary>
+        /// <param name="c">the class to evaluate</param>
+        private double GetTextTermFreqForClass(BytesRef c)
+        {
+            double avgNumberOfUniqueTerms = 0;
+            foreach (String textFieldName in _textFieldNames)
+            {
+                Terms terms = MultiFields.GetTerms(_atomicReader, textFieldName);
+                long numPostings = terms.SumDocFreq; // number of term/doc pairs
+                avgNumberOfUniqueTerms += numPostings / (double) terms.DocCount; // avg # of unique terms per doc
+            }
+            int docsWithC = _atomicReader.DocFreq(new Term(_classFieldName, c));
+            return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
+        }
+
+        /// <summary>
+        /// Counts the documents of class <paramref name="c"/> that contain
+        /// <paramref name="word"/> in at least one of the text fields (and match the
+        /// training query, when one was given).
+        /// </summary>
+        /// <param name="word">the term to look for</param>
+        /// <param name="c">the class the documents must have</param>
+        private int GetWordFreqForClass(String word, BytesRef c)
+        {
+            BooleanQuery booleanQuery = new BooleanQuery();
+            BooleanQuery subQuery = new BooleanQuery();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                subQuery.Add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.SHOULD));
+            }
+            booleanQuery.Add(new BooleanClause(subQuery, BooleanClause.Occur.MUST));
+            booleanQuery.Add(new BooleanClause(new TermQuery(new Term(_classFieldName, c)), BooleanClause.Occur.MUST));
+            if (_query != null)
+            {
+                booleanQuery.Add(_query, BooleanClause.Occur.MUST);
+            }
+            TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
+            _indexSearcher.Search(booleanQuery, totalHitCountCollector);
+            return totalHitCountCollector.TotalHits;
+        }
+
+        /// <summary>
+        /// Computes <code>log(P(c))</code> as the log of the number of documents with the
+        /// class minus the log of the number of documents with any class.
+        /// </summary>
+        /// <param name="currentClass">the class to evaluate</param>
+        private double CalculateLogPrior(BytesRef currentClass)
+        {
+            return Math.Log((double) DocCount(currentClass)) - Math.Log(_docsWithClassSize);
+        }
+
+        /// <summary>
+        /// Returns the document frequency of the given class term in the class field.
+        /// </summary>
+        /// <param name="countedClass">the class to count</param>
+        private int DocCount(BytesRef countedClass)
+        {
+            return _atomicReader.DocFreq(new Term(_classFieldName, countedClass));
+        }
+    }
+}
\ No newline at end of file
[2/3] lucenenet git commit: fixes based on feedback (rename classes,
fix indents, remove 'this.')
Posted by sy...@apache.org.
fixes based on feedback (rename classes, fix indents, remove 'this.')
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/583627a1
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/583627a1
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/583627a1
Branch: refs/heads/master
Commit: 583627a12799609a4fd1ceed5487805ebd7dc8b6
Parents: aba955c
Author: Laimonas Simutis <la...@gmail.com>
Authored: Sun Dec 7 07:41:02 2014 -0500
Committer: Laimonas Simutis <la...@gmail.com>
Committed: Sun Dec 7 07:41:02 2014 -0500
----------------------------------------------------------------------
.../ClassificationResult.cs | 7 +-
src/Lucene.Net.Classification/Classifier.cs | 65 --------
src/Lucene.Net.Classification/IClassifier.cs | 65 ++++++++
.../KNearestNeighborClassifier.cs | 151 +++++++++++++++++++
.../KNearesteighborClassifier.cs | 150 ------------------
.../Lucene.Net.Classification.csproj | 4 +-
.../SimpleNaiveBayesClassifier.cs | 18 +--
7 files changed, 231 insertions(+), 229 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/ClassificationResult.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/ClassificationResult.cs b/src/Lucene.Net.Classification/ClassificationResult.cs
index 356ec7e..1db3ed0 100644
--- a/src/Lucene.Net.Classification/ClassificationResult.cs
+++ b/src/Lucene.Net.Classification/ClassificationResult.cs
@@ -21,7 +21,8 @@ namespace Lucene.Net.Classification
/// The result of a call to {@link Classifier#assignClass(String)} holding an assigned class of type <code>T</code> and a score.
/// @lucene.experimental
/// </summary>
- public class ClassificationResult<T> {
+ public class ClassificationResult<T>
+ {
private readonly T _assignedClass;
private readonly double _score;
@@ -33,8 +34,8 @@ namespace Lucene.Net.Classification
/// </summary>
public ClassificationResult(T assignedClass, double score)
{
- this._assignedClass = assignedClass;
- this._score = score;
+ _assignedClass = assignedClass;
+ _score = score;
}
/// <summary>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/Classifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Classifier.cs b/src/Lucene.Net.Classification/Classifier.cs
deleted file mode 100644
index 6ffca79..0000000
--- a/src/Lucene.Net.Classification/Classifier.cs
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace Lucene.Net.Classification
-{
- using Lucene.Net.Analysis;
- using Lucene.Net.Index;
- using Lucene.Net.Search;
- using System;
-
- /// <summary>
- /// A classifier, see <code>http://en.wikipedia.org/wiki/Classifier_(mathematics)</code>, which assign classes of type
- /// <code>T</code>
- /// @lucene.experimental
- /// </summary>
- public interface Classifier<T>
- {
- /// <summary>
- /// Assign a class (with score) to the given text String
- /// </summary>
- /// <param name="text">a String containing text to be classified</param>
- /// <returns>a {ClassificationResult} holding assigned class of type <code>T</code> and score</returns>
- ClassificationResult<T> AssignClass(String text);
-
- /// <summary>
- /// * Train the classifier using the underlying Lucene index
- /// </summary>
- /// <param name="analyzer"> the analyzer used to tokenize / filter the unseen text</param>
- /// <param name="atomicReader">the reader to use to access the Lucene index</param>
- /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
- /// <param name="textFieldName">the name of the field used to compare documents</param>
- void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer);
-
- /// <summary>Train the classifier using the underlying Lucene index</summary>
- /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
- /// <param name="atomicReader">the reader to use to access the Lucene index</param>
- /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
- /// <param name="query">the query to filter which documents use for training</param>
- /// <param name="textFieldName">the name of the field used to compare documents</param>
- void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query);
-
- /// <summary>Train the classifier using the underlying Lucene index</summary>
- /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
- /// <param name="atomicReader">the reader to use to access the Lucene index</param>
- /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
- /// <param name="query">the query to filter which documents use for training</param>
- /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
- void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer,
- Query query);
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/IClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/IClassifier.cs b/src/Lucene.Net.Classification/IClassifier.cs
new file mode 100644
index 0000000..2e05173
--- /dev/null
+++ b/src/Lucene.Net.Classification/IClassifier.cs
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+ using Lucene.Net.Analysis;
+ using Lucene.Net.Index;
+ using Lucene.Net.Search;
+ using System;
+
+ /// <summary>
+ /// A classifier (see <c>http://en.wikipedia.org/wiki/Classifier_(mathematics)</c>) which assigns classes
+ /// of type <typeparamref name="T"/>.
+ /// @lucene.experimental
+ /// </summary>
+ /// <typeparam name="T">the type used to represent an assigned class</typeparam>
+ public interface IClassifier<T>
+ {
+ /// <summary>
+ /// Assigns a class (with a score) to the given text.
+ /// </summary>
+ /// <param name="text">a string containing the text to be classified</param>
+ /// <returns>a <see cref="ClassificationResult{T}"/> holding the assigned class of type <typeparamref name="T"/> and its score</returns>
+ ClassificationResult<T> AssignClass(String text);
+
+ /// <summary>
+ /// Trains the classifier using the underlying Lucene index.
+ /// </summary>
+ /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+ /// <param name="textFieldName">the name of the field used to compare documents</param>
+ /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+ /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+ void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer);
+
+ /// <summary>
+ /// Trains the classifier using the underlying Lucene index, restricted to the documents selected by a query.
+ /// </summary>
+ /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+ /// <param name="textFieldName">the name of the field used to compare documents</param>
+ /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+ /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+ /// <param name="query">the query used to filter which documents are used for training</param>
+ void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query);
+
+ /// <summary>
+ /// Trains the classifier using the underlying Lucene index, comparing documents on several text fields
+ /// and restricted to the documents selected by a query.
+ /// </summary>
+ /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+ /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+ /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+ /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+ /// <param name="query">the query used to filter which documents are used for training</param>
+ void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer,
+ Query query);
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/KNearestNeighborClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/KNearestNeighborClassifier.cs b/src/Lucene.Net.Classification/KNearestNeighborClassifier.cs
new file mode 100644
index 0000000..e400254
--- /dev/null
+++ b/src/Lucene.Net.Classification/KNearestNeighborClassifier.cs
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+ using Lucene.Net.Analysis;
+ using Lucene.Net.Index;
+ using Lucene.Net.Queries.Mlt;
+ using Lucene.Net.Search;
+ using Lucene.Net.Util;
+ using System;
+ using System.Collections.Generic;
+ using System.IO;
+
+    /// <summary>
+    /// A k-Nearest Neighbor classifier (see <c>http://en.wikipedia.org/wiki/K-nearest_neighbors</c>) based
+    /// on <see cref="MoreLikeThis"/>.
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public class KNearestNeighborClassifier : IClassifier<BytesRef>
+    {
+        // State below is populated by Train(...); _mlt doubles as the "has been trained" marker.
+        private MoreLikeThis _mlt;
+        private String[] _textFieldNames;
+        private String _classFieldName;
+        private IndexSearcher _indexSearcher;
+        private readonly int _k;
+        private Query _query;
+
+        // Values <= 0 leave the corresponding MoreLikeThis setting untouched (see Train).
+        private readonly int _minDocsFreq;
+        private readonly int _minTermFreq;
+
+        /// <summary>Creates an <see cref="IClassifier{T}"/> using the kNN algorithm.</summary>
+        /// <param name="k">the number of neighbors to analyze</param>
+        public KNearestNeighborClassifier(int k)
+        {
+            _k = k;
+        }
+
+        /// <summary>Creates an <see cref="IClassifier{T}"/> using the kNN algorithm.</summary>
+        /// <param name="k">the number of neighbors to analyze</param>
+        /// <param name="minDocsFreq">the minimum document frequency, applied to <see cref="MoreLikeThis.MinDocFreq"/> when greater than zero</param>
+        /// <param name="minTermFreq">the minimum term frequency, applied to <see cref="MoreLikeThis.MinTermFreq"/> when greater than zero</param>
+        public KNearestNeighborClassifier(int k, int minDocsFreq, int minTermFreq)
+        {
+            _k = k;
+            _minDocsFreq = minDocsFreq;
+            _minTermFreq = minTermFreq;
+        }
+
+        /// <summary>
+        /// Assigns a class to the given text by searching for the <c>k</c> documents most similar to it
+        /// and letting them vote.
+        /// </summary>
+        /// <param name="text">the text to be classified</param>
+        /// <returns>the most frequent class among the retrieved neighbors, together with its score</returns>
+        /// <exception cref="IOException">if none of the <c>Train</c> overloads has been called yet</exception>
+        public ClassificationResult<BytesRef> AssignClass(String text)
+        {
+            if (_mlt == null)
+            {
+                throw new IOException("You must first call Classifier#train");
+            }
+
+            // One optional "more like this" clause per text field.
+            BooleanQuery mltQuery = new BooleanQuery();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                mltQuery.Add(new BooleanClause(_mlt.Like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
+            }
+
+            // Required clause: only consider documents that have some value in the class field.
+            Query classFieldQuery = new WildcardQuery(new Term(_classFieldName, "*"));
+            mltQuery.Add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
+
+            // Required clause: the caller-supplied training filter, when one was given to Train.
+            if (_query != null)
+            {
+                mltQuery.Add(_query, BooleanClause.Occur.MUST);
+            }
+
+            TopDocs topDocs = _indexSearcher.Search(mltQuery, _k);
+            return SelectClassFromNeighbors(topDocs);
+        }
+
+        /// <summary>
+        /// Picks the class that occurs most often among the given neighbors.
+        /// </summary>
+        /// <param name="topDocs">the nearest neighbors returned by the search</param>
+        /// <returns>
+        /// the winning class and its score, computed as the number of votes for that class divided by <c>k</c>;
+        /// an empty <see cref="BytesRef"/> with score 0 when there are no neighbors
+        /// </returns>
+        private ClassificationResult<BytesRef> SelectClassFromNeighbors(TopDocs topDocs)
+        {
+            // TODO : improve the nearest neighbor selection
+            Dictionary<BytesRef, int> classCounts = new Dictionary<BytesRef, int>();
+
+            foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
+            {
+                BytesRef cl = new BytesRef(_indexSearcher.Doc(scoreDoc.Doc).GetField(_classFieldName).StringValue);
+
+                // The Dictionary indexer throws KeyNotFoundException for an unseen key (unlike Java's
+                // Map.get, which returns null), so look the count up with TryGetValue: it leaves
+                // count at 0 when the class has not been seen yet.
+                int count;
+                classCounts.TryGetValue(cl, out count);
+                classCounts[cl] = count + 1;
+            }
+
+            double max = 0;
+            BytesRef assignedClass = new BytesRef();
+            foreach (KeyValuePair<BytesRef, int> entry in classCounts)
+            {
+                int count = entry.Value;
+                if (count > max)
+                {
+                    max = count;
+                    assignedClass = (BytesRef)entry.Key.Clone();
+                }
+            }
+
+            // Normalized by the requested k, not by the number of neighbors actually returned.
+            double score = max / (double)_k;
+            return new ClassificationResult<BytesRef>(assignedClass, score);
+        }
+
+        /// <summary>Trains the classifier on a single text field, without a filter query.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
+        {
+            Train(atomicReader, textFieldName, classFieldName, analyzer, null);
+        }
+
+        /// <summary>Trains the classifier on a single text field.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query every neighbor must match; may be <c>null</c> for no filter</param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
+        {
+            Train(atomicReader, new String[] { textFieldName }, classFieldName, analyzer, query);
+        }
+
+        /// <summary>
+        /// Trains the classifier on several text fields. This only records the configuration and builds the
+        /// <see cref="MoreLikeThis"/> and <see cref="IndexSearcher"/> instances used by <see cref="AssignClass"/>.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query every neighbor must match; may be <c>null</c> for no filter</param>
+        public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+        {
+            _textFieldNames = textFieldNames;
+            _classFieldName = classFieldName;
+            _mlt = new MoreLikeThis(atomicReader);
+            _mlt.Analyzer = analyzer;
+            _mlt.FieldNames = _textFieldNames;
+            _indexSearcher = new IndexSearcher(atomicReader);
+
+            // Only override the MoreLikeThis thresholds when the caller supplied positive values.
+            if (_minDocsFreq > 0)
+            {
+                _mlt.MinDocFreq = _minDocsFreq;
+            }
+            if (_minTermFreq > 0)
+            {
+                _mlt.MinTermFreq = _minTermFreq;
+            }
+            _query = query;
+        }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/KNearesteighborClassifier.cs b/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
deleted file mode 100644
index c83301e..0000000
--- a/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace Lucene.Net.Classification
-{
- using Lucene.Net.Analysis;
- using Lucene.Net.Index;
- using Lucene.Net.Queries.Mlt;
- using Lucene.Net.Search;
- using Lucene.Net.Util;
- using System;
- using System.Collections.Generic;
- using System.IO;
-
- /// <summary>
- /// A k-Nearest Neighbor classifier (see <code>http://en.wikipedia.org/wiki/K-nearest_neighbors</code>) based
- /// on {@link MoreLikeThis}
- ///
- /// @lucene.experimental
- /// </summary>
- public class KNearestNeighborClassifier : Classifier<BytesRef>
- {
-
- private MoreLikeThis _mlt;
- private String[] _textFieldNames;
- private String _classFieldName;
- private IndexSearcher _indexSearcher;
- private readonly int _k;
- private Query _query;
-
- private int _minDocsFreq;
- private int _minTermFreq;
-
- /// <summary>Create a {@link Classifier} using kNN algorithm</summary>
- /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
- public KNearestNeighborClassifier(int k)
- {
- this._k = k;
- }
-
- /// <summary>Create a {@link Classifier} using kNN algorithm</summary>
- /// <param name="k">the number of neighbors to analyze as an <code>int</code></param>
- /// <param name="minDocsFreq">the minimum number of docs frequency for MLT to be set with {@link MoreLikeThis#setMinDocFreq(int)}</param>
- /// <param name="minTermFreq">the minimum number of term frequency for MLT to be set with {@link MoreLikeThis#setMinTermFreq(int)}</param>
- public KNearestNeighborClassifier(int k, int minDocsFreq, int minTermFreq)
- {
- this._k = k;
- this._minDocsFreq = minDocsFreq;
- this._minTermFreq = minTermFreq;
- }
-
- public ClassificationResult<BytesRef> AssignClass(String text)
- {
- if (_mlt == null)
- {
- throw new IOException("You must first call Classifier#train");
- }
-
- BooleanQuery mltQuery = new BooleanQuery();
- foreach (String textFieldName in _textFieldNames)
- {
- mltQuery.Add(new BooleanClause(_mlt.Like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
- }
- Query classFieldQuery = new WildcardQuery(new Term(_classFieldName, "*"));
- mltQuery.Add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
- if (_query != null) {
- mltQuery.Add(_query, BooleanClause.Occur.MUST);
- }
- TopDocs topDocs = _indexSearcher.Search(mltQuery, _k);
- return SelectClassFromNeighbors(topDocs);
- }
-
- private ClassificationResult<BytesRef> SelectClassFromNeighbors(TopDocs topDocs)
- {
- // TODO : improve the nearest neighbor selection
- Dictionary<BytesRef, int> classCounts = new Dictionary<BytesRef, int>();
-
- foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
- {
- BytesRef cl = new BytesRef(_indexSearcher.Doc(scoreDoc.Doc).GetField(_classFieldName).StringValue);
- int count = classCounts[cl];
- if (classCounts.ContainsKey(cl))
- {
- classCounts[cl] = count + 1;
- }
- else
- {
- classCounts.Add(cl, 1);
- }
- }
- double max = 0;
- BytesRef assignedClass = new BytesRef();
- foreach (KeyValuePair<BytesRef, int> entry in classCounts)
- {
- int count = entry.Value;
- if (count > max)
- {
- max = count;
- assignedClass = (BytesRef)entry.Key.Clone();
- }
- }
- double score = max / (double) _k;
- return new ClassificationResult<BytesRef>(assignedClass, score);
- }
-
- public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
- {
- Train(atomicReader, textFieldName, classFieldName, analyzer, null);
- }
-
-
- public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
- {
- Train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
- }
-
- public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
- {
- this._textFieldNames = textFieldNames;
- this._classFieldName = classFieldName;
- _mlt = new MoreLikeThis(atomicReader);
- _mlt.Analyzer = analyzer;
- _mlt.FieldNames = _textFieldNames;
- _indexSearcher = new IndexSearcher(atomicReader);
- if (_minDocsFreq > 0)
- {
- _mlt.MinDocFreq = _minDocsFreq;
- }
- if (_minTermFreq > 0)
- {
- _mlt.MinTermFreq = _minTermFreq;
- }
- this._query = query;
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
index e0bf2e9..8d31ed5 100644
--- a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
@@ -40,8 +40,8 @@
</ItemGroup>
<ItemGroup>
<Compile Include="ClassificationResult.cs" />
- <Compile Include="Classifier.cs" />
- <Compile Include="KNearesteighborClassifier.cs" />
+ <Compile Include="IClassifier.cs" />
+ <Compile Include="KNearestNeighborClassifier.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="SimpleNaiveBayesClassifier.cs" />
</ItemGroup>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/583627a1/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
index 0980d58..a045c80 100644
--- a/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
+++ b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
@@ -31,7 +31,7 @@ namespace Lucene.Net.Classification
///
/// @lucene.experimental
/// </summary>
- public class SimpleNaiveBayesClassifier : Classifier<BytesRef>
+ public class SimpleNaiveBayesClassifier : IClassifier<BytesRef>
{
private AtomicReader _atomicReader;
private String[] _textFieldNames;
@@ -57,18 +57,18 @@ namespace Lucene.Net.Classification
public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
{
- this._atomicReader = atomicReader;
- this._indexSearcher = new IndexSearcher(this._atomicReader);
- this._textFieldNames = textFieldNames;
- this._classFieldName = classFieldName;
- this._analyzer = analyzer;
- this._query = query;
- this._docsWithClassSize = CountDocsWithClass();
+ _atomicReader = atomicReader;
+ _indexSearcher = new IndexSearcher(_atomicReader);
+ _textFieldNames = textFieldNames;
+ _classFieldName = classFieldName;
+ _analyzer = analyzer;
+ _query = query;
+ _docsWithClassSize = CountDocsWithClass();
}
private int CountDocsWithClass()
{
- int docCount = MultiFields.GetTerms(this._atomicReader, this._classFieldName).DocCount;
+ int docCount = MultiFields.GetTerms(_atomicReader, _classFieldName).DocCount;
if (docCount == -1)
{ // in case codec doesn't support getDocCount
TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
[3/3] lucenenet git commit: Bring Query.Equals() back on par
with the Java impl
Posted by sy...@apache.org.
Bring Query.Equals() back on par with the Java impl
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/c0c10195
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/c0c10195
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/c0c10195
Branch: refs/heads/master
Commit: c0c101953e14398a8e3f01f185a3ecfcecbc1609
Parents: 583627a
Author: Itamar Syn-Hershko <it...@code972.com>
Authored: Mon Dec 22 15:09:35 2014 +0200
Committer: Itamar Syn-Hershko <it...@code972.com>
Committed: Mon Dec 22 15:09:35 2014 +0200
----------------------------------------------------------------------
src/Lucene.Net.Core/Search/Query.cs | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/c0c10195/src/Lucene.Net.Core/Search/Query.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Search/Query.cs b/src/Lucene.Net.Core/Search/Query.cs
index d553af6..1448093 100644
--- a/src/Lucene.Net.Core/Search/Query.cs
+++ b/src/Lucene.Net.Core/Search/Query.cs
@@ -134,12 +134,17 @@ namespace Lucene.Net.Search
return true;
}
+ if (GetType() != obj.GetType())
+ {
+ return false;
+ }
+
var other = obj as Query;
if (other == null)
{
return false;
}
-
+
if (Number.FloatToIntBits(Boost) != Number.FloatToIntBits(other.Boost))
{
return false;