You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/12/24 23:43:55 UTC
[2/3] lucenenet git commit: add Util\DataSplitter and corresponding
tests
add Util\DataSplitter and corresponding tests
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/3134b63c
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/3134b63c
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/3134b63c
Branch: refs/heads/master
Commit: 3134b63c48366fd6f56b87dc984f0f97a33eb7fb
Parents: 78d60e3
Author: Laimonas Simutis <la...@gmail.com>
Authored: Tue Dec 23 21:16:07 2014 -0500
Committer: Laimonas Simutis <la...@gmail.com>
Committed: Tue Dec 23 21:16:07 2014 -0500
----------------------------------------------------------------------
.../Lucene.Net.Classification.csproj | 1 +
.../Utils/DatasetSplitter.cs | 150 +++++++++++++++++++
.../Lucene.Net.Tests.Classification.csproj | 2 +
.../Utils/DataSplitterTest.cs | 145 ++++++++++++++++++
4 files changed, 298 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3134b63c/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
index 8d31ed5..cbefa2c 100644
--- a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
@@ -44,6 +44,7 @@
<Compile Include="KNearestNeighborClassifier.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="SimpleNaiveBayesClassifier.cs" />
+ <Compile Include="Utils\DatasetSplitter.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Lucene.Net.Core\Lucene.Net.csproj">
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3134b63c/src/Lucene.Net.Classification/Utils/DatasetSplitter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Utils/DatasetSplitter.cs b/src/Lucene.Net.Classification/Utils/DatasetSplitter.cs
new file mode 100644
index 0000000..e5c64e9
--- /dev/null
+++ b/src/Lucene.Net.Classification/Utils/DatasetSplitter.cs
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Directory = Lucene.Net.Store.Directory;
+
+namespace Lucene.Net.Classification.Utils
+{
+ /// <summary>
+ /// Utility class for creating training / test / cross validation indexes from the original index.
+ /// </summary>
+ public class DatasetSplitter
+ {
+     private readonly double _crossValidationRatio;
+     private readonly double _testRatio;
+
+     /// <summary>
+     /// Create a <see cref="DatasetSplitter"/> by giving test and cross validation IDXs sizes.
+     /// </summary>
+     /// <param name="testRatio">the ratio of the original index to be used for the test IDX as a <c>double</c> between 0.0 and 1.0</param>
+     /// <param name="crossValidationRatio">the ratio of the original index to be used for the c.v. IDX as a <c>double</c> between 0.0 and 1.0</param>
+     public DatasetSplitter(double testRatio, double crossValidationRatio)
+     {
+         _crossValidationRatio = crossValidationRatio;
+         _testRatio = testRatio;
+     }
+
+     /// <summary>
+     /// Split a given index into 3 indexes for training, test and cross validation tasks respectively.
+     /// </summary>
+     /// <param name="originalIndex">an <see cref="AtomicReader"/> on the source index</param>
+     /// <param name="trainingIndex">a <see cref="Directory"/> used to write the training index</param>
+     /// <param name="testIndex">a <see cref="Directory"/> used to write the test index</param>
+     /// <param name="crossValidationIndex">a <see cref="Directory"/> used to write the cross validation index</param>
+     /// <param name="analyzer"><see cref="Analyzer"/> used to create the new docs</param>
+     /// <param name="fieldNames">names of fields that need to be put in the new indexes, or <c>null</c> / empty if all should be used</param>
+     /// <exception cref="IOException">if reading the original index or writing/committing any of the new indexes fails</exception>
+     public void Split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, params string[] fieldNames)
+     {
+         // create IWs for train / test / cv IDXs; "using" guarantees that every writer which was
+         // successfully opened gets disposed, even if a later constructor or the copy loop throws
+         using (IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Util.Version.LUCENE_CURRENT, analyzer)))
+         using (IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Util.Version.LUCENE_CURRENT, analyzer)))
+         using (IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Util.Version.LUCENE_CURRENT, analyzer)))
+         {
+             try
+             {
+                 int size = originalIndex.MaxDoc;
+
+                 // upper bounds (in documents) for the test and cross validation indexes
+                 double testLimit = size * _testRatio;
+                 double cvLimit = size * _crossValidationRatio;
+
+                 IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
+                 TopDocs topDocs = indexSearcher.Search(new MatchAllDocsQuery(), Int32.MaxValue);
+
+                 // set the type to be indexed, stored, with term vectors
+                 FieldType ft = new FieldType(TextField.TYPE_STORED);
+                 ft.StoreTermVectors = true;
+                 ft.StoreTermVectorOffsets = true;
+                 ft.StoreTermVectorPositions = true;
+
+                 int b = 0;
+
+                 // iterate over existing documents
+                 foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
+                 {
+                     // load the stored document once per hit instead of once per copied field
+                     Document source = originalIndex.Document(scoreDoc.Doc);
+
+                     // create a new document for indexing
+                     Document doc = new Document();
+                     if (fieldNames != null && fieldNames.Length > 0)
+                     {
+                         foreach (string fieldName in fieldNames)
+                         {
+                             // copy the field's value (StringValue), not its ToString() representation;
+                             // documents that do not carry the requested field simply omit it
+                             IndexableField sourceField = source.GetField(fieldName);
+                             string value = sourceField == null ? null : sourceField.StringValue;
+                             if (value != null)
+                             {
+                                 doc.Add(new Field(fieldName, value, ft));
+                             }
+                         }
+                     }
+                     else
+                     {
+                         foreach (IndexableField storableField in source.Fields)
+                         {
+                             if (storableField.ReaderValue != null)
+                             {
+                                 doc.Add(new Field(storableField.Name(), storableField.ReaderValue, ft));
+                             }
+                             else if (storableField.BinaryValue() != null)
+                             {
+                                 doc.Add(new Field(storableField.Name(), storableField.BinaryValue(), ft));
+                             }
+                             else if (storableField.StringValue != null)
+                             {
+                                 doc.Add(new Field(storableField.Name(), storableField.StringValue, ft));
+                             }
+                             else if (storableField.NumericValue != null)
+                             {
+                                 doc.Add(new Field(storableField.Name(), storableField.NumericValue.ToString(), ft));
+                             }
+                         }
+                     }
+
+                     // add it to one of the IDXs: every other document is offered to the test index
+                     // until it is full, then the cross validation index, the rest goes to training
+                     if (b % 2 == 0 && testWriter.MaxDoc < testLimit)
+                     {
+                         testWriter.AddDocument(doc);
+                     }
+                     else if (cvWriter.MaxDoc < cvLimit)
+                     {
+                         cvWriter.AddDocument(doc);
+                     }
+                     else
+                     {
+                         trainingWriter.AddDocument(doc);
+                     }
+                     b++;
+                 }
+
+                 // commit only once the copy has succeeded, so that a commit failure can neither
+                 // hide an earlier exception nor prevent the writers from being disposed
+                 testWriter.Commit();
+                 cvWriter.Commit();
+                 trainingWriter.Commit();
+             }
+             catch (Exception e)
+             {
+                 throw new IOException("Exception in DatasetSplitter", e);
+             }
+         }
+     }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3134b63c/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj b/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj
index 164c8f5..693aacd 100644
--- a/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj
+++ b/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj
@@ -50,6 +50,7 @@
<Compile Include="KNearestNeighborClassifierTest.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="SimpleNaiveBayesClassifierTest.cs" />
+ <Compile Include="Utils\DataSplitterTest.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Lucene.Net.Classification\Lucene.Net.Classification.csproj">
@@ -64,6 +65,7 @@
<ItemGroup>
<None Include="app.config" />
</ItemGroup>
+ <ItemGroup />
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3134b63c/src/Lucene.Net.Tests.Classification/Utils/DataSplitterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Classification/Utils/DataSplitterTest.cs b/src/Lucene.Net.Tests.Classification/Utils/DataSplitterTest.cs
new file mode 100644
index 0000000..637f12d
--- /dev/null
+++ b/src/Lucene.Net.Tests.Classification/Utils/DataSplitterTest.cs
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Classification.Utils;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Store;
+using NUnit.Framework;
+
+namespace Lucene.Net.Classification
+{
+ /// <summary>
+ /// Testcase for <see cref="DatasetSplitter"/>
+ /// </summary>
+ public class DataSplitterTest : Util.LuceneTestCase
+ {
+     private AtomicReader _originalIndex;
+     private RandomIndexWriter _indexWriter;
+     private Directory _dir;
+
+     private const string _textFieldName = "text";
+     private const string _classFieldName = "class";
+     private const string _idFieldName = "id";
+
+     /// <summary>
+     /// Builds a 100 document source index (id / text / class fields, stored with term vectors)
+     /// and opens an atomic reader on it for the tests to split.
+     /// </summary>
+     [SetUp]
+     public override void SetUp()
+     {
+         base.SetUp();
+         _dir = NewDirectory();
+         _indexWriter = new RandomIndexWriter(Random(), _dir, new MockAnalyzer(Random()));
+
+         FieldType ft = new FieldType(TextField.TYPE_STORED);
+         ft.StoreTermVectors = true;
+         ft.StoreTermVectorOffsets = true;
+         ft.StoreTermVectorPositions = true;
+
+         Analyzer analyzer = new MockAnalyzer(Random());
+
+         for (int i = 0; i < 100; i++)
+         {
+             // .NET member names are PascalCase: ToString / Append (not the Java toString / append)
+             Document doc = new Document();
+             doc.Add(new Field(_idFieldName, Random().ToString(), ft));
+             doc.Add(new Field(_textFieldName, new StringBuilder(Random().ToString()).Append(Random().ToString()).Append(
+                 Random().ToString()).ToString(), ft));
+             doc.Add(new Field(_classFieldName, Random().ToString(), ft));
+             _indexWriter.AddDocument(doc, analyzer);
+         }
+
+         _indexWriter.Commit();
+
+         _originalIndex = SlowCompositeReaderWrapper.Wrap(_indexWriter.Reader);
+     }
+
+     /// <summary>
+     /// Releases the reader, the writer and the directory created in <see cref="SetUp"/>.
+     /// </summary>
+     [TearDown]
+     public override void TearDown()
+     {
+         _originalIndex.Dispose();
+         _indexWriter.Dispose();
+         _dir.Dispose();
+         base.TearDown();
+     }
+
+     [Test]
+     public void TestSplitOnAllFields()
+     {
+         AssertSplit(_originalIndex, 0.1, 0.1);
+     }
+
+     [Test]
+     public void TestSplitOnSomeFields()
+     {
+         AssertSplit(_originalIndex, 0.2, 0.35, _idFieldName, _textFieldName);
+     }
+
+     /// <summary>
+     /// Splits <paramref name="originalIndex"/> with the given ratios and asserts that the
+     /// training, test and cross validation indexes contain the expected number of documents.
+     /// </summary>
+     /// <param name="originalIndex">reader on the index to split</param>
+     /// <param name="testRatio">ratio of documents expected in the test index</param>
+     /// <param name="crossValidationRatio">ratio of documents expected in the cross validation index</param>
+     /// <param name="fieldNames">fields to copy, or none to copy all fields</param>
+     public static void AssertSplit(AtomicReader originalIndex, double testRatio, double crossValidationRatio, params string[] fieldNames)
+     {
+         BaseDirectoryWrapper trainingIndex = NewDirectory();
+         BaseDirectoryWrapper testIndex = NewDirectory();
+         BaseDirectoryWrapper crossValidationIndex = NewDirectory();
+
+         DirectoryReader trainingReader = null;
+         DirectoryReader testReader = null;
+         DirectoryReader cvReader = null;
+
+         try
+         {
+             DatasetSplitter datasetSplitter = new DatasetSplitter(testRatio, crossValidationRatio);
+             datasetSplitter.Split(originalIndex, trainingIndex, testIndex, crossValidationIndex, new MockAnalyzer(Random()), fieldNames);
+
+             NotNull(trainingIndex);
+             NotNull(testIndex);
+             NotNull(crossValidationIndex);
+
+             trainingReader = DirectoryReader.Open(trainingIndex);
+             True((int) (originalIndex.MaxDoc * (1d - testRatio - crossValidationRatio)) == trainingReader.MaxDoc);
+             testReader = DirectoryReader.Open(testIndex);
+             True((int) (originalIndex.MaxDoc * testRatio) == testReader.MaxDoc);
+             cvReader = DirectoryReader.Open(crossValidationIndex);
+             True((int) (originalIndex.MaxDoc * crossValidationRatio) == cvReader.MaxDoc);
+         }
+         finally
+         {
+             // close the readers exactly once, and also when an assertion above failed,
+             // before the directories they read from are disposed
+             CloseQuietly(trainingReader);
+             CloseQuietly(testReader);
+             CloseQuietly(cvReader);
+
+             trainingIndex.Dispose();
+             testIndex.Dispose();
+             crossValidationIndex.Dispose();
+         }
+     }
+
+     /// <summary>
+     /// Disposes <paramref name="reader"/> if it is not <c>null</c>, ignoring any exception.
+     /// </summary>
+     private static void CloseQuietly(IndexReader reader)
+     {
+         try
+         {
+             if (reader != null)
+             {
+                 reader.Dispose();
+             }
+         }
+         catch (Exception)
+         {
+             // deliberately ignored: best-effort cleanup must not mask the test outcome
+         }
+     }
+ }
+
+}
+
+}
\ No newline at end of file