You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/12/24 23:43:55 UTC
[2/3] lucenenet git commit: add Utils\DatasetSplitter and corresponding tests

add Utils\DatasetSplitter and corresponding tests


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/3134b63c
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/3134b63c
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/3134b63c

Branch: refs/heads/master
Commit: 3134b63c48366fd6f56b87dc984f0f97a33eb7fb
Parents: 78d60e3
Author: Laimonas Simutis <la...@gmail.com>
Authored: Tue Dec 23 21:16:07 2014 -0500
Committer: Laimonas Simutis <la...@gmail.com>
Committed: Tue Dec 23 21:16:07 2014 -0500

----------------------------------------------------------------------
 .../Lucene.Net.Classification.csproj            |   1 +
 .../Utils/DatasetSplitter.cs                    | 150 +++++++++++++++++++
 .../Lucene.Net.Tests.Classification.csproj      |   2 +
 .../Utils/DataSplitterTest.cs                   | 145 ++++++++++++++++++
 4 files changed, 298 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3134b63c/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
index 8d31ed5..cbefa2c 100644
--- a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
@@ -44,6 +44,7 @@
     <Compile Include="KNearestNeighborClassifier.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
     <Compile Include="SimpleNaiveBayesClassifier.cs" />
+    <Compile Include="Utils\DatasetSplitter.cs" />
   </ItemGroup>
   <ItemGroup>
     <ProjectReference Include="..\Lucene.Net.Core\Lucene.Net.csproj">

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3134b63c/src/Lucene.Net.Classification/Utils/DatasetSplitter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Utils/DatasetSplitter.cs b/src/Lucene.Net.Classification/Utils/DatasetSplitter.cs
new file mode 100644
index 0000000..e5c64e9
--- /dev/null
+++ b/src/Lucene.Net.Classification/Utils/DatasetSplitter.cs
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Directory = Lucene.Net.Store.Directory;
+
+namespace Lucene.Net.Classification.Utils
+{
+    /// <summary>
+    /// Utility class for creating training / test / cross validation indexes from the original index.
+    /// </summary>
+    public class DatasetSplitter
+    {
+        private readonly double _crossValidationRatio;
+        private readonly double _testRatio;
+
+        /// <summary>
+        /// Create a <see cref="DatasetSplitter"/> by giving test and cross validation index sizes.
+        /// </summary>
+        /// <param name="testRatio">the ratio of the original index to be used for the test index as a <c>double</c> between 0.0 and 1.0</param>
+        /// <param name="crossValidationRatio">the ratio of the original index to be used for the cross validation index as a <c>double</c> between 0.0 and 1.0</param>
+        public DatasetSplitter(double testRatio, double crossValidationRatio)
+        {
+            this._crossValidationRatio = crossValidationRatio;
+            this._testRatio = testRatio;
+        }
+
+        /// <summary>
+        /// Split a given index into 3 indexes for training, test and cross validation tasks respectively.
+        /// </summary>
+        /// <remarks>
+        /// Documents are visited in the order returned by a match-all search. Every other document goes to the
+        /// test index until it holds <c>testRatio</c> of the source documents; the remaining ones fill the cross
+        /// validation index up to <c>crossValidationRatio</c>, and whatever is left goes to the training index.
+        /// </remarks>
+        /// <param name="originalIndex">an <see cref="AtomicReader"/> on the source index</param>
+        /// <param name="trainingIndex">a <see cref="Directory"/> used to write the training index</param>
+        /// <param name="testIndex">a <see cref="Directory"/> used to write the test index</param>
+        /// <param name="crossValidationIndex">a <see cref="Directory"/> used to write the cross validation index</param>
+        /// <param name="analyzer"><see cref="Analyzer"/> used to create the new docs</param>
+        /// <param name="fieldNames">names of fields that need to be put in the new indexes or <c>null</c> if all should be used</param>
+        /// <exception cref="IOException">if reading the source index or adding a document to any of the new indexes fails</exception>
+        public void Split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, params string[] fieldNames)
+        {
+            // create IWs for train / test / cv IDXs; using guarantees every opened writer is disposed, even if a later open, a commit or another dispose throws
+            using (IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Util.Version.LUCENE_CURRENT, analyzer)))
+            using (IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Util.Version.LUCENE_CURRENT, analyzer)))
+            using (IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Util.Version.LUCENE_CURRENT, analyzer)))
+            {
+                try
+                {
+                    int size = originalIndex.MaxDoc;
+                    IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
+                    TopDocs topDocs = indexSearcher.Search(new MatchAllDocsQuery(), Int32.MaxValue);
+
+                    // set the type to be indexed, stored, with term vectors
+                    FieldType ft = new FieldType(TextField.TYPE_STORED);
+                    ft.StoreTermVectors = true;
+                    ft.StoreTermVectorOffsets = true;
+                    ft.StoreTermVectorPositions = true;
+
+                    int b = 0;
+                    // iterate over existing documents
+                    foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
+                    {
+                        // load the stored document once per hit instead of once per copied field
+                        Document sourceDoc = originalIndex.Document(scoreDoc.Doc);
+                        // create a new document for indexing
+                        Document doc = new Document();
+                        if (fieldNames != null && fieldNames.Length > 0)
+                        {
+                            foreach (String fieldName in fieldNames)
+                            {
+                                // copy the field's string value (the previous ToString() call does not promise to return it)
+                                doc.Add(new Field(fieldName, sourceDoc.GetField(fieldName).StringValue, ft));
+                            }
+                        }
+                        else
+                        {
+                            foreach (IndexableField storableField in sourceDoc.Fields)
+                            {
+                                if (storableField.ReaderValue != null)
+                                {
+                                    doc.Add(new Field(storableField.Name(), storableField.ReaderValue, ft));
+                                }
+                                else if (storableField.BinaryValue() != null)
+                                {
+                                    doc.Add(new Field(storableField.Name(), storableField.BinaryValue(), ft));
+                                }
+                                else if (storableField.StringValue != null)
+                                {
+                                    doc.Add(new Field(storableField.Name(), storableField.StringValue, ft));
+                                }
+                                else if (storableField.NumericValue != null)
+                                {
+                                    doc.Add(new Field(storableField.Name(), storableField.NumericValue.ToString(), ft));
+                                }
+                            }
+                        }
+
+                        // add it to one of the IDXs
+                        if (b % 2 == 0 && testWriter.MaxDoc < size * _testRatio)
+                        {
+                            testWriter.AddDocument(doc);
+                        }
+                        else if (cvWriter.MaxDoc < size * _crossValidationRatio)
+                        {
+                            cvWriter.AddDocument(doc);
+                        }
+                        else
+                        {
+                            trainingWriter.AddDocument(doc);
+                        }
+                        b++;
+                    }
+                }
+                catch (Exception e)
+                {
+                    throw new IOException("Exception in DatasetSplitter", e);
+                }
+                finally
+                {
+                    // as before, commit whatever was added even when the copy loop failed
+                    testWriter.Commit();
+                    cvWriter.Commit();
+                    trainingWriter.Commit();
+                }
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3134b63c/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj b/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj
index 164c8f5..693aacd 100644
--- a/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj
+++ b/src/Lucene.Net.Tests.Classification/Lucene.Net.Tests.Classification.csproj
@@ -50,6 +50,7 @@
     <Compile Include="KNearestNeighborClassifierTest.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
     <Compile Include="SimpleNaiveBayesClassifierTest.cs" />
+    <Compile Include="Utils\DataSplitterTest.cs" />
   </ItemGroup>
   <ItemGroup>
     <ProjectReference Include="..\Lucene.Net.Classification\Lucene.Net.Classification.csproj">
@@ -64,6 +65,7 @@
   <ItemGroup>
     <None Include="app.config" />
   </ItemGroup>
+  <ItemGroup />
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
   <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
        Other similar extension points exist, see Microsoft.Common.targets.

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3134b63c/src/Lucene.Net.Tests.Classification/Utils/DataSplitterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Classification/Utils/DataSplitterTest.cs b/src/Lucene.Net.Tests.Classification/Utils/DataSplitterTest.cs
new file mode 100644
index 0000000..637f12d
--- /dev/null
+++ b/src/Lucene.Net.Tests.Classification/Utils/DataSplitterTest.cs
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Classification.Utils;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Store;
+using NUnit.Framework;
+
+namespace Lucene.Net.Classification
+{
+    /// <summary>
+    /// Testcase for <see cref="DatasetSplitter"/>
+    /// </summary>
+    public class DataSplitterTest : Util.LuceneTestCase
+    {
+        private AtomicReader _originalIndex;
+        private RandomIndexWriter _indexWriter;
+        private Directory _dir;
+
+        private String _textFieldName = "text";
+        private String _classFieldName = "class";
+        private String _idFieldName = "id";
+
+        /// <summary>
+        /// Builds the source index: 100 documents, each with stored id / text / class fields that keep
+        /// term vectors, and opens an <see cref="AtomicReader"/> over it for the tests to split.
+        /// </summary>
+        [SetUp]
+        public override void SetUp()
+        {
+            base.SetUp();
+            _dir = NewDirectory();
+            _indexWriter = new RandomIndexWriter(Random(), _dir, new MockAnalyzer(Random()));
+
+            FieldType ft = new FieldType(TextField.TYPE_STORED);
+            ft.StoreTermVectors = true;
+            ft.StoreTermVectorOffsets = true;
+            ft.StoreTermVectorPositions = true;
+
+            Analyzer analyzer = new MockAnalyzer(Random());
+            Document doc;
+            for (int i = 0; i < 100; i++)
+            {
+                doc = new Document();
+                doc.Add(new Field(_idFieldName, Random().ToString(), ft));
+                doc.Add(new Field(_textFieldName, new StringBuilder(Random().ToString()).Append(Random().ToString()).Append(Random().ToString()).ToString(), ft));
+                doc.Add(new Field(_classFieldName, Random().ToString(), ft));
+                _indexWriter.AddDocument(doc, analyzer);
+            }
+
+            _indexWriter.Commit();
+
+            _originalIndex = SlowCompositeReaderWrapper.Wrap(_indexWriter.Reader);
+        }
+
+        /// <summary>
+        /// Releases the reader, the writer and the directory created by <see cref="SetUp"/>.
+        /// </summary>
+        [TearDown]
+        public override void TearDown()
+        {
+            _originalIndex.Dispose();
+            _indexWriter.Dispose();
+            _dir.Dispose();
+            base.TearDown();
+        }
+
+        /// <summary>Splits the source index without naming fields, so every stored field is copied.</summary>
+        [Test]
+        public void TestSplitOnAllFields()
+        {
+            AssertSplit(_originalIndex, 0.1, 0.1);
+        }
+
+        /// <summary>Splits the source index while copying only the id and text fields.</summary>
+        [Test]
+        public void TestSplitOnSomeFields()
+        {
+            AssertSplit(_originalIndex, 0.2, 0.35, _idFieldName, _textFieldName);
+        }
+
+        /// <summary>
+        /// Splits <paramref name="originalIndex"/> into three new directories and checks the size of each resulting index.
+        /// </summary>
+        /// <param name="originalIndex">reader on the index to split</param>
+        /// <param name="testRatio">share of the documents expected in the test index</param>
+        /// <param name="crossValidationRatio">share of the documents expected in the cross validation index</param>
+        /// <param name="fieldNames">fields to copy; when empty the splitter copies all stored fields</param>
+        public static void AssertSplit(AtomicReader originalIndex, double testRatio, double crossValidationRatio, params string[] fieldNames)
+        {
+            BaseDirectoryWrapper trainingIndex = NewDirectory();
+            BaseDirectoryWrapper testIndex = NewDirectory();
+            BaseDirectoryWrapper crossValidationIndex = NewDirectory();
+
+            try
+            {
+                DatasetSplitter datasetSplitter = new DatasetSplitter(testRatio, crossValidationRatio);
+                datasetSplitter.Split(originalIndex, trainingIndex, testIndex, crossValidationIndex, new MockAnalyzer(Random()), fieldNames);
+
+                NotNull(trainingIndex);
+                NotNull(testIndex);
+                NotNull(crossValidationIndex);
+
+                // each reader lives in a using block so it is disposed exactly once, even when an assertion fails
+                using (DirectoryReader trainingReader = DirectoryReader.Open(trainingIndex))
+                {
+                    True((int) (originalIndex.MaxDoc * (1d - testRatio - crossValidationRatio)) == trainingReader.MaxDoc);
+                }
+                using (DirectoryReader testReader = DirectoryReader.Open(testIndex))
+                {
+                    True((int) (originalIndex.MaxDoc * testRatio) == testReader.MaxDoc);
+                }
+                using (DirectoryReader cvReader = DirectoryReader.Open(crossValidationIndex))
+                {
+                    True((int) (originalIndex.MaxDoc * crossValidationRatio) == cvReader.MaxDoc);
+                }
+            }
+            finally
+            {
+                trainingIndex.Dispose();
+                testIndex.Dispose();
+                crossValidationIndex.Dispose();
+            }
+        }
+    }
+
+}
\ No newline at end of file