You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/14 12:59:21 UTC

[06/26] lucenenet git commit: first commit of facet porting, failing tests will be fixed in next commits.

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/src/Lucene.Net.Tests/core/Facet/TestMultipleIndexFields.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests/core/Facet/TestMultipleIndexFields.cs b/src/Lucene.Net.Tests/core/Facet/TestMultipleIndexFields.cs
new file mode 100644
index 0000000..490836b
--- /dev/null
+++ b/src/Lucene.Net.Tests/core/Facet/TestMultipleIndexFields.cs
@@ -0,0 +1,300 @@
+using System.Collections.Generic;
+using NUnit.Framework;
+
+namespace Lucene.Net.Facet
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+
+    using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer;
+    using MockTokenizer = Lucene.Net.Analysis.MockTokenizer;
+    using Document = Lucene.Net.Documents.Document;
+    using Field = Lucene.Net.Documents.Field;
+    using TextField = Lucene.Net.Documents.TextField;
+    using TaxonomyReader = Lucene.Net.Facet.Taxonomy.TaxonomyReader;
+    using TaxonomyWriter = Lucene.Net.Facet.Taxonomy.TaxonomyWriter;
+    using DirectoryTaxonomyReader = Lucene.Net.Facet.Taxonomy.Directory.DirectoryTaxonomyReader;
+    using DirectoryTaxonomyWriter = Lucene.Net.Facet.Taxonomy.Directory.DirectoryTaxonomyWriter;
+    using AtomicReader = Lucene.Net.Index.AtomicReader;
+    using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
+    using IndexReader = Lucene.Net.Index.IndexReader;
+    using OpenMode = Lucene.Net.Index.IndexWriterConfig.OpenMode_e;
+    using RandomIndexWriter = Lucene.Net.Index.RandomIndexWriter;
+    using IndexSearcher = Lucene.Net.Search.IndexSearcher;
+    using MatchAllDocsQuery = Lucene.Net.Search.MatchAllDocsQuery;
+    using Directory = Lucene.Net.Store.Directory;
+    using IOUtils = Lucene.Net.Util.IOUtils;
+
+    public class TestMultipleIndexFields : FacetTestCase
+    {
+
+        /// <summary>
+        /// Facet fields used to populate the test index (see <c>seedIndex</c>, which adds
+        /// one document per entry): three "Author" values, five two-level "Band" paths
+        /// and one "Composer" value.
+        /// </summary>
+        private static readonly FacetField[] CATEGORIES =
+        {
+            new FacetField("Author", "Mark Twain"),
+            new FacetField("Author", "Stephen King"),
+            new FacetField("Author", "Kurt Vonnegut"),
+            new FacetField("Band", "Rock & Pop", "The Beatles"),
+            new FacetField("Band", "Punk", "The Ramones"),
+            new FacetField("Band", "Rock & Pop", "U2"),
+            new FacetField("Band", "Rock & Pop", "REM"),
+            new FacetField("Band", "Rock & Pop", "Dave Matthews Band"),
+            new FacetField("Composer", "Bach")
+        };
+
+        /// <summary>
+        /// Gets a newly created <see cref="FacetsConfig"/> in which the "Band" dimension
+        /// is marked hierarchical. Each read of the property allocates a new instance,
+        /// so callers can customize the result without affecting other tests.
+        /// </summary>
+        private FacetsConfig Config
+        {
+            get
+            {
+                var facetsConfig = new FacetsConfig();
+                facetsConfig.SetHierarchical("Band", true);
+                return facetsConfig;
+            }
+        }
+
+        /// <summary>
+        /// Indexes the categories with the configuration from <c>Config</c> (no custom
+        /// index field names), then checks the facet counts and that binary doc values
+        /// exist under the "$facets" field.
+        /// </summary>
+        [Test]
+        public virtual void TestDefault()
+        {
+            Directory indexDir = NewDirectory();
+            Directory taxoDir = NewDirectory();
+
+            // Writers for the search index and for the taxonomy.
+            RandomIndexWriter indexWriter = new RandomIndexWriter(
+                Random(),
+                indexDir,
+                NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)));
+            DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
+            FacetsConfig facetsConfig = Config;
+
+            seedIndex(taxoWriter, indexWriter, facetsConfig);
+
+            // Obtain the index reader first, then commit the taxonomy.
+            IndexReader indexReader = indexWriter.Reader;
+            taxoWriter.Commit();
+
+            // Taxonomy reader and searcher used for the facet search.
+            DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
+            IndexSearcher searcher = NewSearcher(indexReader);
+
+            FacetsCollector collector = performSearch(taxoReader, indexReader, searcher);
+
+            // Compute the facet counts and compare them with the hand-written expectations.
+            Facets facets = GetTaxonomyFacetCounts(taxoReader, facetsConfig, collector);
+            AssertCorrectResults(facets);
+
+            assertOrdinalsExist("$facets", indexReader);
+
+            IOUtils.Close(taxoReader, indexReader, indexWriter, taxoWriter, indexDir, taxoDir);
+        }
+
+        /// <summary>
+        /// Routes the "Author" dimension to the custom index field "$author" and checks
+        /// that counts combined through <see cref="MultiFacets"/> are correct and that
+        /// binary doc values exist under both "$facets" and "$author".
+        /// </summary>
+        [Test]
+        public virtual void TestCustom()
+        {
+            Directory indexDir = NewDirectory();
+            Directory taxoDir = NewDirectory();
+
+            // Writers for the search index and for the taxonomy.
+            var indexWriter = new RandomIndexWriter(
+                Random(),
+                indexDir,
+                NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)));
+            TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
+
+            var facetsConfig = Config;
+            facetsConfig.SetIndexFieldName("Author", "$author");
+            seedIndex(taxoWriter, indexWriter, facetsConfig);
+
+            // Obtain the index reader first, then commit the taxonomy.
+            IndexReader indexReader = indexWriter.Reader;
+            taxoWriter.Commit();
+
+            // Taxonomy reader and searcher used for the facet search.
+            DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
+            IndexSearcher searcher = NewSearcher(indexReader);
+
+            FacetsCollector collector = performSearch(taxoReader, indexReader, searcher);
+
+            // "Author" gets a Facets built from "$author"; the Facets built without an
+            // explicit field name is handed to MultiFacets as its second argument.
+            IDictionary<string, Facets> facetsByDim = new Dictionary<string, Facets>
+            {
+                { "Author", GetTaxonomyFacetCounts(taxoReader, facetsConfig, collector, "$author") }
+            };
+            Facets defaultFacets = GetTaxonomyFacetCounts(taxoReader, facetsConfig, collector);
+            Facets facets = new MultiFacets(facetsByDim, defaultFacets);
+
+            // Compare the combined facets with the hand-written expectations.
+            AssertCorrectResults(facets);
+
+            assertOrdinalsExist("$facets", indexReader);
+            assertOrdinalsExist("$author", indexReader);
+
+            IOUtils.Close(taxoReader, indexReader, indexWriter, taxoWriter, indexDir, taxoDir);
+        }
+
+        /// <summary>
+        /// Routes both the "Band" and the "Composer" dimension to the same custom index
+        /// field "$music" and checks the combined counts as well as the presence of
+        /// binary doc values under "$facets" and "$music".
+        /// </summary>
+        [Test]
+        public virtual void TestTwoCustomsSameField()
+        {
+            Directory indexDir = NewDirectory();
+            Directory taxoDir = NewDirectory();
+
+            // create and open an index writer
+            RandomIndexWriter iw = new RandomIndexWriter(Random(), indexDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)));
+            // create and open a taxonomy writer
+            var tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
+
+            FacetsConfig config = Config;
+            config.SetIndexFieldName("Band", "$music");
+            config.SetIndexFieldName("Composer", "$music");
+            seedIndex(tw, iw, config);
+
+            IndexReader ir = iw.Reader;
+            tw.Commit();
+
+            // prepare index reader and taxonomy.
+            var tr = new DirectoryTaxonomyReader(taxoDir);
+
+            // prepare searcher to search against
+            IndexSearcher searcher = NewSearcher(ir);
+
+            FacetsCollector sfc = performSearch(tr, ir, searcher);
+
+            // Both redirected dimensions share one Facets instance built from "$music".
+            IDictionary<string, Facets> facetsMap = new Dictionary<string, Facets>();
+            Facets facets2 = GetTaxonomyFacetCounts(tr, config, sfc, "$music");
+            facetsMap["Band"] = facets2;
+            facetsMap["Composer"] = facets2;
+            Facets facets = new MultiFacets(facetsMap, GetTaxonomyFacetCounts(tr, config, sfc));
+
+            // Obtain facets results and hand-test them
+            AssertCorrectResults(facets);
+
+            // "Band" and "Composer" are stored in the same "$music" field, so a single
+            // check covers both dimensions (the check used to be repeated verbatim).
+            assertOrdinalsExist("$facets", ir);
+            assertOrdinalsExist("$music", ir);
+
+            IOUtils.Close(tr, ir, iw, tw, indexDir, taxoDir);
+        }
+
+        /// <summary>
+        /// Fails the test unless at least one leaf reader of <paramref name="ir"/> returns
+        /// non-null binary doc values for <paramref name="field"/>.
+        /// </summary>
+        /// <param name="field">Name of the doc-values field to look up.</param>
+        /// <param name="ir">Reader whose leaves are inspected.</param>
+        private void assertOrdinalsExist(string field, IndexReader ir)
+        {
+            bool found = false;
+            foreach (AtomicReaderContext leaf in ir.Leaves)
+            {
+                // One segment having the field is enough; not every segment must have it.
+                if (leaf.AtomicReader.GetBinaryDocValues(field) != null)
+                {
+                    found = true;
+                    break;
+                }
+            }
+
+            if (!found)
+            {
+                Fail("no ordinals found for " + field);
+            }
+        }
+
+        /// <summary>
+        /// Routes "Band" to the index field "$bands" and "Composer" to "$composers", then
+        /// checks the combined counts and that binary doc values exist under "$facets",
+        /// "$bands" and "$composers".
+        /// </summary>
+        [Test]
+        public virtual void TestDifferentFieldsAndText()
+        {
+            Directory indexDir = NewDirectory();
+            Directory taxoDir = NewDirectory();
+
+            // Writers for the search index and for the taxonomy.
+            RandomIndexWriter indexWriter = new RandomIndexWriter(
+                Random(),
+                indexDir,
+                NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)));
+            DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
+
+            var facetsConfig = Config;
+            facetsConfig.SetIndexFieldName("Band", "$bands");
+            facetsConfig.SetIndexFieldName("Composer", "$composers");
+            seedIndex(taxoWriter, indexWriter, facetsConfig);
+
+            // Obtain the index reader first, then commit the taxonomy.
+            IndexReader indexReader = indexWriter.Reader;
+            taxoWriter.Commit();
+
+            // Taxonomy reader and searcher used for the facet search.
+            DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
+            IndexSearcher searcher = NewSearcher(indexReader);
+
+            FacetsCollector collector = performSearch(taxoReader, indexReader, searcher);
+
+            // One Facets per redirected dimension; the Facets built without an explicit
+            // field name is handed to MultiFacets as its second argument.
+            IDictionary<string, Facets> facetsByDim = new Dictionary<string, Facets>
+            {
+                { "Band", GetTaxonomyFacetCounts(taxoReader, facetsConfig, collector, "$bands") },
+                { "Composer", GetTaxonomyFacetCounts(taxoReader, facetsConfig, collector, "$composers") }
+            };
+            Facets defaultFacets = GetTaxonomyFacetCounts(taxoReader, facetsConfig, collector);
+            Facets facets = new MultiFacets(facetsByDim, defaultFacets);
+
+            // Compare the combined facets with the hand-written expectations.
+            AssertCorrectResults(facets);
+            assertOrdinalsExist("$facets", indexReader);
+            assertOrdinalsExist("$bands", indexReader);
+            assertOrdinalsExist("$composers", indexReader);
+
+            IOUtils.Close(taxoReader, indexReader, indexWriter, taxoWriter, indexDir, taxoDir);
+        }
+
+        /// <summary>
+        /// Routes "Band" and "Composer" to the shared index field "$music" and "Author"
+        /// to "$literature", then checks the combined counts and that binary doc values
+        /// exist under "$music" and "$literature".
+        /// </summary>
+        [Test]
+        public virtual void TestSomeSameSomeDifferent()
+        {
+            Directory indexDir = NewDirectory();
+            Directory taxoDir = NewDirectory();
+
+            // create and open an index writer
+            RandomIndexWriter iw = new RandomIndexWriter(Random(), indexDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)));
+            // create and open a taxonomy writer
+            TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
+
+            FacetsConfig config = Config;
+            config.SetIndexFieldName("Band", "$music");
+            config.SetIndexFieldName("Composer", "$music");
+            config.SetIndexFieldName("Author", "$literature");
+            seedIndex(tw, iw, config);
+
+            IndexReader ir = iw.Reader;
+            tw.Commit();
+
+            // prepare index reader and taxonomy.
+            var tr = new DirectoryTaxonomyReader(taxoDir);
+
+            // prepare searcher to search against
+            IndexSearcher searcher = NewSearcher(ir);
+
+            FacetsCollector sfc = performSearch(tr, ir, searcher);
+
+            // "Band" and "Composer" share one Facets instance built from "$music";
+            // "Author" gets its own instance built from "$literature".
+            IDictionary<string, Facets> facetsMap = new Dictionary<string, Facets>();
+            Facets facets2 = GetTaxonomyFacetCounts(tr, config, sfc, "$music");
+            facetsMap["Band"] = facets2;
+            facetsMap["Composer"] = facets2;
+            facetsMap["Author"] = GetTaxonomyFacetCounts(tr, config, sfc, "$literature");
+            Facets facets = new MultiFacets(facetsMap, GetTaxonomyFacetCounts(tr, config, sfc));
+
+            // Obtain facets results and hand-test them
+            AssertCorrectResults(facets);
+            assertOrdinalsExist("$music", ir);
+            assertOrdinalsExist("$literature", ir);
+
+            // Close everything in one call, as the other tests in this class do. With two
+            // separate calls, an exception from the first would skip closing the directories.
+            IOUtils.Close(tr, ir, iw, tw, indexDir, taxoDir);
+        }
+
+        
+        /// <summary>
+        /// Asserts that <paramref name="facets"/> reports the counts expected for the
+        /// categories in <c>CATEGORIES</c>: the "Band" total, the top children of "Band",
+        /// the top children of "Band/Rock &amp; Pop", and the top children of "Author".
+        /// </summary>
+        /// <param name="facets">Facets instance under test.</param>
+        private void AssertCorrectResults(Facets facets)
+        {
+            const string expectedBands = "dim=Band path=[] value=5 childCount=2\n  Rock & Pop (4)\n  Punk (1)\n";
+            const string expectedRockAndPop = "dim=Band path=[Rock & Pop] value=4 childCount=4\n  The Beatles (1)\n  U2 (1)\n  REM (1)\n  Dave Matthews Band (1)\n";
+            const string expectedAuthors = "dim=Author path=[] value=3 childCount=3\n  Mark Twain (1)\n  Stephen King (1)\n  Kurt Vonnegut (1)\n";
+
+            Assert.AreEqual(5, facets.GetSpecificValue("Band"));
+            Assert.AreEqual(expectedBands, facets.GetTopChildren(10, "Band").ToString());
+            Assert.AreEqual(expectedRockAndPop, facets.GetTopChildren(10, "Band", "Rock & Pop").ToString());
+            Assert.AreEqual(expectedAuthors, facets.GetTopChildren(10, "Author").ToString());
+        }
+
+        
+        /// <summary>
+        /// Runs a <see cref="MatchAllDocsQuery"/> on <paramref name="searcher"/> through
+        /// <c>FacetsCollector.Search</c> (passing 10 as the count argument) and returns
+        /// the collector that received the hits.
+        /// </summary>
+        /// <param name="tr">Not used by this method.</param>
+        /// <param name="ir">Not used by this method.</param>
+        /// <param name="searcher">Searcher the query is executed against.</param>
+        /// <returns>The <see cref="FacetsCollector"/> that was passed to the search.</returns>
+        private static FacetsCollector performSearch(TaxonomyReader tr, IndexReader ir, IndexSearcher searcher)
+        {
+            var collector = new FacetsCollector();
+            var matchAll = new MatchAllDocsQuery();
+            FacetsCollector.Search(searcher, matchAll, 10, collector);
+            return collector;
+        }
+
+        /// <summary>
+        /// Adds one document per entry of <c>CATEGORIES</c> to <paramref name="iw"/>. Each
+        /// document holds the facet field plus a stored "content" text field with the value
+        /// "alpha", and is passed through <c>config.Build</c> before being added.
+        /// </summary>
+        /// <param name="tw">Taxonomy writer handed to <c>config.Build</c>.</param>
+        /// <param name="iw">Index writer that receives the built documents.</param>
+        /// <param name="config">Facets configuration used to build each document.</param>
+        private static void seedIndex(TaxonomyWriter tw, RandomIndexWriter iw, FacetsConfig config)
+        {
+            for (int i = 0; i < CATEGORIES.Length; i++)
+            {
+                var doc = new Document();
+                doc.Add(CATEGORIES[i]);
+                doc.Add(new TextField("content", "alpha", Field.Store.YES));
+
+                var built = config.Build(tw, doc);
+                iw.AddDocument(built);
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/src/Lucene.Net.Tests/core/Facet/TestRandomSamplingFacetsCollector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests/core/Facet/TestRandomSamplingFacetsCollector.cs b/src/Lucene.Net.Tests/core/Facet/TestRandomSamplingFacetsCollector.cs
new file mode 100644
index 0000000..3ada2c5
--- /dev/null
+++ b/src/Lucene.Net.Tests/core/Facet/TestRandomSamplingFacetsCollector.cs
@@ -0,0 +1,154 @@
+using System;
+using System.Diagnostics;
+using Lucene.Net.Randomized.Generators;
+using NUnit.Framework;
+
+namespace Lucene.Net.Facet
+{
+
+    using Document = Lucene.Net.Documents.Document;
+    using Store = Lucene.Net.Documents.Field.Store;
+    using StringField = Lucene.Net.Documents.StringField;
+    using MatchingDocs = Lucene.Net.Facet.FacetsCollector.MatchingDocs;
+    using FastTaxonomyFacetCounts = Lucene.Net.Facet.Taxonomy.FastTaxonomyFacetCounts;
+    using TaxonomyReader = Lucene.Net.Facet.Taxonomy.TaxonomyReader;
+    using DirectoryTaxonomyReader = Lucene.Net.Facet.Taxonomy.Directory.DirectoryTaxonomyReader;
+    using DirectoryTaxonomyWriter = Lucene.Net.Facet.Taxonomy.Directory.DirectoryTaxonomyWriter;
+    using RandomIndexWriter = Lucene.Net.Index.RandomIndexWriter;
+    using Term = Lucene.Net.Index.Term;
+    using IndexSearcher = Lucene.Net.Search.IndexSearcher;
+    using MultiCollector = Lucene.Net.Search.MultiCollector;
+    using TermQuery = Lucene.Net.Search.TermQuery;
+    using Directory = Lucene.Net.Store.Directory;
+    using IOUtils = Lucene.Net.Util.IOUtils;
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    public class TestRandomSamplingFacetsCollector : FacetTestCase
+    {
+
+        /// <summary>
+        /// Indexes <c>AtLeast(10000)</c> documents carrying an "EvenOdd" string field and an
+        /// "iMod10" facet, then exercises <see cref="RandomSamplingFacetsCollector"/>:
+        /// a query without hits must yield only matching-docs entries with zero hits; a
+        /// sample size equal to the number of documents must give the same top children as
+        /// a plain <see cref="FacetsCollector"/>; and a sample of one tenth of the documents
+        /// must give amortized counts whose mean and standard deviation satisfy the
+        /// assertions at the end of the method.
+        /// </summary>
+        [Test]
+        public virtual void TestRandomSampling()
+        {
+            Directory dir = NewDirectory();
+            Directory taxoDir = NewDirectory();
+
+            var taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
+            var writer = new RandomIndexWriter(Random(), dir);
+
+            var config = new FacetsConfig();
+
+            int numDocs = AtLeast(10000);
+            for (int docNum = 0; docNum < numDocs; docNum++)
+            {
+                string parity = (docNum % 2 == 0) ? "even" : "odd";
+
+                var doc = new Document();
+                doc.Add(new StringField("EvenOdd", parity, Store.NO));
+                doc.Add(new FacetField("iMod10", Convert.ToString(docNum % 10)));
+                writer.AddDocument(config.Build(taxoWriter, doc));
+            }
+            Random random = Random();
+
+            // NRT open: readers are obtained from the writers, which are closed afterwards.
+            IndexSearcher searcher = NewSearcher(writer.Reader);
+            DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
+            IOUtils.Close(writer, taxoWriter);
+
+            // ---- Part 1: a query that matches nothing ----
+            var emptySampler = new RandomSamplingFacetsCollector(numDocs / 10, random.NextLong());
+
+            // Searching must not run into a division by zero ...
+            searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), emptySampler);
+
+            // ... and the matching docs must still be non-null ...
+            Assert.NotNull(emptySampler.GetMatchingDocs);
+
+            // ... with every entry reporting zero hits.
+            foreach (MatchingDocs matching in emptySampler.GetMatchingDocs)
+            {
+                Assert.AreEqual(0, matching.totalHits);
+            }
+
+            // ---- Part 2: a query that selects half of the documents ----
+            var evenQuery = new TermQuery(new Term("EvenOdd", "even"));
+
+            // Only even documents are hits, so there will be 5 facet values
+            // (0, 2, 4, 6 and 8). There is a REAL small chance that sampling misses one
+            // of the 5 values: 0.8 (chance not to take a value) ^ 2000 * 5 (any can be
+            // missing) ~ 10^-193, so that is probably not going to happen.
+            int maxNumChildren = 5;
+
+            var sampleAll = new RandomSamplingFacetsCollector(numDocs, random.NextLong()); // no sampling
+            var sampleTenth = new RandomSamplingFacetsCollector(numDocs / 10, random.NextLong()); // 10 % of total docs, 20% of the hits
+
+            var exactCollector = new FacetsCollector();
+
+            // Feed all three collectors from a single search.
+            searcher.Search(evenQuery, MultiCollector.Wrap(exactCollector, sampleAll, sampleTenth));
+
+            var sampleTenthCounts = new FastTaxonomyFacetCounts(taxoReader, config, sampleTenth);
+            var sampleAllCounts = new FastTaxonomyFacetCounts(taxoReader, config, sampleAll);
+            var exactCounts = new FastTaxonomyFacetCounts(taxoReader, config, exactCollector);
+
+            FacetResult sampleTenthTop = sampleTenthCounts.GetTopChildren(10, "iMod10");
+            FacetResult amortizedResult = sampleTenth.AmortizeFacetCounts(sampleTenthTop, config, searcher);
+            FacetResult sampleAllResult = sampleAllCounts.GetTopChildren(10, "iMod10");
+            FacetResult exactResult = exactCounts.GetTopChildren(10, "iMod10");
+
+            // A sample size equal to the number of documents must match the exact counts.
+            Assert.AreEqual(sampleAllResult, exactResult);
+
+            // Five children are expected; fewer are possible with a tiny probability
+            // (see above), but there must be at least one.
+            Assert.True(amortizedResult.childCount <= maxNumChildren);
+            Assert.True(amortizedResult.childCount >= 1);
+
+            // Compute mean and standard deviation of the amortized counts to judge whether
+            // the sampled result is 'ok'; the values vary between runs because of sampling.
+            int sum = 0;
+            foreach (LabelAndValue labelAndValue in amortizedResult.labelValues)
+            {
+                sum += (int)labelAndValue.value;
+            }
+            float mu = (float)sum / (float)maxNumChildren;
+
+            float variance = 0;
+            foreach (LabelAndValue labelAndValue in amortizedResult.labelValues)
+            {
+                variance += (float)Math.Pow((mu - (int)labelAndValue.value), 2);
+            }
+            variance /= maxNumChildren;
+            float sigma = (float)Math.Sqrt(variance);
+
+            // Half of the documents are queried and spread over 5 categories, so the
+            // average number of docs per category is the total divided by 5*2.
+            float targetMu = numDocs / (5.0f * 2.0f);
+
+            // The standard deviation must not be too great and the mean must lie within
+            // three standard deviations of the target.
+            Assert.True(sigma < 200);
+            Assert.True(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);
+
+            IOUtils.Close(searcher.IndexReader, taxoReader, dir, taxoDir);
+        }
+
+    }
+
+}
\ No newline at end of file