You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/08/06 17:59:08 UTC
[10/33] lucenenet git commit: Ported Lucene.Net.Benchmark + tests
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/SortableSingleDocSource.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/SortableSingleDocSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/SortableSingleDocSource.cs
new file mode 100644
index 0000000..c83828c
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/SortableSingleDocSource.cs
@@ -0,0 +1,114 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Adds fields appropriate for sorting: country, random_string and sort_field
+ /// (int). Supports the following parameters:
+ /// <list type="bullet">
+ /// <item><term><b>sort.rng</b></term><description>defines the range for sort-by-int field (default <b>20000</b>).</description></item>
+ /// <item><term><b>rand.seed</b></term><description>defines the seed to initialize Random with (default <b>13</b>).</description></item>
+ /// </list>
+ /// </summary>
+ public class SortableSingleDocSource : SingleDocSource
+ {
+ // Country names from which GetNextDocData picks the "country" property of a document.
+ // Declared readonly so the shared array reference cannot be reassigned.
+ // The S\u00E3o Tom\u00E9 entry is spelled with \u escapes so it does not depend on the source file encoding.
+ // NOTE(review): the entry "alestinian flag West Bank and Gaza" looks like a scraping artifact; it is left
+ // unchanged here -- confirm whether it should be corrected.
+ private static readonly string[] COUNTRIES = new string[] {
+     "European Union", "United States", "Japan", "Germany", "China (PRC)",
+     "United Kingdom", "France", "Italy", "Spain", "Canada", "Brazil", "Russia",
+     "India", "South Korea", "Australia", "Mexico", "Netherlands", "Turkey",
+     "Sweden", "Belgium", "Indonesia", "Switzerland", "Poland", "Norway",
+     "Republic of China", "Saudi Arabia", "Austria", "Greece", "Denmark", "Iran",
+     "South Africa", "Argentina", "Ireland", "Thailand", "Finland", "Venezuela",
+     "Portugal", "Hong Kong", "United Arab Emirates", "Malaysia",
+     "Czech Republic", "Colombia", "Nigeria", "Romania", "Chile", "Israel",
+     "Singapore", "Philippines", "Pakistan", "Ukraine", "Hungary", "Algeria",
+     "New Zealand", "Egypt", "Kuwait", "Peru", "Kazakhstan", "Slovakia",
+     "Morocco", "Bangladesh", "Vietnam", "Qatar", "Angola", "Libya", "Iraq",
+     "Croatia", "Luxembourg", "Sudan", "Slovenia", "Cuba", "Belarus", "Ecuador",
+     "Serbia", "Oman", "Bulgaria", "Lithuania", "Syria", "Dominican Republic",
+     "Tunisia", "Guatemala", "Azerbaijan", "Sri Lanka", "Kenya", "Latvia",
+     "Turkmenistan", "Costa Rica", "Lebanon", "Uruguay", "Uzbekistan", "Yemen",
+     "Cyprus", "Estonia", "Trinidad and Tobago", "Cameroon", "El Salvador",
+     "Iceland", "Panama", "Bahrain", "Ivory Coast", "Ethiopia", "Tanzania",
+     "Jordan", "Ghana", "Bosnia and Herzegovina", "Macau", "Burma", "Bolivia",
+     "Brunei", "Botswana", "Honduras", "Gabon", "Uganda", "Jamaica", "Zambia",
+     "Senegal", "Paraguay", "Albania", "Equatorial Guinea", "Georgia",
+     "Democratic Republic of the Congo", "Nepal", "Afghanistan", "Cambodia",
+     "Armenia", "Republic of the Congo", "Mozambique", "Republic of Macedonia",
+     "Malta", "Namibia", "Madagascar", "Chad", "Burkina Faso", "Mauritius",
+     "Mali", "The Bahamas", "Papua New Guinea", "Nicaragua", "Haiti", "Benin",
+     "alestinian flag West Bank and Gaza", "Jersey", "Fiji", "Guinea", "Moldova",
+     "Niger", "Laos", "Mongolia", "French Polynesia", "Kyrgyzstan", "Barbados",
+     "Tajikistan", "Malawi", "Liechtenstein", "New Caledonia", "Kosovo",
+     "Rwanda", "Montenegro", "Swaziland", "Guam", "Mauritania", "Guernsey",
+     "Isle of Man", "Togo", "Somalia", "Suriname", "Aruba", "North Korea",
+     "Zimbabwe", "Central African Republic", "Faroe Islands", "Greenland",
+     "Sierra Leone", "Lesotho", "Cape Verde", "Eritrea", "Bhutan", "Belize",
+     "Antigua and Barbuda", "Gibraltar", "Maldives", "San Marino", "Guyana",
+     "Burundi", "Saint Lucia", "Djibouti", "British Virgin Islands", "Liberia",
+     "Seychelles", "The Gambia", "Northern Mariana Islands", "Grenada",
+     "Saint Vincent and the Grenadines", "Saint Kitts and Nevis", "East Timor",
+     "Vanuatu", "Comoros", "Samoa", "Solomon Islands", "Guinea-Bissau",
+     "American Samoa", "Dominica", "Micronesia", "Tonga", "Cook Islands",
+     "Palau", "Marshall Islands", "S\u00E3o Tom\u00E9 and Pr\u00EDncipe", "Anguilla",
+     "Kiribati", "Tuvalu", "Niue" };
+
+ // Exclusive upper bound passed to Random.Next for the "sort_field" value; read from "sort.rng" in SetConfig.
+ private int sortRange;
+ // Random source created in SetConfig from the "rand.seed" option; not assigned before SetConfig runs.
+ private Random r;
+
+ /// <summary>
+ /// Obtains the next document from the base source and replaces its properties with three
+ /// generated ones: <c>sort_field</c> (random int below the configured range),
+ /// <c>random_string</c> (2 to 19 random chars below 0x80) and <c>country</c>
+ /// (random entry of the country table).
+ /// </summary>
+ /// <param name="docData">Instance handed to the base implementation.</param>
+ /// <returns>The doc data returned by the base implementation, with <c>Props</c> replaced.</returns>
+ public override DocData GetNextDocData(DocData docData)
+ {
+     docData = base.GetNextDocData(docData);
+
+     // random int
+     string sortValue = r.Next(sortRange).ToString(CultureInfo.InvariantCulture);
+
+     // random string
+     char[] chars = new char[NextInt32(2, 20)];
+     for (int pos = 0; pos < chars.Length; pos++)
+     {
+         chars[pos] = (char)r.Next(0x80);
+     }
+     string randomString = new string(chars);
+
+     // random country
+     string country = COUNTRIES[r.Next(COUNTRIES.Length)];
+
+     docData.Props = new Dictionary<string, string>
+     {
+         { "sort_field", sortValue },
+         { "random_string", randomString },
+         { "country", country }
+     };
+     return docData;
+ }
+
+ // Returns start plus a random offset drawn from r.Next(end - start), i.e. a value in [start, end).
+ private int NextInt32(int start, int end)
+ {
+     int span = end - start;
+     int offset = r.Next(span);
+     return start + offset;
+ }
+
+ /// <summary>
+ /// Applies the base configuration, then reads <c>sort.rng</c> (default 20000) and creates the
+ /// random generator from <c>rand.seed</c> (default 13).
+ /// </summary>
+ public override void SetConfig(Config config)
+ {
+     base.SetConfig(config);
+
+     int configuredRange = config.Get("sort.rng", 20000);
+     sortRange = configuredRange;
+
+     int seed = config.Get("rand.seed", 13);
+     r = new Random(seed);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/SpatialDocMaker.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/SpatialDocMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/SpatialDocMaker.cs
new file mode 100644
index 0000000..7879cd8
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/SpatialDocMaker.cs
@@ -0,0 +1,249 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Documents;
+using Lucene.Net.Spatial;
+using Lucene.Net.Spatial.Prefix;
+using Lucene.Net.Spatial.Prefix.Tree;
+using Lucene.Net.Support;
+using Spatial4n.Core.Context;
+using Spatial4n.Core.Shapes;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Indexes spatial data according to a configured <see cref="SpatialStrategy"/> with optional
+ /// shape transformation via a configured <see cref="IShapeConverter"/>. The converter can turn points into
+ /// circles and bounding boxes, in order to vary the type of indexing performance tests.
+ /// Unless it's subclass-ed to do otherwise, this class configures a <see cref="SpatialContext"/>,
+ /// <see cref="SpatialPrefixTree"/>, and <see cref="RecursivePrefixTreeStrategy"/>. The Strategy is made
+ /// available to a query maker via the static method <see cref="GetSpatialStrategy(int)"/>.
+ /// See spatial.alg for a listing of spatial parameters, in particular those starting with "spatial."
+ /// and "doc.spatial".
+ /// </summary>
+ public class SpatialDocMaker : DocMaker
+ {
+ // Field name handed to the spatial strategy created by this doc maker.
+ public static readonly string SPATIAL_FIELD = "spatial";
+
+ //cache spatialStrategy by round number
+ // NOTE(review): a plain Dictionary is not synchronized -- confirm it is never touched by several threads at once.
+ private static IDictionary<int, SpatialStrategy> spatialStrategyCache = new Dictionary<int, SpatialStrategy>();
+
+ // Strategy and shape converter used by MakeDocument; both are assigned in SetConfig.
+ private SpatialStrategy strategy;
+ private IShapeConverter shapeConverter;
+
+ /// <summary>
+ /// Returns the <see cref="SpatialStrategy"/> cached for the given round number
+ /// (see <see cref="Config.RoundNumber"/>). The entry is stored when
+ /// <see cref="SpatialDocMaker"/> is configured for that round.
+ /// </summary>
+ /// <param name="roundNumber">Round whose strategy is requested.</param>
+ /// <exception cref="InvalidOperationException">No non-null strategy is cached for the round.</exception>
+ public static SpatialStrategy GetSpatialStrategy(int roundNumber)
+ {
+     SpatialStrategy cached;
+     bool found = spatialStrategyCache.TryGetValue(roundNumber, out cached);
+     if (found && cached != null)
+     {
+         return cached;
+     }
+     throw new InvalidOperationException("Strategy should have been init'ed by SpatialDocMaker by now");
+ }
+
+ /// <summary>
+ /// Creates a <see cref="SpatialStrategy"/> from the benchmark configuration: wraps the config in a
+ /// dictionary view, creates the <see cref="SpatialContext"/> from it and delegates to the
+ /// three-argument overload.
+ /// </summary>
+ /// <param name="config">Benchmark configuration to read from.</param>
+ protected virtual SpatialStrategy MakeSpatialStrategy(Config config)
+ {
+     //A Map view of Config that prefixes keys with "spatial."
+     var spatialOptions = new DictionaryAnonymousHelper(config);
+
+     //Some day the strategy might be initialized with a factory but such a factory
+     // is non-existent.
+     return MakeSpatialStrategy(
+         config,
+         spatialOptions,
+         SpatialContextFactory.MakeSpatialContext(spatialOptions /*, null*/)); // LUCENENET TODO: What is this extra param?
+ }
+
+ /// <summary>
+ /// Dictionary view over a <see cref="Config"/>: looking up <c>key</c> returns the config value
+ /// stored under <c>"spatial." + key</c>, or <c>null</c> when the config has no such value.
+ /// </summary>
+ /// <remarks>
+ /// Hiding a member with <c>new</c> does not affect calls made through an interface reference, so
+ /// the class re-implements <see cref="IDictionary{TKey, TValue}"/>. Lookups made through the
+ /// interface therefore also consult the config instead of the (empty) base dictionary.
+ /// Enumeration, <c>Count</c>, <c>Keys</c> and <c>Values</c> still reflect the base dictionary only.
+ /// </remarks>
+ private class DictionaryAnonymousHelper : Dictionary<string, string>, IDictionary<string, string>
+ {
+     private readonly Config config;
+     public DictionaryAnonymousHelper(Config config)
+     {
+         this.config = config;
+     }
+
+     // LUCENENET TODO: EntrySet not supported. Should we throw on GetEnumerator()?
+
+     new public string this[string key]
+     {
+         get { return config.Get("spatial." + key, null); }
+     }
+
+     // The interface indexer needs a setter; writes keep going to the base dictionary as before.
+     string IDictionary<string, string>.this[string key]
+     {
+         get { return this[key]; }
+         set { base[key] = value; }
+     }
+
+     // A key counts as present when the config returns a non-null value for it.
+     new public bool ContainsKey(string key)
+     {
+         return this[key] != null;
+     }
+
+     // Picked up as the IDictionary.TryGetValue implementation because the interface is re-listed above.
+     new public bool TryGetValue(string key, out string value)
+     {
+         value = this[key];
+         return value != null;
+     }
+ }
+
+ /// <summary>
+ /// Builds a <see cref="RecursivePrefixTreeStrategy"/> on a prefix tree created from
+ /// <paramref name="configMap"/> and <paramref name="ctx"/>, then applies
+ /// <c>query.spatial.prefixGridScanLevel</c> (default -4; negative values are added to the tree's
+ /// <c>MaxLevels</c>) and <c>spatial.distErrPct</c> (default 0.025).
+ /// </summary>
+ protected virtual SpatialStrategy MakeSpatialStrategy(Config config, IDictionary<string, string> configMap,
+     SpatialContext ctx)
+ {
+     //A factory for the prefix tree grid
+     SpatialPrefixTree prefixTree = SpatialPrefixTreeFactory.MakeSPT(configMap, /*null,*/ ctx); // LUCENENET TODO: What is this extra param?
+
+     RecursivePrefixTreeStrategy result = new RecursivePrefixTreeStrategyAnonymousHelper(prefixTree, SPATIAL_FIELD, config);
+
+     int scanLevel = config.Get("query.spatial.prefixGridScanLevel", -4);
+     if (scanLevel < 0)
+     {
+         scanLevel += prefixTree.MaxLevels;
+     }
+     result.PrefixGridScanLevel = scanLevel;
+
+     //doc & query; a default
+     result.DistErrPct = config.Get("spatial.distErrPct", .025);
+     return result;
+ }
+
+ // Strategy subclass whose only addition is setting m_pointsOnly from the "spatial.docPointsOnly" option.
+ private class RecursivePrefixTreeStrategyAnonymousHelper : RecursivePrefixTreeStrategy
+ {
+     public RecursivePrefixTreeStrategyAnonymousHelper(SpatialPrefixTree grid, string fieldName, Config config)
+         : base(grid, fieldName)
+     {
+         bool docPointsOnly = config.Get("spatial.docPointsOnly", false);
+         m_pointsOnly = docPointsOnly;
+     }
+ }
+
+ /// <summary>
+ /// Applies the base configuration and makes sure this instance uses the spatial strategy that is
+ /// cached for the current round.
+ /// </summary>
+ /// <remarks>
+ /// When no strategy is cached for the round, one is created, cached and announced on the console.
+ /// When one is cached but this instance holds a different (or no) strategy, the cached strategy is
+ /// adopted and the shape converter rebuilt. This keeps <see cref="MakeDocument()"/> from running
+ /// with unassigned fields on an instance configured after the cache entry was created.
+ /// </remarks>
+ public override void SetConfig(Config config, ContentSource source)
+ {
+     base.SetConfig(config, source);
+     SpatialStrategy existing;
+     if (!spatialStrategyCache.TryGetValue(config.RoundNumber, out existing) || existing == null)
+     {
+         //new round; we need to re-initialize
+         strategy = MakeSpatialStrategy(config);
+         spatialStrategyCache[config.RoundNumber] = strategy;
+         //TODO remove previous round config?
+         shapeConverter = MakeShapeConverter(strategy, config, "doc.spatial.");
+         SystemConsole.WriteLine("Spatial Strategy: " + strategy);
+     }
+     else if (!ReferenceEquals(strategy, existing))
+     {
+         // Cache hit, but this instance is not yet bound to the round's strategy.
+         strategy = existing;
+         shapeConverter = MakeShapeConverter(strategy, config, "doc.spatial.");
+     }
+ }
+
+ /// <summary>
+ /// Creates a converter that optionally turns points into circles and optionally replaces the
+ /// result by its bounding box. Reads <c>radiusDegrees</c>, <c>radiusDegreesRandPlusMinus</c> and
+ /// <c>bbox</c>, each prefixed with <paramref name="configKeyPrefix"/>.
+ /// </summary>
+ public static IShapeConverter MakeShapeConverter(SpatialStrategy spatialStrategy,
+     Config config, string configKeyPrefix)
+ {
+     //by default does no conversion
+     return new ShapeConverterAnonymousHelper(
+         spatialStrategy,
+         config.Get(configKeyPrefix + "radiusDegrees", 0.0),
+         config.Get(configKeyPrefix + "radiusDegreesRandPlusMinus", 0.0),
+         config.Get(configKeyPrefix + "bbox", false));
+ }
+
+ // IShapeConverter that widens points into circles and/or reduces shapes to their bounding box.
+ private class ShapeConverterAnonymousHelper : IShapeConverter
+ {
+     private readonly SpatialStrategy spatialStrategy;
+     private readonly double radiusDegrees;
+     private readonly double plusMinus;
+     private readonly bool bbox;
+
+     public ShapeConverterAnonymousHelper(SpatialStrategy spatialStrategy, double radiusDegrees, double plusMinus, bool bbox)
+     {
+         this.bbox = bbox;
+         this.plusMinus = plusMinus;
+         this.radiusDegrees = radiusDegrees;
+         this.spatialStrategy = spatialStrategy;
+     }
+
+     // Points become circles when a radius or a random spread is configured; with bbox set the
+     // (possibly converted) shape is replaced by its bounding box.
+     public IShape Convert(IShape shape)
+     {
+         IShape converted = shape;
+         IPoint center = shape as IPoint;
+         bool circlesRequested = radiusDegrees != 0.0 || plusMinus != 0.0;
+
+         if (center != null && circlesRequested)
+         {
+             double radius = radiusDegrees;
+             if (plusMinus > 0.0)
+             {
+                 //use hashCode so it's reproducibly random
+                 Random seeded = new Random(center.GetHashCode());
+                 double spread = seeded.NextDouble() * 2 * plusMinus - plusMinus;
+                 //can happen if configured plusMinus > radiusDegrees
+                 radius = Math.Abs(radius + spread);
+             }
+             converted = spatialStrategy.SpatialContext.MakeCircle(center, radius);
+         }
+
+         if (!bbox)
+         {
+             return converted;
+         }
+         return converted.BoundingBox;
+     }
+ }
+
+ // LUCENENET specific: de-nested IShapeConverter
+
+ /// <summary>
+ /// Creates the document via the base class, parses the body field as a shape and, when a shape
+ /// was obtained, converts it and adds the strategy's indexable fields to the document.
+ /// </summary>
+ public override Document MakeDocument()
+ {
+     // Fetch the doc state before the base call, as the original ordering does.
+     DocState state = GetDocState();
+     Document document = base.MakeDocument();
+
+     // Set SPATIAL_FIELD from body
+     DocData data = state.docData;
+     // makeDocument() resets docState.getBody() so we can't look there; look in Document
+     string wkt = document.GetField(DocMaker.BODY_FIELD).GetStringValue();
+     IShape parsed = MakeShapeFromString(strategy, data.Name, wkt);
+     if (parsed == null)
+     {
+         return document;
+     }
+
+     IShape toIndex = shapeConverter.Convert(parsed);
+     //index
+     foreach (Field indexable in strategy.CreateIndexableFields(toIndex))
+     {
+         document.Add(indexable);
+     }
+     return document;
+ }
+
+ /// <summary>
+ /// Parses <paramref name="shapeStr"/> as WKT using the strategy's spatial context.
+ /// </summary>
+ /// <param name="strategy">Strategy whose <c>SpatialContext</c> performs the parsing.</param>
+ /// <param name="name">Document name, used only in the error message.</param>
+ /// <param name="shapeStr">Text to parse; may be <c>null</c> or empty.</param>
+ /// <returns>The parsed shape, or <c>null</c> when the text is null/empty or parsing threw
+ /// (the failure is written to the error console).</returns>
+ public static IShape MakeShapeFromString(SpatialStrategy strategy, string name, string shapeStr)
+ {
+     if (string.IsNullOrEmpty(shapeStr))
+     {
+         return null;
+     }
+
+     try
+     {
+         return strategy.SpatialContext.ReadShapeFromWkt(shapeStr);
+     }
+     catch (Exception e)
+     {//InvalidShapeException TODO
+         SystemConsole.Error.WriteLine("Shape " + name + " wasn't parseable: " + e + " (skipping it)");
+     }
+     return null;
+ }
+
+ /// <summary>
+ /// The overload taking a size is not supported by this doc maker.
+ /// </summary>
+ /// <exception cref="NotSupportedException">Always thrown.</exception>
+ public override Document MakeDocument(int size)
+ {
+ //TODO consider abusing the 'size' notion to number of shapes per document
+ throw new NotSupportedException();
+ }
+ }
+
+ /// <summary>
+ /// Converts one shape to another. Created by
+ /// <see cref="SpatialDocMaker.MakeShapeConverter(SpatialStrategy, Config, string)"/>.
+ /// </summary>
+ public interface IShapeConverter
+ {
+ /// <summary>
+ /// Returns the shape to use in place of <paramref name="shape"/>; an implementation may
+ /// return the input unchanged.
+ /// </summary>
+ IShape Convert(IShape shape);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/SpatialFileQueryMaker.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/SpatialFileQueryMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/SpatialFileQueryMaker.cs
new file mode 100644
index 0000000..d583d22
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/SpatialFileQueryMaker.cs
@@ -0,0 +1,131 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Queries;
+using Lucene.Net.Queries.Function;
+using Lucene.Net.Search;
+using Lucene.Net.Spatial;
+using Lucene.Net.Spatial.Queries;
+using Spatial4n.Core.Shapes;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Reads spatial data from the body field docs from an internally created <see cref="LineDocSource"/>.
+ /// It's parsed by <see cref="Spatial4n.Core.Context.SpatialContext.ReadShapeFromWkt(string)"/> and then
+ /// further manipulated via a configurable <see cref="IShapeConverter"/>. When using point
+ /// data, it's likely you'll want to configure the shape converter so that the query shapes actually
+ /// cover a region. The queries are all created and cached in advance. This query maker works in
+ /// conjunction with <see cref="SpatialDocMaker"/>. See spatial.alg for a listing of options, in
+ /// particular the options starting with "query.".
+ /// </summary>
+ public class SpatialFileQueryMaker : AbstractQueryMaker
+ {
+ // Strategy of the current round, obtained from SpatialDocMaker in SetConfig.
+ protected SpatialStrategy m_strategy;
+ protected double m_distErrPct;//NaN if not set
+ // Spatial predicate looked up from the "query.spatial.predicate" option.
+ protected SpatialOperation m_operation;
+ // When true, MakeQueryFromShape builds a distance-scoring query instead of a filter-based one.
+ protected bool m_score;
+
+ // Converter applied to every query shape; built from the "query.spatial." options.
+ protected IShapeConverter m_shapeConverter;
+
+ /// <summary>
+ /// Reads the query-side spatial options (strategy of the current round, shape converter,
+ /// <c>query.spatial.distErrPct</c>, <c>query.spatial.predicate</c>, <c>query.spatial.score</c>)
+ /// and only then configures the base class.
+ /// </summary>
+ public override void SetConfig(Config config)
+ {
+     SpatialStrategy roundStrategy = SpatialDocMaker.GetSpatialStrategy(config.RoundNumber);
+     m_strategy = roundStrategy;
+     m_shapeConverter = SpatialDocMaker.MakeShapeConverter(roundStrategy, config, "query.spatial.");
+
+     m_distErrPct = config.Get("query.spatial.distErrPct", double.NaN);
+     string predicateName = config.Get("query.spatial.predicate", "Intersects");
+     m_operation = SpatialOperation.Get(predicateName);
+     m_score = config.Get("query.spatial.score", false);
+
+     //call last, will call prepareQueries()
+     base.SetConfig(config);
+ }
+
+ /// <summary>
+ /// Reads up to <c>query.file.maxQueries</c> (default 1000) parseable shapes from the line file
+ /// named by <c>query.file</c> and turns each into a query. Lines whose body yields no shape do not
+ /// count towards the limit. Reading stops early when the source reports no more data.
+ /// </summary>
+ protected override Query[] PrepareQueries()
+ {
+     int maxQueries = m_config.Get("query.file.maxQueries", 1000);
+
+     Config sourceConfig = new Config(new Dictionary<string, string>());
+     sourceConfig.Set("docs.file", m_config.Get("query.file", null));
+     sourceConfig.Set("line.parser", m_config.Get("query.file.line.parser", null));
+     sourceConfig.Set("content.source.forever", "false");
+
+     List<Query> collected = new List<Query>();
+     LineDocSource lineSource = new LineDocSource();
+     try
+     {
+         lineSource.SetConfig(sourceConfig);
+         lineSource.ResetInputs();
+         DocData current = new DocData();
+         // One query is added per accepted shape, so the list size is the loop counter.
+         while (collected.Count < maxQueries)
+         {
+             current = lineSource.GetNextDocData(current);
+             IShape parsed = SpatialDocMaker.MakeShapeFromString(m_strategy, current.Name, current.Body);
+             if (parsed == null)
+             {
+                 continue;//skip
+             }
+             IShape queryShape = m_shapeConverter.Convert(parsed);
+             collected.Add(MakeQueryFromShape(queryShape));
+         }
+     }
+     catch (NoMoreDataException)
+     {
+         //all-done
+     }
+     finally
+     {
+         lineSource.Dispose();
+     }
+     return collected.ToArray();
+ }
+
+
+ /// <summary>
+ /// Builds the query for one shape. With scoring enabled the strategy's query is wrapped in a
+ /// <see cref="CustomScoreQuery"/> driven by the distance to the shape's center. Otherwise the
+ /// strategy's filter is used: its inner query if it is a <see cref="QueryWrapperFilter"/>, or a
+ /// <see cref="ConstantScoreQuery"/> around the filter.
+ /// </summary>
+ protected virtual Query MakeQueryFromShape(IShape shape)
+ {
+     SpatialArgs spatialArgs = new SpatialArgs(m_operation, shape);
+     if (!double.IsNaN(m_distErrPct))
+     {
+         spatialArgs.DistErrPct = m_distErrPct;
+     }
+
+     if (!m_score)
+     {
+         //strategy.makeQuery() could potentially score (isn't well defined) so instead we call
+         // makeFilter() and wrap
+         Filter filter = m_strategy.MakeFilter(spatialArgs);
+         QueryWrapperFilter wrapper = filter as QueryWrapperFilter;
+         if (wrapper != null)
+         {
+             return wrapper.Query;
+         }
+         return new ConstantScoreQuery(filter);
+     }
+
+     ValueSource distanceSource = m_strategy.MakeDistanceValueSource(shape.Center);
+     return new CustomScoreQuery(m_strategy.MakeQuery(spatialArgs), new FunctionQuery(distanceSource));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecContentSource.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecContentSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecContentSource.cs
new file mode 100644
index 0000000..d84a25d
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecContentSource.cs
@@ -0,0 +1,350 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Text;
+using System.Threading;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Implements a <see cref="ContentSource"/> over the TREC collection.
+ /// </summary>
+ /// <remarks>
+ /// Supports the following configuration parameters (on top of
+ /// <see cref="ContentSource"/>):
+ /// <list type="bullet">
+ /// <item><term>work.dir</term><description>specifies the working directory. Required if "docs.dir"
+ /// denotes a relative path (<b>default=work</b>).</description></item>
+ /// <item><term>docs.dir</term><description>specifies the directory where the TREC files reside.
+ /// Can be set to a relative path if "work.dir" is also specified
+ /// (<b>default=trec</b>).
+ /// </description></item>
+ /// <item><term>trec.doc.parser</term><description>specifies the <see cref="TrecDocParser"/> class to use for
+ /// parsing the TREC documents content (<b>default=TrecGov2Parser</b>).
+ /// </description></item>
+ /// <item><term>html.parser</term><description>specifies the <see cref="IHTMLParser"/> class to use for
+ /// parsing the HTML parts of the TREC documents content (<b>default=DemoHTMLParser</b>).
+ /// </description></item>
+ /// <item><term>content.source.encoding</term><description>if not specified, ISO-8859-1 is used.</description></item>
+ /// <item><term>content.source.excludeIteration</term><description>if <c>true</c>, do not append iteration number to docname</description></item>
+ /// </list>
+ /// </remarks>
+ public class TrecContentSource : ContentSource
+ {
+ // LUCENENET specific - DateFormatInfo not used
+
+ // Line prefixes that GetNextDocData scans for while splitting the input into documents.
+ public static readonly string DOCNO = "<DOCNO>";
+ public static readonly string TERMINATING_DOCNO = "</DOCNO>";
+ public static readonly string DOC = "<DOC>";
+ public static readonly string TERMINATING_DOC = "</DOC>";
+
+ /// <summary>separator between lines in the buffer</summary>
+ public static readonly string NEW_LINE = Environment.NewLine;
+
+ // Exact formats tried by ParseDate before it falls back to DateTime.TryParse.
+ // The hh:mm:ss patterns use "HH" (24-hour clock): the sample dates carry hours such as 22 and 16,
+ // which the 12-hour "hh" specifier rejects.
+ // NOTE(review): "K" matches offsets such as "Z" or "+02:00"; confirm whether zone names like
+ // "GMT"/"EST"/"u.t.c." in the samples are expected to be handled by the TryParse fallback.
+ private static readonly string[] DATE_FORMATS = {
+     // LUCENENET specific: in JAVA, they don't care if it is an abbreviated or a full month name when parsing
+     // so we provide definitions for both ways.
+     "ddd, dd MMM yyyy HH:mm:ss K", // Tue, 09 Dec 2003 22:39:08 GMT
+     "ddd, dd MMMM yyyy HH:mm:ss K", // Tue, 09 December 2003 22:39:08 GMT
+     "ddd MMM dd HH:mm:ss yyyy K", // Tue Dec 09 16:45:08 2003 EST
+     "ddd MMMM dd HH:mm:ss yyyy K", // Tue December 09 16:45:08 2003 EST
+     "ddd, dd-MMM-':'y HH:mm:ss K", // Tue, 09 Dec 2003 22:39:08 GMT
+     "ddd, dd-MMMM-':'y HH:mm:ss K", // Tue, 09 December 2003 22:39:08 GMT
+     "ddd, dd-MMM-yyy HH:mm:ss K", // Tue, 09 Dec 2003 22:39:08 GMT
+     "ddd, dd-MMMM-yyy HH:mm:ss K", // Tue, 09 December 2003 22:39:08 GMT
+     "ddd MMM dd HH:mm:ss yyyy", // Tue Dec 09 16:45:08 2003
+     "ddd MMMM dd HH:mm:ss yyyy", // Tue December 09 16:45:08 2003
+     "dd MMM yyyy", // 1 Mar 1994
+     "dd MMMM yyyy", // 1 March 1994
+     "MMM dd, yyyy", // Feb 3, 1994
+     "MMMM dd, yyyy", // February 3, 1994
+     "yyMMdd", // 910513
+     "hhmm K.K.K. MMM dd, yyyy", // 0901 u.t.c. Apr 28, 1994
+     "hhmm K.K.K. MMMM dd, yyyy", // 0901 u.t.c. April 28, 1994
+ };
+
+ // Per-thread scratch buffer; created on first use by GetDocBuffer.
+ private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
+ // Directory named by "docs.dir"; assigned in SetConfig.
+ private DirectoryInfo dataDir = null;
+ // Input files collected from dataDir in SetConfig; nextFile indexes the next one to open.
+ private List<FileInfo> inputFiles = new List<FileInfo>();
+ private int nextFile = 0;
+ // Use to synchronize threads on reading from the TREC documents.
+ private object @lock = new object();
+
+ // Required for test
+ // reader is the currently open input (null when none is open); iteration counts completed
+ // passes over inputFiles.
+ internal TextReader reader;
+ internal int iteration = 0;
+ internal IHTMLParser htmlParser;
+
+ // When false, GetNextDocData appends "_" + iteration to each doc name.
+ private bool excludeDocnameIteration;
+ private TrecDocParser trecDocParser = new TrecGov2Parser(); // default
+ internal TrecDocParser.ParsePathType currPathType; // not private for tests
+
+ // Returns the calling thread's StringBuilder, creating and storing it on first use.
+ private StringBuilder GetDocBuffer()
+ {
+     StringBuilder existing = trecDocBuffer.Value;
+     if (existing != null)
+     {
+         return existing;
+     }
+
+     StringBuilder created = new StringBuilder();
+     trecDocBuffer.Value = created;
+     return created;
+ }
+
+ /// <summary>Gets the HTML parser instance created in <c>SetConfig</c>.</summary>
+ internal IHTMLParser HtmlParser
+ {
+     get
+     {
+         return htmlParser;
+     }
+ }
+
+ /// <summary>
+ /// Read until a line starting with the specified <paramref name="lineStart"/>. When the current
+ /// input is exhausted the next file is opened and reading continues there.
+ /// </summary>
+ /// <param name="buf">Buffer for collecting the data if so specified.</param>
+ /// <param name="lineStart">Line start to look for (ordinal comparison). If <c>null</c>, no line
+ /// ever matches and reading continues until the source is exhausted.</param>
+ /// <param name="collectMatchLine">Whether to collect the matching line into <c>buffer</c>.</param>
+ /// <param name="collectAll">Whether to collect all lines into <c>buffer</c>.</param>
+ /// <exception cref="IOException">If there is a low-level I/O error.</exception>
+ /// <exception cref="NoMoreDataException">If the source is exhausted.</exception>
+ private void Read(StringBuilder buf, string lineStart,
+     bool collectMatchLine, bool collectAll)
+ {
+     // Empty before the first collected line, NEW_LINE afterwards, so lines are joined without a
+     // leading or trailing separator.
+     string sep = "";
+     while (true)
+     {
+         string line = reader.ReadLine();
+
+         if (line == null)
+         {
+             // End of the current input: OpenNextFile either installs a new reader or throws.
+             OpenNextFile();
+             continue;
+         }
+
+         if (lineStart != null && line.StartsWith(lineStart, StringComparison.Ordinal))
+         {
+             if (collectMatchLine)
+             {
+                 buf.Append(sep).Append(line);
+                 sep = NEW_LINE;
+             }
+             return;
+         }
+
+         if (collectAll)
+         {
+             buf.Append(sep).Append(line);
+             sep = NEW_LINE;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Disposes the current reader and opens the next input file. After the last file the list is
+ /// restarted (incrementing <c>iteration</c>) unless <c>m_forever</c> is false, in which case
+ /// <see cref="NoMoreDataException"/> is thrown. A file that cannot be opened is skipped with a
+ /// console message in verbose mode; otherwise <see cref="NoMoreDataException"/> is thrown.
+ /// </summary>
+ internal virtual void OpenNextFile()
+ {
+     Dispose();
+     //currPathType = null;
+     for (; ; )
+     {
+         bool exhausted = nextFile >= inputFiles.Count;
+         if (exhausted)
+         {
+             // exhausted files, start a new round, unless forever set to false.
+             if (!m_forever)
+             {
+                 throw new NoMoreDataException();
+             }
+             nextFile = 0;
+             iteration++;
+         }
+
+         FileInfo candidate = inputFiles[nextFile++];
+         if (m_verbose)
+         {
+             SystemConsole.WriteLine("opening: " + candidate + " length: " + candidate.Length);
+         }
+
+         try
+         {
+             // support either gzip, bzip2, or regular text file, by extension
+             Stream rawStream = StreamUtils.GetInputStream(candidate);
+             reader = new StreamReader(rawStream, m_encoding);
+             currPathType = TrecDocParser.PathType(candidate);
+             return;
+         }
+         catch (Exception failure)
+         {
+             if (!m_verbose)
+             {
+                 throw new NoMoreDataException();
+             }
+             SystemConsole.WriteLine("Skipping 'bad' file " + candidate.FullName + " due to " + failure.Message);
+             // fall through to try the next file
+         }
+     }
+ }
+
+ /// <summary>
+ /// Parses a trimmed date string, first against the known exact formats and then with the general
+ /// invariant-culture parser.
+ /// </summary>
+ /// <param name="dateStr">Text to parse; must not be <c>null</c>.</param>
+ /// <returns>The parsed date, or <c>null</c> when neither attempt succeeds (a message is written
+ /// in verbose mode).</returns>
+ public virtual DateTime? ParseDate(string dateStr)
+ {
+     string trimmed = dateStr.Trim();
+     DateTime parsed;
+
+     bool success =
+         DateTime.TryParseExact(trimmed, DATE_FORMATS, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed)
+         || DateTime.TryParse(trimmed, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed);
+     if (success)
+     {
+         return parsed;
+     }
+
+     // do not fail test just because a date could not be parsed
+     if (m_verbose)
+     {
+         SystemConsole.WriteLine("failed to parse date (assigning 'now') for: " + trimmed);
+     }
+     return null;
+ }
+
+ /// <summary>
+ /// Disposes the current reader, if any, and clears the field. An <see cref="IOException"/> from
+ /// the reader is reported in verbose mode and otherwise ignored.
+ /// </summary>
+ /// <param name="disposing">Not consulted by this implementation.</param>
+ protected override void Dispose(bool disposing)
+ {
+     if (reader != null)
+     {
+         try
+         {
+             reader.Dispose();
+         }
+         catch (IOException ioFailure)
+         {
+             if (m_verbose)
+             {
+                 SystemConsole.WriteLine("failed to dispose reader !");
+                 SystemConsole.WriteLine(ioFailure.ToString());
+             }
+         }
+         reader = null;
+     }
+ }
+
+ /// <summary>
+ /// Reads the next TREC document: skips to the document start, extracts the document number as
+ /// the name (with "_" + iteration appended unless excluded), collects the text up to the
+ /// terminating tag and passes it to the configured <see cref="TrecDocParser"/>.
+ /// </summary>
+ /// <param name="docData">Instance passed on to the parser.</param>
+ /// <returns>The doc data produced by the parser.</returns>
+ public override DocData GetNextDocData(DocData docData)
+ {
+     string docName = null;
+     StringBuilder buffer = GetDocBuffer();
+     TrecDocParser.ParsePathType pathTypeForDoc;
+
+     // protect reading from the TREC files by multiple threads. The rest of the
+     // method, i.e., parsing the content and returning the DocData can run unprotected.
+     lock (@lock)
+     {
+         if (reader == null)
+         {
+             OpenNextFile();
+         }
+
+         // 1. skip until doc start - required for all TREC formats
+         buffer.Length = 0;
+         Read(buffer, DOC, false, false);
+
+         // save parsedFile for passing trecDataParser after the sync block, in
+         // case another thread will open another file in between.
+         pathTypeForDoc = currPathType;
+
+         // 2. name - required for all TREC formats
+         buffer.Length = 0;
+         Read(buffer, DOCNO, true, false);
+         int nameStart = DOCNO.Length;
+         int nameEnd = buffer.IndexOf(TERMINATING_DOCNO, nameStart);
+         docName = buffer.ToString(nameStart, nameEnd - nameStart).Trim();
+
+         if (!excludeDocnameIteration)
+         {
+             docName = docName + "_" + iteration;
+         }
+
+         // 3. read all until end of doc
+         buffer.Length = 0;
+         Read(buffer, TERMINATING_DOC, false, true);
+     }
+
+     // count char length of text to be parsed (may be larger than the resulted plain doc body text).
+     AddBytes(buffer.Length);
+
+     // This code segment relies on HtmlParser being thread safe. When we get
+     // here, everything else is already private to that thread, so we're safe.
+     DocData parsedDoc = trecDocParser.Parse(docData, docName, this, buffer, pathTypeForDoc);
+     AddItem();
+
+     return parsedDoc;
+ }
+
+ /// <summary>
+ /// Resets the base class, disposes the current reader and rewinds to the first input file and
+ /// iteration 0, all under the read lock.
+ /// </summary>
+ public override void ResetInputs()
+ {
+     lock (@lock)
+     {
+         base.ResetInputs();
+         Dispose();
+         // Assigns nextFile first, then iteration, matching the original statement order.
+         iteration = nextFile = 0;
+     }
+ }
+
+ /// <summary>
+ /// Configures the source: resolves the data directory, collects its files, instantiates the TREC
+ /// document parser and the HTML parser by type name, applies the default encoding and reads the
+ /// doc-name iteration option.
+ /// </summary>
+ /// <param name="config">Benchmark configuration to read from.</param>
+ /// <exception cref="ArgumentException">The data directory contains no files.</exception>
+ /// <exception cref="Exception">A parser type could not be resolved or instantiated.</exception>
+ public override void SetConfig(Config config)
+ {
+     base.SetConfig(config);
+     // dirs
+     DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
+     string d = config.Get("docs.dir", "trec");
+     // A relative "docs.dir" is resolved against "work.dir"; Path.Combine returns a rooted
+     // second path unchanged, so absolute values are used as they are.
+     dataDir = new DirectoryInfo(Path.Combine(workDir.FullName, d));
+     // files
+     CollectFiles(dataDir, inputFiles);
+     if (inputFiles.Count == 0)
+     {
+         throw new ArgumentException("No files in dataDir: " + dataDir);
+     }
+     // trec doc parser
+     try
+     {
+         string trecDocParserClassName = config.Get("trec.doc.parser", "Lucene.Net.Benchmarks.ByTask.Feeds.TrecGov2Parser, Lucene.Net.Benchmark");
+         trecDocParser = (TrecDocParser)Activator.CreateInstance(Type.GetType(trecDocParserClassName));
+     }
+     catch (Exception e)
+     {
+         // Should not get here. Throw runtime exception.
+         throw new Exception(e.ToString(), e);
+     }
+     // html parser
+     try
+     {
+         string htmlParserClassName = config.Get("html.parser",
+             "Lucene.Net.Benchmarks.ByTask.Feeds.DemoHTMLParser, Lucene.Net.Benchmark");
+         htmlParser = (IHTMLParser)Activator.CreateInstance(Type.GetType(htmlParserClassName));
+     }
+     catch (Exception e)
+     {
+         // Should not get here. Throw runtime exception.
+         throw new Exception(e.ToString(), e);
+     }
+     // encoding
+     if (m_encoding == null)
+     {
+         m_encoding = Encoding.GetEncoding("iso-8859-1"); //StandardCharsets.ISO_8859_1.name();
+     }
+     // iteration exclusion in doc name
+     excludeDocnameIteration = config.Get("content.source.excludeIteration", false);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecDocParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecDocParser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecDocParser.cs
new file mode 100644
index 0000000..b67a1c0
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecDocParser.cs
@@ -0,0 +1,159 @@
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Parser for trec doc content, invoked on doc text excluding &lt;DOC&gt; and &lt;DOCNO&gt;
+ /// which are handled in TrecContentSource. Required to be stateless and hence thread safe.
+ /// </summary>
+ public abstract class TrecDocParser
+ {
+     /// <summary>Types of trec parse paths.</summary>
+     public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES, UNKNOWN }
+
+     /// <summary>trec parser type used for unknown extensions</summary>
+     public static readonly ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;
+
+     /// <summary>
+     /// Parser registered for each known path type. Note that
+     /// <see cref="ParsePathType.UNKNOWN"/> has no entry.
+     /// </summary>
+     internal static readonly IDictionary<ParsePathType, TrecDocParser> pathType2parser = new Dictionary<ParsePathType, TrecDocParser>();
+
+     /// <summary>
+     /// Maps the upper-cased (invariant culture) name of every
+     /// <see cref="ParsePathType"/> value to that value.
+     /// </summary>
+     internal static readonly IDictionary<string, ParsePathType?> pathName2Type = new Dictionary<string, ParsePathType?>();
+
+     static TrecDocParser()
+     {
+         pathType2parser[ParsePathType.GOV2] = new TrecGov2Parser();
+         pathType2parser[ParsePathType.FBIS] = new TrecFBISParser();
+         pathType2parser[ParsePathType.FR94] = new TrecFR94Parser();
+         pathType2parser[ParsePathType.FT] = new TrecFTParser();
+         pathType2parser[ParsePathType.LATIMES] = new TrecLATimesParser();
+
+         foreach (ParsePathType ppt in Enum.GetValues(typeof(ParsePathType)))
+         {
+             pathName2Type[ppt.ToString().ToUpperInvariant()] = ppt;
+         }
+     }
+
+     /// <summary>max length of walk up from file to its ancestors when looking for a known path type.</summary>
+     private static readonly int MAX_PATH_LENGTH = 10;
+
+     /// <summary>
+     /// Compute the path type of a file by inspecting name of file and its parents.
+     /// </summary>
+     /// <param name="f">File whose name and ancestor directory names are matched (case-insensitively) against the path type names.</param>
+     /// <returns>The first matching path type, or <see cref="DEFAULT_PATH_TYPE"/> if no name matches within the walk limit.</returns>
+     public static ParsePathType PathType(FileInfo f)
+     {
+         int pathLength = 0;
+         ParsePathType? ppt;
+         if (pathName2Type.TryGetValue(f.Name.ToUpperInvariant(), out ppt) && ppt != null)
+         {
+             return ppt.Value;
+         }
+         // Walk up the directory names to find a match.
+         DirectoryInfo parentDir = f.Directory;
+         while (parentDir != null && ++pathLength < MAX_PATH_LENGTH)
+         {
+             if (pathName2Type.TryGetValue(parentDir.Name.ToUpperInvariant(), out ppt) && ppt != null)
+             {
+                 return ppt.Value;
+             }
+             parentDir = parentDir.Parent;
+         }
+         return DEFAULT_PATH_TYPE;
+     }
+
+     /// <summary>
+     /// Parse the text prepared in docBuf into a result DocData,
+     /// no synchronization is required.
+     /// </summary>
+     /// <param name="docData">Reusable result.</param>
+     /// <param name="name">Name that should be set to the result.</param>
+     /// <param name="trecSrc">Calling trec content source.</param>
+     /// <param name="docBuf">Text to parse.</param>
+     /// <param name="pathType">Type of parsed file, or <see cref="ParsePathType.UNKNOWN"/> if unknown - may be used by
+     /// parsers to alter their behavior according to the file path type. </param>
+     /// <returns>The parsed document data.</returns>
+     public abstract DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
+         StringBuilder docBuf, ParsePathType pathType);
+
+     /// <summary>
+     /// strip tags from <code>buf</code>: each tag is replaced by a single blank.
+     /// </summary>
+     /// <param name="buf">Text to strip.</param>
+     /// <param name="start">Index of the first character of <paramref name="buf"/> to include.</param>
+     /// <returns>Text obtained when stripping all tags from <paramref name="buf"/> (input <see cref="StringBuilder"/> is unmodified).</returns>
+     public static string StripTags(StringBuilder buf, int start)
+     {
+         // The leading part is already cut off here, so the string overload starts at 0.
+         return StripTags(buf.ToString(start, buf.Length - start), 0);
+     }
+
+     /// <summary>
+     /// Strip tags from input.
+     /// </summary>
+     /// <param name="buf">Text to strip.</param>
+     /// <param name="start">Index of the first character of <paramref name="buf"/> to include.</param>
+     /// <returns><paramref name="buf"/> from <paramref name="start"/> on, with each tag replaced by a single blank.</returns>
+     /// <seealso cref="StripTags(StringBuilder, int)"/>
+     public static string StripTags(string buf, int start)
+     {
+         if (start > 0)
+         {
+             // Previously Substring(0), which silently ignored 'start'.
+             buf = buf.Substring(start);
+         }
+         return Regex.Replace(buf, "<[^>]*>", " ");
+     }
+
+     /// <summary>
+     /// Extract from <paramref name="buf"/> the text of interest within specified tags.
+     /// </summary>
+     /// <param name="buf">Entire input text.</param>
+     /// <param name="startTag">Tag marking start of text of interest.</param>
+     /// <param name="endTag">Tag marking end of text of interest.</param>
+     /// <param name="maxPos">if ≥ 0 sets a limit on start of text of interest.</param>
+     /// <param name="noisePrefixes">Optional prefixes; when one is found between the start and end tags,
+     /// the extracted text begins right after it. May be <c>null</c>.</param>
+     /// <returns>Trimmed text of interest, or <c>null</c> if not found.</returns>
+     public static string Extract(StringBuilder buf, string startTag, string endTag, int maxPos, string[] noisePrefixes)
+     {
+         int k1 = buf.IndexOf(startTag);
+         if (k1 >= 0 && (maxPos < 0 || k1 < maxPos))
+         {
+             k1 += startTag.Length;
+             int k2 = buf.IndexOf(endTag, k1);
+             if (k2 >= 0 && (maxPos < 0 || k2 < maxPos))
+             { // found end tag with allowed range
+                 if (noisePrefixes != null)
+                 {
+                     foreach (string noise in noisePrefixes)
+                     {
+                         int k1a = buf.IndexOf(noise, k1);
+                         if (k1a >= 0 && k1a < k2)
+                         {
+                             // skip past the noise prefix
+                             k1 = k1a + noise.Length;
+                         }
+                     }
+                 }
+                 return buf.ToString(k1, k2 - k1).Trim();
+             }
+         }
+         return null;
+     }
+
+     //public static void main(String[] args) {
+     //  System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
+     //}
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFBISParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFBISParser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFBISParser.cs
new file mode 100644
index 0000000..cf321cc
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFBISParser.cs
@@ -0,0 +1,68 @@
+using Lucene.Net.Support;
+using System;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Parser for the FBIS docs in trec disks 4+5 collection format
+ /// </summary>
+ public class TrecFBISParser : TrecDocParser
+ {
+     private static readonly string HEADER = "<HEADER>";
+     private static readonly string HEADER_END = "</HEADER>";
+     private static readonly int HEADER_END_LENGTH = HEADER_END.Length;
+
+     private static readonly string DATE1 = "<DATE1>";
+     private static readonly string DATE1_END = "</DATE1>";
+
+     private static readonly string TI = "<TI>";
+     private static readonly string TI_END = "</TI>";
+
+     /// <summary>
+     /// Fills <paramref name="docData"/> from an FBIS document: the date and title are
+     /// extracted from the header section, and the body is everything after the
+     /// header with tags stripped.
+     /// </summary>
+     /// <param name="docData">Reusable result; cleared before being populated.</param>
+     /// <param name="name">Name assigned to the result.</param>
+     /// <param name="trecSrc">Calling content source, used to parse the date string.</param>
+     /// <param name="docBuf">Text to parse.</param>
+     /// <param name="pathType">Not used by this parser.</param>
+     /// <returns><paramref name="docData"/>.</returns>
+     public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
+         StringBuilder docBuf, ParsePathType pathType)
+     {
+         int mark = 0; // that much is skipped
+         // optionally skip some of the text, set date, title
+         DateTime? date = null;
+         string title = null;
+         int h1 = docBuf.IndexOf(HEADER);
+         if (h1 >= 0)
+         {
+             int h2 = docBuf.IndexOf(HEADER_END, h1);
+             if (h2 >= 0)
+             {
+                 // Only skip the header when it is actually terminated; otherwise
+                 // h2 is -1 and an arbitrary prefix of the document would be dropped.
+                 mark = h2 + HEADER_END_LENGTH;
+             }
+             // date... (a negative h2 means "no limit" to Extract)
+             string dateStr = Extract(docBuf, DATE1, DATE1_END, h2, null);
+             if (dateStr != null)
+             {
+                 date = trecSrc.ParseDate(dateStr);
+             }
+             // title...
+             title = Extract(docBuf, TI, TI_END, h2, null);
+         }
+         docData.Clear();
+         docData.Name = name;
+         docData.SetDate(date);
+         docData.Title = title;
+         docData.Body = StripTags(docBuf, mark);
+         return docData;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFR94Parser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFR94Parser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFR94Parser.cs
new file mode 100644
index 0000000..72f99bb
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFR94Parser.cs
@@ -0,0 +1,69 @@
+using Lucene.Net.Support;
+using System;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Parser for the FR94 docs in trec disks 4+5 collection format
+ /// </summary>
+ public class TrecFR94Parser : TrecDocParser
+ {
+     private static readonly string TEXT = "<TEXT>";
+     private static readonly int TEXT_LENGTH = TEXT.Length;
+     private static readonly string TEXT_END = "</TEXT>";
+
+     private static readonly string DATE = "<DATE>";
+     private static readonly string[] DATE_NOISE_PREFIXES = {
+         "DATE:",
+         "date:", //TODO improve date extraction for this format
+         "t.c.",
+     };
+     private static readonly string DATE_END = "</DATE>";
+
+     //TODO can we also extract title for this format?
+
+     /// <summary>
+     /// Fills <paramref name="docData"/> from an FR94 document. When a TEXT tag is
+     /// present, the body starts right after it and a date is looked up, limited by
+     /// the position of the closing TEXT tag. No title is set.
+     /// </summary>
+     /// <param name="docData">Reusable result; cleared before being populated.</param>
+     /// <param name="name">Name assigned to the result.</param>
+     /// <param name="trecSrc">Calling content source, used to parse the date string.</param>
+     /// <param name="docBuf">Text to parse.</param>
+     /// <param name="pathType">Not used by this parser.</param>
+     /// <returns><paramref name="docData"/>.</returns>
+     public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
+         StringBuilder docBuf, ParsePathType pathType)
+     {
+         DateTime? parsedDate = null;
+         int bodyStart = 0; // number of leading characters left out of the body
+         int textPos = docBuf.IndexOf(TEXT);
+         if (textPos >= 0)
+         {
+             int textEndPos = docBuf.IndexOf(TEXT_END, textPos);
+             bodyStart = textPos + TEXT_LENGTH;
+             string rawDate = Extract(docBuf, DATE, DATE_END, textEndPos, DATE_NOISE_PREFIXES);
+             if (rawDate != null)
+             {
+                 // the date text may itself contain tags
+                 string cleanDate = StripTags(rawDate, 0);
+                 parsedDate = trecSrc.ParseDate(cleanDate.Trim());
+             }
+         }
+
+         docData.Clear();
+         docData.Name = name;
+         docData.SetDate(parsedDate);
+         docData.Body = StripTags(docBuf, bodyStart);
+         return docData;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFTParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFTParser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFTParser.cs
new file mode 100644
index 0000000..189f6cb
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFTParser.cs
@@ -0,0 +1,58 @@
+using System;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Parser for the FT docs in trec disks 4+5 collection format
+ /// </summary>
+ public class TrecFTParser : TrecDocParser
+ {
+     private static readonly string DATE = "<DATE>";
+     private static readonly string DATE_END = "</DATE>";
+
+     private static readonly string HEADLINE = "<HEADLINE>";
+     private static readonly string HEADLINE_END = "</HEADLINE>";
+
+     /// <summary>
+     /// Fills <paramref name="docData"/> from an FT document: the date comes from the
+     /// DATE element, the title from the HEADLINE element, and the body is the whole
+     /// buffer with tags stripped.
+     /// </summary>
+     /// <param name="docData">Reusable result; cleared before being populated.</param>
+     /// <param name="name">Name assigned to the result.</param>
+     /// <param name="trecSrc">Calling content source, used to parse the date string.</param>
+     /// <param name="docBuf">Text to parse.</param>
+     /// <param name="pathType">Not used by this parser.</param>
+     /// <returns><paramref name="docData"/>.</returns>
+     public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
+         StringBuilder docBuf, ParsePathType pathType)
+     {
+         // date: searched over the whole buffer (no position limit)
+         string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
+         DateTime? parsedDate = null;
+         if (rawDate != null)
+         {
+             parsedDate = trecSrc.ParseDate(rawDate);
+         }
+
+         // title
+         string headline = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
+
+         docData.Clear();
+         docData.Name = name;
+         docData.SetDate(parsedDate);
+         docData.Title = headline;
+         // nothing is skipped: the body starts at offset 0
+         docData.Body = StripTags(docBuf, 0);
+         return docData;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecGov2Parser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecGov2Parser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecGov2Parser.cs
new file mode 100644
index 0000000..12912e9
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecGov2Parser.cs
@@ -0,0 +1,57 @@
+using Lucene.Net.Support;
+using System;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Parser for the GOV2 collection format
+ /// </summary>
+ public class TrecGov2Parser : TrecDocParser
+ {
+     private static readonly string DATE = "Date: ";
+     private static readonly string DATE_END = TrecContentSource.NEW_LINE;
+
+     private static readonly string DOCHDR = "<DOCHDR>";
+     private static readonly string TERMINATING_DOCHDR = "</DOCHDR>";
+
+     /// <summary>
+     /// Parses a GOV2 document: an optional date is read from the DOCHDR section,
+     /// and the text after that section is handed to the content source's HTML parser.
+     /// </summary>
+     /// <param name="docData">Reusable result, passed through to the HTML parser.</param>
+     /// <param name="name">Name passed through to the HTML parser.</param>
+     /// <param name="trecSrc">Calling content source; supplies date parsing and the HTML parser.</param>
+     /// <param name="docBuf">Text to parse.</param>
+     /// <param name="pathType">Not used by this parser.</param>
+     /// <returns>The result of the HTML parser.</returns>
+     public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
+         StringBuilder docBuf, ParsePathType pathType)
+     {
+         // skip some of the non-html text, optionally set date
+         DateTime? date = null;
+         int start = 0;
+         int h1 = docBuf.IndexOf(DOCHDR);
+         if (h1 >= 0)
+         {
+             int h2 = docBuf.IndexOf(TERMINATING_DOCHDR, h1);
+             // a negative h2 means "no limit" to Extract
+             string dateStr = Extract(docBuf, DATE, DATE_END, h2, null);
+             if (dateStr != null)
+             {
+                 date = trecSrc.ParseDate(dateStr);
+             }
+             if (h2 >= 0)
+             {
+                 // Only skip the header when it is actually terminated; otherwise
+                 // h2 is -1 and an arbitrary prefix of the document would be dropped.
+                 start = h2 + TERMINATING_DOCHDR.Length;
+             }
+         }
+         string html = docBuf.ToString(start, docBuf.Length - start);
+         return trecSrc.HtmlParser.Parse(docData, name, date, new StringReader(html), trecSrc);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecLATimesParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecLATimesParser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecLATimesParser.cs
new file mode 100644
index 0000000..e54f635
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecLATimesParser.cs
@@ -0,0 +1,75 @@
+using System;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Parser for the LA Times docs in trec disks 4+5 collection format
+ /// </summary>
+ public class TrecLATimesParser : TrecDocParser
+ {
+     private static readonly string DATE = "<DATE>";
+     private static readonly string DATE_END = "</DATE>";
+     private static readonly string DATE_NOISE = "day,"; // anything after the ','
+
+     private static readonly string SUBJECT = "<SUBJECT>";
+     private static readonly string SUBJECT_END = "</SUBJECT>";
+     private static readonly string HEADLINE = "<HEADLINE>";
+     private static readonly string HEADLINE_END = "</HEADLINE>";
+
+     /// <summary>
+     /// Fills <paramref name="docData"/> from an LA Times document: the date comes from
+     /// the DATE element (cut right after the day name when one is present), the title
+     /// from SUBJECT or else HEADLINE, and the body is the whole buffer with tags stripped.
+     /// </summary>
+     /// <param name="docData">Reusable result; cleared before being populated.</param>
+     /// <param name="name">Name assigned to the result.</param>
+     /// <param name="trecSrc">Calling content source, used to parse the date string.</param>
+     /// <param name="docBuf">Text to parse.</param>
+     /// <param name="pathType">Not used by this parser.</param>
+     /// <returns><paramref name="docData"/>.</returns>
+     public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
+         StringBuilder docBuf, ParsePathType pathType)
+     {
+         int mark = 0; // that much is skipped
+
+         // date...
+         DateTime? date = null;
+         string dateStr = Extract(docBuf, DATE, DATE_END, -1, null);
+         if (dateStr != null)
+         {
+             // Ordinal search: this is a fixed marker in machine-formatted text, so the
+             // result must not depend on the current thread culture.
+             int d2a = dateStr.IndexOf(DATE_NOISE, StringComparison.Ordinal);
+             if (d2a > 0)
+             {
+                 dateStr = dateStr.Substring(0, d2a + 3); // we need the "day" part
+             }
+             dateStr = StripTags(dateStr, 0);
+             date = trecSrc.ParseDate(dateStr.Trim());
+         }
+
+         // title... first try with SUBJECT, then with HEADLINE
+         string title = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
+         if (title == null)
+         {
+             title = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
+         }
+         if (title != null)
+         {
+             title = StripTags(title, 0).Trim();
+         }
+
+         docData.Clear();
+         docData.Name = name;
+         docData.SetDate(date);
+         docData.Title = title;
+         docData.Body = StripTags(docBuf, mark);
+         return docData;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecParserByPath.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecParserByPath.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecParserByPath.cs
new file mode 100644
index 0000000..45a72b4
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecParserByPath.cs
@@ -0,0 +1,34 @@
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Parser for trec docs which selects the parser to apply according
+ /// to the source files path, defaulting to <see cref="TrecGov2Parser"/>.
+ /// </summary>
+ public class TrecParserByPath : TrecDocParser
+ {
+     /// <summary>
+     /// Delegates to the parser registered for <paramref name="pathType"/>. Path types
+     /// without a registered parser (such as <see cref="TrecDocParser.ParsePathType.UNKNOWN"/>)
+     /// are handled by the parser for <see cref="TrecDocParser.DEFAULT_PATH_TYPE"/>.
+     /// </summary>
+     /// <param name="docData">Reusable result, passed through to the selected parser.</param>
+     /// <param name="name">Name passed through to the selected parser.</param>
+     /// <param name="trecSrc">Calling content source, passed through.</param>
+     /// <param name="docBuf">Text to parse.</param>
+     /// <param name="pathType">Path type used to select the parser; also passed through.</param>
+     /// <returns>The result of the selected parser.</returns>
+     public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
+         StringBuilder docBuf, ParsePathType pathType)
+     {
+         TrecDocParser parser;
+         if (!pathType2parser.TryGetValue(pathType, out parser) || parser == null)
+         {
+             // Previously the indexer threw KeyNotFoundException for UNKNOWN.
+             parser = pathType2parser[DEFAULT_PATH_TYPE];
+         }
+         return parser.Parse(docData, name, trecSrc, docBuf, pathType);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs b/src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs
new file mode 100644
index 0000000..e5b334c
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs
@@ -0,0 +1,490 @@
+using Lucene.Net.Analysis;
+using Lucene.Net.Benchmarks.ByTask.Feeds;
+using Lucene.Net.Benchmarks.ByTask.Stats;
+using Lucene.Net.Benchmarks.ByTask.Tasks;
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Facet.Taxonomy;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+
+namespace Lucene.Net.Benchmarks.ByTask
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Data maintained by a performance test run.
+ /// </summary>
+ /// <remarks>
+ /// Data includes:
+ /// <list type="bullet">
+ /// <item><description>Configuration.</description></item>
+ /// <item><description>Directory, Writer, Reader.</description></item>
+ /// <item><description>Taxonomy Directory, Writer, Reader.</description></item>
+ /// <item><description>DocMaker, FacetSource and a few instances of QueryMaker.</description></item>
+ /// <item><description>Named AnalysisFactories.</description></item>
+ /// <item><description>Analyzer.</description></item>
+ /// <item><description>Statistics data which updated during the run.</description></item>
+ /// </list>
+ /// <para/>
+ /// Config properties:
+ /// <list type="bullet">
+ /// <item><term>work.dir</term><description>&lt;path to root of docs and index dirs| Default: work&gt;</description></item>
+ /// <item><term>analyzer</term><description>&lt;class name for analyzer| Default: StandardAnalyzer&gt;</description></item>
+ /// <item><term>doc.maker</term><description>&lt;class name for doc-maker| Default: DocMaker&gt;</description></item>
+ /// <item><term>facet.source</term><description>&lt;class name for facet-source| Default: RandomFacetSource&gt;</description></item>
+ /// <item><term>query.maker</term><description>&lt;class name for query-maker| Default: SimpleQueryMaker&gt;</description></item>
+ /// <item><term>log.queries</term><description>&lt;whether queries should be printed| Default: false&gt;</description></item>
+ /// <item><term>directory</term><description>&lt;type of directory to use for the index| Default: RAMDirectory&gt;</description></item>
+ /// <item><term>taxonomy.directory</term><description>&lt;type of directory for taxonomy index| Default: RAMDirectory&gt;</description></item>
+ /// </list>
+ /// </remarks>
+ public class PerfRunData : IDisposable
+ {
+ private Points points;
+
+ // objects used during performance test run
+ // directory, analyzer, docMaker - created at startup.
+ // reader, writer, searcher - maintained by basic tasks.
+ private Store.Directory directory;
+ private IDictionary<string, AnalyzerFactory> analyzerFactories = new Dictionary<string, AnalyzerFactory>();
+ private Analyzer analyzer;
+ private DocMaker docMaker;
+ private ContentSource contentSource;
+ private FacetSource facetSource;
+ private CultureInfo locale;
+
+ private Store.Directory taxonomyDir;
+ private ITaxonomyWriter taxonomyWriter;
+ private TaxonomyReader taxonomyReader;
+
+ // we use separate (identical) instances for each "read" task type, so each can iterate the queries separately.
+ private IDictionary<Type, IQueryMaker> readTaskQueryMaker;
+ private Type qmkrClass;
+
+ private DirectoryReader indexReader;
+ private IndexSearcher indexSearcher;
+ private IndexWriter indexWriter;
+ private Config config;
+ private long startTimeMillis;
+
+ private readonly IDictionary<string, object> perfObjects = new Dictionary<string, object>();
+
+ // constructor
+ /// <summary>
+ /// Builds the run data from <paramref name="config"/>: creates the analyzer,
+ /// content source, doc maker and facet source named in the configuration,
+ /// records the query maker type, initializes the index state via
+ /// <see cref="Reinit(bool)"/> and creates the statistics points. When
+ /// <c>log.queries</c> is true the queries are printed.
+ /// </summary>
+ /// <param name="config">Configuration of this run.</param>
+ public PerfRunData(Config config)
+ {
+     this.config = config;
+
+     // analyzer (default is standard analyzer)
+     string analyzerName = config.Get("analyzer",
+         "Lucene.Net.Analysis.Standard.StandardAnalyzer, Lucene.Net.Analysis.Common");
+     this.analyzer = NewAnalyzerTask.CreateAnalyzer(analyzerName);
+
+     // content source
+     string contentSourceName = config.Get("content.source", typeof(SingleDocSource).AssemblyQualifiedName);
+     Type contentSourceType = Type.GetType(contentSourceName);
+     this.contentSource = (ContentSource)Activator.CreateInstance(contentSourceType);
+     this.contentSource.SetConfig(config);
+
+     // doc maker
+     string docMakerName = config.Get("doc.maker", typeof(DocMaker).AssemblyQualifiedName);
+     this.docMaker = (DocMaker)Activator.CreateInstance(Type.GetType(docMakerName));
+     this.docMaker.SetConfig(config, this.contentSource);
+
+     // facet source
+     string facetSourceName = config.Get("facet.source", typeof(RandomFacetSource).AssemblyQualifiedName);
+     this.facetSource = (FacetSource)Activator.CreateInstance(Type.GetType(facetSourceName));
+     this.facetSource.SetConfig(config);
+
+     // query makers: instances are created lazily, only the type is resolved here
+     this.readTaskQueryMaker = new Dictionary<Type, IQueryMaker>();
+     string queryMakerName = config.Get("query.maker", typeof(SimpleQueryMaker).AssemblyQualifiedName);
+     this.qmkrClass = Type.GetType(queryMakerName);
+
+     // index stuff
+     Reinit(false);
+
+     // statistic points
+     this.points = new Points(config);
+
+     bool logQueries = bool.Parse(config.Get("log.queries", "false"));
+     if (logQueries)
+     {
+         SystemConsole.WriteLine("------------> queries:");
+         SystemConsole.WriteLine(GetQueryMaker(new SearchTask(this)).PrintQueries());
+     }
+ }
+
+ /// <summary>
+ /// Disposes the resources held by this instance and suppresses finalization.
+ /// </summary>
+ public void Dispose()
+ {
+     this.Dispose(true);
+     GC.SuppressFinalize(this);
+ }
+
+ /// <summary>
+ /// When <paramref name="disposing"/> is true, disposes the index and taxonomy
+ /// writers, readers and directories, the doc maker, facet source and content
+ /// source, and then every stored perf object that implements <see cref="IDisposable"/>.
+ /// </summary>
+ /// <param name="disposing">True to release the resources; false does nothing.</param>
+ protected virtual void Dispose(bool disposing)
+ {
+     if (!disposing)
+     {
+         return;
+     }
+
+     IOUtils.Dispose(indexWriter, indexReader, directory,
+         taxonomyWriter, taxonomyReader, taxonomyDir,
+         docMaker, facetSource, contentSource);
+
+     // collect, then close, all perf objects that are disposable
+     List<IDisposable> disposables = new List<IDisposable>();
+     foreach (object candidate in perfObjects.Values)
+     {
+         IDisposable disposable = candidate as IDisposable;
+         if (disposable != null)
+         {
+             disposables.Add(disposable);
+         }
+     }
+     IOUtils.Dispose(disposables);
+ }
+
+ // clean old stuff, reopen
+ public virtual void Reinit(bool eraseIndex)
+ {
+     // Close the index writer/reader/directory and forget the writer and reader.
+     IOUtils.Dispose(this.indexWriter, this.indexReader, this.directory);
+     this.indexWriter = null;
+     this.indexReader = null;
+
+     // Same for the taxonomy side.
+     IOUtils.Dispose(this.taxonomyWriter, this.taxonomyReader, this.taxonomyDir);
+     this.taxonomyWriter = null;
+     this.taxonomyReader = null;
+
+     // Fresh directories (default is ram-dir).
+     this.directory = CreateDirectory(eraseIndex, "index", "directory");
+     this.taxonomyDir = CreateDirectory(eraseIndex, "taxo", "taxonomy.directory");
+
+     // Rewind all inputs.
+     ResetInputs();
+
+     // Release unused stuff.
+     GC.Collect();
+
+     // Restart the clock.
+     SetStartTimeMillis();
+ }
+
+ /// <summary>
+ /// Creates the directory selected by the <paramref name="dirParam"/> setting:
+ /// an <see cref="FSDirectory"/> under <c>work.dir</c> when the value is
+ /// "FSDirectory", otherwise a new <see cref="RAMDirectory"/>.
+ /// </summary>
+ /// <param name="eraseIndex">Whether an existing file system directory is deleted first.</param>
+ /// <param name="dirName">Sub-directory name below <c>work.dir</c>.</param>
+ /// <param name="dirParam">Name of the configuration property holding the directory type.</param>
+ private Store.Directory CreateDirectory(bool eraseIndex, string dirName,
+     string dirParam)
+ {
+     string dirType = config.Get(dirParam, "RAMDirectory");
+     if (!"FSDirectory".Equals(dirType, StringComparison.Ordinal))
+     {
+         return new RAMDirectory();
+     }
+
+     DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
+     string indexPath = System.IO.Path.Combine(workDir.FullName, dirName);
+     DirectoryInfo indexDir = new DirectoryInfo(indexPath);
+     if (eraseIndex && indexDir.Exists)
+     {
+         FileUtils.FullyDelete(indexDir);
+     }
+     indexDir.Create();
+     return FSDirectory.Open(indexDir);
+ }
+
+ /// <summary>
+ /// Returns an object that was previously set by <see cref="SetPerfObject(string, object)"/>.
+ /// </summary>
+ public virtual object GetPerfObject(string key)
+ {
+     lock (this)
+     {
+         // null when nothing is stored under the key
+         object stored;
+         return this.perfObjects.TryGetValue(key, out stored) ? stored : null;
+     }
+ }
+
+ /// <summary>
+ /// Sets an object that is required by <see cref="PerfTask"/>s, keyed by the given
+ /// <paramref name="key"/>. If the object implements <see cref="IDisposable"/>, it will be disposed
+ /// by <see cref="Dispose()"/>.
+ /// </summary>
+ public virtual void SetPerfObject(string key, object obj)
+ {
+     lock (this)
+     {
+         // adds the entry or replaces an existing one
+         this.perfObjects[key] = obj;
+     }
+ }
+
+ /// <summary>
+ /// Sets the start time to the current time in milliseconds.
+ /// </summary>
+ /// <returns>The new start time.</returns>
+ public virtual long SetStartTimeMillis()
+ {
+     this.startTimeMillis = Support.Time.CurrentTimeMilliseconds();
+     return this.startTimeMillis;
+ }
+
+ /// <summary>
+ /// Gets start time in milliseconds.
+ /// </summary>
+ public virtual long StartTimeMillis
+ {
+     get { return this.startTimeMillis; }
+ }
+
+ /// <summary>
+ /// Gets the points.
+ /// </summary>
+ public virtual Points Points
+ {
+     get { return this.points; }
+ }
+
+ /// <summary>
+ /// Gets or sets the directory.
+ /// </summary>
+ public virtual Store.Directory Directory
+ {
+     get { return this.directory; }
+     set { this.directory = value; }
+ }
+
+ /// <summary>
+ /// Gets the taxonomy directory.
+ /// </summary>
+ public virtual Store.Directory TaxonomyDir
+ {
+     get { return this.taxonomyDir; }
+ }
+
+ /// <summary>
+ /// Set the taxonomy reader. Takes ownership of that taxonomy reader, that is,
+ /// internally performs taxoReader.IncRef() (If caller no longer needs that
+ /// reader it should DecRef()/Dispose() it after calling this method, otherwise,
+ /// the reader will remain open).
+ /// </summary>
+ /// <param name="taxoReader">The taxonomy reader to set.</param>
+ public virtual void SetTaxonomyReader(TaxonomyReader taxoReader)
+ {
+     lock (this)
+     {
+         // Setting the same reader again is a no-op.
+         if (this.taxonomyReader == taxoReader)
+         {
+             return;
+         }
+
+         // Release the reference held on the previous reader.
+         if (this.taxonomyReader != null)
+         {
+             this.taxonomyReader.DecRef();
+         }
+
+         // Hold a reference on the new reader.
+         if (taxoReader != null)
+         {
+             taxoReader.IncRef();
+         }
+
+         this.taxonomyReader = taxoReader;
+     }
+ }
+
+ /// <summary>
+ /// Returns the taxonomyReader. NOTE: this returns a
+ /// reference. You must call TaxonomyReader.DecRef() when
+ /// you're done.
+ /// </summary>
+ public virtual TaxonomyReader GetTaxonomyReader()
+ {
+     lock (this)
+     {
+         TaxonomyReader current = this.taxonomyReader;
+         if (current != null)
+         {
+             // the caller receives its own reference
+             current.IncRef();
+         }
+         return current;
+     }
+ }
+
+ /// <summary>
+ /// Gets or sets the taxonomy writer.
+ /// </summary>
+ public virtual ITaxonomyWriter TaxonomyWriter
+ {
+     get { return this.taxonomyWriter; }
+     set { this.taxonomyWriter = value; }
+ }
+
+ /// <summary>
+ /// Returns the indexReader. NOTE: this returns a
+ /// reference. You must call IndexReader.DecRef() when
+ /// you're done.
+ /// </summary>
+ public virtual DirectoryReader GetIndexReader()
+ {
+     lock (this)
+     {
+         DirectoryReader current = this.indexReader;
+         if (current != null)
+         {
+             // the caller receives its own reference
+             current.IncRef();
+         }
+         return current;
+     }
+ }
+
+ /// <summary>
+ /// Returns the indexSearcher. NOTE: this returns
+ /// a reference to the underlying IndexReader. You must
+ /// call IndexReader.DecRef() when you're done.
+ /// </summary>
+ /// <returns></returns>
+ public virtual IndexSearcher GetIndexSearcher()
+ {
+     lock (this)
+     {
+         // The reference is taken on the reader, not on the searcher.
+         if (this.indexReader != null)
+         {
+             this.indexReader.IncRef();
+         }
+         return this.indexSearcher;
+     }
+ }
+
+ /// <summary>
+ /// Set the index reader. Takes ownership of that index reader, that is,
+ /// internally performs indexReader.incRef() (If caller no longer needs that
+ /// reader it should decRef()/close() it after calling this method, otherwise,
+ /// the reader will remain open).
+ /// </summary>
+ /// <param name="indexReader">The indexReader to set.</param>
+ public virtual void SetIndexReader(DirectoryReader indexReader)
+ {
+     lock (this)
+     {
+         // Setting the same reader again is a no-op.
+         if (this.indexReader == indexReader)
+         {
+             return;
+         }
+
+         // Release current IR
+         if (this.indexReader != null)
+         {
+             this.indexReader.DecRef();
+         }
+
+         this.indexReader = indexReader;
+
+         if (indexReader == null)
+         {
+             // no reader, hence no searcher
+             this.indexSearcher = null;
+             return;
+         }
+
+         // Hold reference to new IR and search over it
+         indexReader.IncRef();
+         this.indexSearcher = new IndexSearcher(indexReader);
+     }
+ }
+
+ /// <summary>
+ /// Gets or sets the indexWriter.
+ /// </summary>
+ public virtual IndexWriter IndexWriter
+ {
+     get { return this.indexWriter; }
+     set { this.indexWriter = value; }
+ }
+
+ /// <summary>
+ /// Gets or sets the analyzer.
+ /// </summary>
+ public virtual Analyzer Analyzer
+ {
+     get { return this.analyzer; }
+     set { this.analyzer = value; }
+ }
+
+ /// <summary>Gets the <see cref="Feeds.ContentSource"/>.</summary>
+ public virtual ContentSource ContentSource
+ {
+     get { return this.contentSource; }
+ }
+
+ /// <summary>Returns the <see cref="Feeds.DocMaker"/>.</summary>
+ public virtual DocMaker DocMaker
+ {
+     get { return this.docMaker; }
+ }
+
+ /// <summary>Gets the <see cref="Feeds.FacetSource"/>.</summary>
+ public virtual FacetSource FacetSource
+ {
+     get { return this.facetSource; }
+ }
+
+ /// <summary>
+ /// Gets or sets the culture.
+ /// </summary>
+ public virtual CultureInfo Locale // LUCENENET TODO: API Is this really needed since we have on the thread already?
+ {
+     get { return this.locale; }
+     set { this.locale = value; }
+ }
+
+ /// <summary>
+ /// Gets the config.
+ /// </summary>
+ public virtual Config Config
+ {
+     get { return this.config; }
+ }
+
+ /// <summary>
+ /// Resets the inputs of the content source, the doc maker, the facet source
+ /// and every query maker created so far, in that order.
+ /// </summary>
+ public virtual void ResetInputs()
+ {
+     this.contentSource.ResetInputs();
+     this.docMaker.ResetInputs();
+     this.facetSource.ResetInputs();
+
+     foreach (IQueryMaker maker in this.readTaskQueryMaker.Values)
+     {
+         maker.ResetInputs();
+     }
+ }
+
+ /// <summary>
+ /// Returns the queryMaker by read task type (class).
+ /// </summary>
+ public virtual IQueryMaker GetQueryMaker(ReadTask readTask)
+ {
+     lock (this)
+     {
+         // Query makers are keyed by the concrete task type, so new search/read
+         // tasks can be added without modifying this class.
+         Type taskType = readTask.GetType();
+
+         IQueryMaker cached;
+         if (readTaskQueryMaker.TryGetValue(taskType, out cached) && cached != null)
+         {
+             return cached;
+         }
+
+         // First request for this task type: create and configure a new instance.
+         IQueryMaker created;
+         try
+         {
+             created = (IQueryMaker)Activator.CreateInstance(qmkrClass);
+             created.SetConfig(config);
+         }
+         catch (Exception e)
+         {
+             throw new Exception(e.ToString(), e);
+         }
+
+         readTaskQueryMaker[taskType] = created;
+         return created;
+     }
+ }
+
+ /// <summary>
+ /// Gets the analyzer factories, keyed by string.
+ /// </summary>
+ public virtual IDictionary<string, AnalyzerFactory> AnalyzerFactories
+ {
+     get { return this.analyzerFactories; }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Programmatic/Sample.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Programmatic/Sample.cs b/src/Lucene.Net.Benchmark/ByTask/Programmatic/Sample.cs
new file mode 100644
index 0000000..6b248f6
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Programmatic/Sample.cs
@@ -0,0 +1,90 @@
+using Lucene.Net.Benchmarks.ByTask.Tasks;
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Support;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Benchmarks.ByTask.Programmatic
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Sample performance test written programmatically - no algorithm file is needed here.
+ /// </summary>
+ public class Sample
+ {
+     /// <summary>
+     /// Builds a task tree in code (create index, a sequence that adds a document
+     /// 500 times, close index, report), prints it and then executes it.
+     /// </summary>
+     /// <param name="args">Command line arguments; not used.</param>
+     public static void Main(string[] args)
+     {
+         var p = InitProps();
+         Config conf = new Config(p);
+         PerfRunData runData = new PerfRunData(conf);
+
+         // 1. top sequence
+         TaskSequence top = new TaskSequence(runData, null, null, false); // top level, not parallel
+
+         // 2. task to create the index
+         CreateIndexTask create = new CreateIndexTask(runData);
+         top.AddTask(create);
+
+         // 3. task seq to add 500 docs (order matters - top to bottom - add seq to top, only then add to seq)
+         TaskSequence seq1 = new TaskSequence(runData, "AddDocs", top, false);
+         seq1.SetRepetitions(500);
+         seq1.SetNoChildReport();
+         top.AddTask(seq1);
+
+         // 4. task to add the doc
+         AddDocTask addDoc = new AddDocTask(runData);
+         //addDoc.setParams("1200"); // doc size limit if supported
+         seq1.AddTask(addDoc); // order matters (see comment above)
+
+         // 5. task to close the index
+         CloseIndexTask close = new CloseIndexTask(runData);
+         top.AddTask(close);
+
+         // task to report
+         RepSumByNameTask rep = new RepSumByNameTask(runData);
+         top.AddTask(rep);
+
+         // print algorithm
+         SystemConsole.WriteLine(top.ToString());
+
+         // execute
+         top.DoLogic();
+     }
+
+     /// <summary>
+     /// Sample programmatic settings. Could also read from file.
+     /// </summary>
+     /// <returns>A new dictionary holding the benchmark properties used by <see cref="Main"/>.</returns>
+     private static IDictionary<string, string> InitProps()
+     {
+         return new Dictionary<string, string>
+         {
+             { "task.max.depth.log", "3" },
+             { "max.buffered", "buf:10:10:100:100:10:10:100:100" },
+             // ReutersContentSource is a content source, so it is registered under
+             // "content.source"; the "doc.maker" key is reserved for DocMaker types.
+             { "content.source", "Lucene.Net.Benchmarks.ByTask.Feeds.ReutersContentSource, Lucene.Net.Benchmark" },
+             { "log.step", "2000" },
+             { "doc.delete.step", "8" },
+             { "analyzer", "Lucene.Net.Analysis.Standard.StandardAnalyzer, Lucene.Net.Analysis.Common" },
+             { "doc.term.vector", "false" },
+             { "directory", "FSDirectory" },
+             { "query.maker", "Lucene.Net.Benchmarks.ByTask.Feeds.ReutersQueryMaker, Lucene.Net.Benchmark" },
+             { "doc.stored", "true" },
+             { "docs.dir", "reuters-out" },
+             { "compound", "cmpnd:true:true:true:true:false:false:false:false" },
+             { "doc.tokenized", "true" },
+             { "merge.factor", "mrg:10:100:10:100:10:100:10:100" },
+         };
+     }
+ }
+}