You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/08/06 17:59:10 UTC
[12/33] lucenenet git commit: Ported Lucene.Net.Benchmark + tests
Ported Lucene.Net.Benchmark + tests
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/b515271d
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/b515271d
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/b515271d
Branch: refs/heads/master
Commit: b515271d8821dde3cd980beae780d204fd6b0e5c
Parents: 1e52293
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Mon Jul 31 14:26:48 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Wed Aug 2 09:54:52 2017 +0700
----------------------------------------------------------------------
Lucene.Net.sln | 52 +
src/Lucene.Net.Benchmark/ByTask/Benchmark.cs | 170 +++
.../ByTask/Feeds/AbstractQueryMaker.cs | 85 ++
.../ByTask/Feeds/ContentItemsSource.cs | 227 ++++
.../ByTask/Feeds/ContentSource.cs | 38 +
.../ByTask/Feeds/DemoHTMLParser.cs | 259 ++++
.../ByTask/Feeds/DirContentSource.cs | 259 ++++
.../ByTask/Feeds/DocData.cs | 73 ++
.../ByTask/Feeds/DocMaker.cs | 511 ++++++++
.../ByTask/Feeds/EnwikiContentSource.cs | 394 ++++++
.../ByTask/Feeds/EnwikiQueryMaker.cs | 146 +++
.../ByTask/Feeds/FacetSource.cs | 47 +
.../ByTask/Feeds/FileBasedQueryMaker.cs | 121 ++
.../ByTask/Feeds/GeonamesLineParser.cs | 53 +
.../ByTask/Feeds/HTMLParser.cs | 42 +
.../ByTask/Feeds/LineDocSource.cs | 328 +++++
.../ByTask/Feeds/LongToEnglishContentSource.cs | 72 ++
.../ByTask/Feeds/LongToEnglishQueryMaker.cs | 89 ++
.../ByTask/Feeds/NoMoreDataException.cs | 50 +
.../ByTask/Feeds/QueryMaker.cs | 48 +
.../ByTask/Feeds/RandomFacetSource.cs | 109 ++
.../ByTask/Feeds/ReutersContentSource.cs | 140 +++
.../ByTask/Feeds/ReutersQueryMaker.cs | 126 ++
.../ByTask/Feeds/SimpleQueryMaker.cs | 70 ++
.../Feeds/SimpleSloppyPhraseQueryMaker.cs | 88 ++
.../ByTask/Feeds/SingleDocSource.cs | 77 ++
.../ByTask/Feeds/SortableSingleDocSource.cs | 114 ++
.../ByTask/Feeds/SpatialDocMaker.cs | 249 ++++
.../ByTask/Feeds/SpatialFileQueryMaker.cs | 131 ++
.../ByTask/Feeds/TrecContentSource.cs | 350 ++++++
.../ByTask/Feeds/TrecDocParser.cs | 159 +++
.../ByTask/Feeds/TrecFBISParser.cs | 68 +
.../ByTask/Feeds/TrecFR94Parser.cs | 69 +
.../ByTask/Feeds/TrecFTParser.cs | 58 +
.../ByTask/Feeds/TrecGov2Parser.cs | 57 +
.../ByTask/Feeds/TrecLATimesParser.cs | 75 ++
.../ByTask/Feeds/TrecParserByPath.cs | 34 +
src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs | 490 ++++++++
.../ByTask/Programmatic/Sample.cs | 90 ++
src/Lucene.Net.Benchmark/ByTask/Stats/Points.cs | 108 ++
src/Lucene.Net.Benchmark/ByTask/Stats/Report.cs | 70 ++
.../ByTask/Stats/TaskStats.cs | 237 ++++
.../ByTask/Tasks/AddDocTask.cs | 93 ++
.../ByTask/Tasks/AddFacetedDocTask.cs | 95 ++
.../ByTask/Tasks/AddIndexesTask.cs | 104 ++
.../ByTask/Tasks/AnalyzerFactoryTask.cs | 580 +++++++++
.../ByTask/Tasks/BenchmarkHighlighter.cs | 32 +
.../ByTask/Tasks/ClearStatsTask.cs | 44 +
.../ByTask/Tasks/CloseIndexTask.cs | 67 +
.../ByTask/Tasks/CloseReaderTask.cs | 49 +
.../ByTask/Tasks/CloseTaxonomyIndexTask.cs | 42 +
.../ByTask/Tasks/CloseTaxonomyReaderTask.cs | 47 +
.../ByTask/Tasks/CommitIndexTask.cs | 62 +
.../ByTask/Tasks/CommitTaxonomyIndexTask.cs | 48 +
.../ByTask/Tasks/ConsumeContentSourceTask.cs | 48 +
.../ByTask/Tasks/CreateIndexTask.cs | 225 ++++
.../ByTask/Tasks/CreateTaxonomyIndexTask.cs | 42 +
.../ByTask/Tasks/ForceMergeTask.cs | 61 +
.../ByTask/Tasks/NearRealtimeReaderTask.cs | 132 ++
.../ByTask/Tasks/NewAnalyzerTask.cs | 189 +++
.../ByTask/Tasks/NewCollationAnalyzerTask.cs | 149 +++
.../ByTask/Tasks/NewLocaleTask.cs | 97 ++
.../ByTask/Tasks/NewRoundTask.cs | 44 +
.../ByTask/Tasks/OpenIndexTask.cs | 88 ++
.../ByTask/Tasks/OpenReaderTask.cs | 100 ++
.../ByTask/Tasks/OpenTaxonomyIndexTask.cs | 41 +
.../ByTask/Tasks/OpenTaxonomyReaderTask.cs | 44 +
.../ByTask/Tasks/PerfTask.cs | 380 ++++++
.../ByTask/Tasks/PrintReaderTask.cs | 60 +
.../ByTask/Tasks/ReadTask.cs | 339 +++++
.../ByTask/Tasks/ReadTokensTask.cs | 160 +++
.../ByTask/Tasks/ReopenReaderTask.cs | 45 +
.../ByTask/Tasks/RepAllTask.cs | 83 ++
.../ByTask/Tasks/RepSelectByPrefTask.cs | 81 ++
.../ByTask/Tasks/RepSumByNameRoundTask.cs | 83 ++
.../ByTask/Tasks/RepSumByNameTask.cs | 81 ++
.../ByTask/Tasks/RepSumByPrefRoundTask.cs | 79 ++
.../ByTask/Tasks/RepSumByPrefTask.cs | 91 ++
.../ByTask/Tasks/ReportTask.cs | 189 +++
.../ByTask/Tasks/ResetInputsTask.cs | 43 +
.../ByTask/Tasks/ResetSystemEraseTask.cs | 42 +
.../ByTask/Tasks/ResetSystemSoftTask.cs | 41 +
.../ByTask/Tasks/RollbackIndexTask.cs | 52 +
.../ByTask/Tasks/SearchTask.cs | 60 +
.../ByTask/Tasks/SearchTravRetHighlightTask.cs | 188 +++
.../Tasks/SearchTravRetLoadFieldSelectorTask.cs | 85 ++
.../ByTask/Tasks/SearchTravRetTask.cs | 44 +
.../Tasks/SearchTravRetVectorHighlightTask.cs | 191 +++
.../ByTask/Tasks/SearchTravTask.cs | 87 ++
.../ByTask/Tasks/SearchWithCollectorTask.cs | 99 ++
.../ByTask/Tasks/SearchWithSortTask.cs | 157 +++
.../ByTask/Tasks/SetPropTask.cs | 71 ++
.../ByTask/Tasks/TaskSequence.cs | 662 ++++++++++
.../ByTask/Tasks/UpdateDocTask.cs | 99 ++
.../ByTask/Tasks/WaitForMergesTask.cs | 36 +
.../ByTask/Tasks/WaitTask.cs | 89 ++
.../ByTask/Tasks/WarmTask.cs | 64 +
.../ByTask/Tasks/WriteEnwikiLineDocTask.cs | 72 ++
.../ByTask/Tasks/WriteLineDocTask.cs | 238 ++++
.../ByTask/Utils/Algorithm.cs | 459 +++++++
.../ByTask/Utils/AnalyzerFactory.cs | 156 +++
src/Lucene.Net.Benchmark/ByTask/Utils/Config.cs | 559 +++++++++
.../ByTask/Utils/FileUtils.cs | 46 +
src/Lucene.Net.Benchmark/ByTask/Utils/Format.cs | 109 ++
.../ByTask/Utils/StreamUtils.cs | 132 ++
src/Lucene.Net.Benchmark/Constants.cs | 33 +
.../Lucene.Net.Benchmark.csproj | 214 ++++
.../Lucene.Net.Benchmark.project.json | 15 +
.../Properties/AssemblyInfo.cs | 30 +
src/Lucene.Net.Benchmark/Quality/Judge.cs | 55 +
.../Quality/QualityBenchmark.cs | 159 +++
.../Quality/QualityQuery.cs | 107 ++
.../Quality/QualityQueryParser.cs | 35 +
.../Quality/QualityStats.cs | 339 +++++
.../Quality/Trec/QueryDriver.cs | 93 ++
.../Quality/Trec/Trec1MQReader.cs | 92 ++
.../Quality/Trec/TrecJudge.cs | 186 +++
.../Quality/Trec/TrecTopicsReader.cs | 154 +++
.../Quality/Utils/DocNameExtractor.cs | 89 ++
.../Quality/Utils/QualityQueriesFinder.cs | 152 +++
.../Quality/Utils/SimpleQQParser.cs | 76 ++
.../Quality/Utils/SubmissionReport.cs | 98 ++
.../Utils/ExtractReuters.cs | 167 +++
.../Utils/ExtractWikipedia.cs | 178 +++
src/Lucene.Net.Benchmark/project.json | 53 +
src/Lucene.Net.TestFramework/Util/TestUtil.cs | 22 +-
.../BenchmarkTestCase.cs | 129 ++
.../ByTask/Feeds/DocMakerTest.cs | 193 +++
.../ByTask/Feeds/EnwikiContentSourceTest.cs | 194 +++
.../ByTask/Feeds/LineDocSourceTest.cs | 271 ++++
.../ByTask/Feeds/TestHtmlParser.cs | 164 +++
.../ByTask/Feeds/TrecContentSourceTest.cs | 431 +++++++
.../ByTask/Feeds/trecdocs.zip | Bin 0 -> 2514 bytes
.../ByTask/Tasks/AddIndexesTaskTest.cs | 153 +++
.../ByTask/Tasks/Alt/AltPackageTaskTest.cs | 68 +
.../ByTask/Tasks/Alt/AltTestTask.cs | 35 +
.../ByTask/Tasks/CommitIndexTaskTest.cs | 63 +
.../ByTask/Tasks/CountingHighlighterTestTask.cs | 85 ++
.../ByTask/Tasks/CountingSearchTestTask.cs | 65 +
.../ByTask/Tasks/CreateIndexTaskTest.cs | 129 ++
.../ByTask/Tasks/PerfTaskTest.cs | 81 ++
.../ByTask/Tasks/SearchWithSortTaskTest.cs | 35 +
.../ByTask/Tasks/WriteEnwikiLineDocTaskTest.cs | 121 ++
.../ByTask/Tasks/WriteLineDocTaskTest.cs | 436 +++++++
.../ByTask/TestPerfTasksLogic.cs | 1177 ++++++++++++++++++
.../ByTask/TestPerfTasksParse.cs | 178 +++
.../ByTask/Utils/StreamUtilsTest.cs | 149 +++
.../ByTask/Utils/TestConfig.cs | 37 +
src/Lucene.Net.Tests.Benchmark/ByTask/conf.zip | Bin 0 -> 40878 bytes
.../ByTask/reuters.first20.lines.txt | 20 +
.../test-mapping-ISOLatin1Accent-partial.txt | 30 +
.../Conf/ConfLoader.cs | 28 +
.../Lucene.Net.Tests.Benchmark.csproj | 129 ++
.../Lucene.Net.Tests.Benchmark.project.json | 13 +
.../Properties/AssemblyInfo.cs | 36 +
.../Quality/TestQualityRun.cs | 210 ++++
.../Quality/reuters.578.lines.txt.bz2 | Bin 0 -> 208314 bytes
.../Quality/trecQRels.txt | 723 +++++++++++
.../Quality/trecTopics.txt | 287 +++++
.../Support/TestApiConsistency.cs | 150 +++
.../Support/TestExceptionSerialization.cs | 54 +
src/Lucene.Net.Tests.Benchmark/project.json | 56 +
162 files changed, 22383 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/Lucene.Net.sln
----------------------------------------------------------------------
diff --git a/Lucene.Net.sln b/Lucene.Net.sln
index 5450020..08a00a0 100644
--- a/Lucene.Net.sln
+++ b/Lucene.Net.sln
@@ -110,6 +110,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analysis.Kuromoj
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Analysis.Kuromoji", "src\Lucene.Net.Tests.Analysis.Kuromoji\Lucene.Net.Tests.Analysis.Kuromoji.csproj", "{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Benchmark", "src\Lucene.Net.Benchmark\Lucene.Net.Benchmark.csproj", "{EDC77CB4-597F-4818-8C83-3C006D12C384}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Benchmark", "src\Lucene.Net.Tests.Benchmark\Lucene.Net.Tests.Benchmark.csproj", "{9257F543-44E2-4DB6-8B27-A8A354C13E5B}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -1111,6 +1115,54 @@ Global
{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|Mixed Platforms.Build.0 = Release|Any CPU
{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|x86.ActiveCfg = Release|Any CPU
{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|x86.Build.0 = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|x86.Build.0 = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|Any CPU.Build.0 = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|x86.ActiveCfg = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|x86.Build.0 = Debug|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|Any CPU.Build.0 = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|x86.ActiveCfg = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|x86.Build.0 = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|Any CPU.ActiveCfg = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|Any CPU.Build.0 = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|Mixed Platforms.Build.0 = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|x86.ActiveCfg = Release|Any CPU
+ {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|x86.Build.0 = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|x86.Build.0 = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|Any CPU.Build.0 = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|x86.ActiveCfg = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|x86.Build.0 = Debug|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|Any CPU.Build.0 = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|x86.ActiveCfg = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|x86.Build.0 = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|Any CPU.ActiveCfg = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|Any CPU.Build.0 = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|Mixed Platforms.Build.0 = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|x86.ActiveCfg = Release|Any CPU
+ {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Benchmark.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Benchmark.cs b/src/Lucene.Net.Benchmark/ByTask/Benchmark.cs
new file mode 100644
index 0000000..9f3ad70
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Benchmark.cs
@@ -0,0 +1,170 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Run the benchmark algorithm.
+ /// </summary>
+ /// <remarks>
+ /// <list type="number">
+ /// <item><description>Read algorithm.</description></item>
+ /// <item><description>Run the algorithm.</description></item>
+ /// </list>
+ /// <para/>
+ /// Things to be added/fixed in "Benchmarking by tasks":
+ /// <list type="number">
+ /// <item><description>TODO - report into Excel and/or graphed view.</description></item>
+ /// <item><description>TODO - perf comparison between Lucene releases over the years.</description></item>
+ /// <item><description>TODO - perf report adequate to include in Lucene nightly build site? (so we can easily track performance changes.)</description></item>
+ /// <item><description>TODO - add overall time control for repeated execution (vs. current by-count only).</description></item>
+ /// <item><description>TODO - query maker that is based on index statistics.</description></item>
+ /// </list>
+ /// </remarks>
+ public class Benchmark
+ {
+ private PerfRunData runData;
+ private Algorithm algorithm;
+ private bool executed;
+
+ public Benchmark(TextReader algReader)
+ {
+ // prepare run data
+ try
+ {
+ runData = new PerfRunData(new Config(algReader));
+ }
+ catch (Exception e)
+ {
+ //e.printStackTrace();
+ throw new Exception("Error: cannot init PerfRunData!", e);
+ }
+
+ // parse algorithm
+ try
+ {
+ algorithm = new Algorithm(runData);
+ }
+ catch (Exception e)
+ {
+ throw new Exception("Error: cannot understand algorithm!", e);
+ }
+ }
+
+ /// <summary>
+ /// Execute this benchmark.
+ /// </summary>
+ public virtual void Execute()
+ {
+ lock (this)
+ {
+ if (executed)
+ {
+ throw new InvalidOperationException("Benchmark was already executed");
+ }
+ executed = true;
+ runData.SetStartTimeMillis();
+ algorithm.Execute();
+ }
+ }
+
+ /// <summary>
+ /// Run the benchmark algorithm.
+ /// </summary>
+ /// <param name="args">Benchmark config and algorithm files.</param>
+ public static void Main(string[] args)
+ {
+ Exec(args);
+ }
+
+ /// <summary>
+ /// Utility: execute benchmark from command line.
+ /// </summary>
+ /// <param name="args">Single argument is expected: algorithm-file.</param>
+ public static void Exec(string[] args)
+ {
+ // verify command line args
+ if (args.Length < 1)
+ {
+ SystemConsole.WriteLine("Usage: java Benchmark <algorithm file>");
+ Environment.Exit(1);
+ }
+
+ // verify input files
+ FileInfo algFile = new FileInfo(args[0]);
+ if (!algFile.Exists /*|| !algFile.isFile() ||!algFile.canRead()*/ )
+ {
+ SystemConsole.WriteLine("cannot find/read algorithm file: " + algFile.FullName);
+ Environment.Exit(1);
+ }
+
+ SystemConsole.WriteLine("Running algorithm from: " + algFile.FullName);
+
+ Benchmark benchmark = null;
+ try
+ {
+ benchmark = new Benchmark(IOUtils.GetDecodingReader(algFile, Encoding.UTF8));
+ }
+ catch (Exception e)
+ {
+ SystemConsole.WriteLine(e.ToString());
+ Environment.Exit(1);
+ }
+
+ SystemConsole.WriteLine("------------> algorithm:");
+ SystemConsole.WriteLine(benchmark.Algorithm.ToString());
+
+ // execute
+ try
+ {
+ benchmark.Execute();
+ }
+ catch (Exception e)
+ {
+ SystemConsole.WriteLine("Error: cannot execute the algorithm! " + e.Message);
+ SystemConsole.WriteLine(e.StackTrace);
+ }
+
+ SystemConsole.WriteLine("####################");
+ SystemConsole.WriteLine("### D O N E !!! ###");
+ SystemConsole.WriteLine("####################");
+ }
+
+ /// <summary>
+ /// Returns the algorithm.
+ /// </summary>
+ public virtual Algorithm Algorithm
+ {
+ get { return algorithm; }
+ }
+
+ /// <summary>
+ /// Returns the runData.
+ /// </summary>
+ public virtual PerfRunData RunData
+ {
+ get { return runData; }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/AbstractQueryMaker.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/AbstractQueryMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/AbstractQueryMaker.cs
new file mode 100644
index 0000000..fb6a2bf
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/AbstractQueryMaker.cs
@@ -0,0 +1,85 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Search;
+using System;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Abstract base query maker.
+ /// Each query maker should just implement the <see cref="PrepareQueries()"/> method.
+ /// </summary>
+ public abstract class AbstractQueryMaker : IQueryMaker
+ {
+ protected int m_qnum = 0;
+ protected Query[] m_queries;
+ protected Config m_config;
+
+ public virtual void ResetInputs()
+ {
+ m_qnum = 0;
+ }
+
+ protected abstract Query[] PrepareQueries();
+
+ public virtual void SetConfig(Config config)
+ {
+ this.m_config = config;
+ m_queries = PrepareQueries();
+ }
+
+ public virtual string PrintQueries()
+ {
+ string newline = Environment.NewLine;
+ StringBuilder sb = new StringBuilder();
+ if (m_queries != null)
+ {
+ for (int i = 0; i < m_queries.Length; i++)
+ {
+ sb.Append(i + ". " + m_queries[i].GetType().Name + " - " + m_queries[i].ToString());
+ sb.Append(newline);
+ }
+ }
+ return sb.ToString();
+ }
+
+ public virtual Query MakeQuery()
+ {
+ return m_queries[NextQnum()];
+ }
+
+ // return next qnum
+ protected virtual int NextQnum()
+ {
+ lock (this)
+ {
+ int res = m_qnum;
+ m_qnum = (m_qnum + 1) % m_queries.Length;
+ return res;
+ }
+ }
+
+ /// <seealso cref="IQueryMaker.MakeQuery(int)"/>
+ public virtual Query MakeQuery(int size)
+ {
+ throw new Exception(this + ".MakeQuery(int size) is not supported!");
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
new file mode 100644
index 0000000..c0f06ef
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
@@ -0,0 +1,227 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Reflection;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Base class for source of data for benchmarking.
+ /// </summary>
+ /// <remarks>
+ /// Keeps track of various statistics, such as how many data items were generated,
+ /// size in bytes etc.
+ /// <para/>
+ /// Supports the following configuration parameters:
+ /// <list type="bullet">
+ /// <item><term>content.source.forever</term><description>specifies whether to generate items forever (<b>default=true</b>).</description></item>
+ /// <item><term>content.source.verbose</term><description>specifies whether messages should be output by the content source (<b>default=false</b>).</description></item>
+ /// <item><term>content.source.encoding</term><description>
+ /// specifies which encoding to use when
+ /// reading the files of that content source. Certain implementations may define
+ /// a default value if this parameter is not specified. (<b>default=null</b>).
+ /// </description></item>
+ /// <item><term>content.source.log.step</term><description>
+ /// specifies for how many items a
+ /// message should be logged. If set to 0 it means no logging should occur.
+ /// <b>NOTE:</b> if verbose is set to false, logging should not occur even if
+ /// logStep is not 0 (<b>default=0</b>).
+ /// </description></item>
+ /// </list>
+ /// </remarks>
+ public abstract class ContentItemsSource : IDisposable
+ {
+ private long bytesCount;
+ private long totalBytesCount;
+ private int itemCount;
+ private int totalItemCount;
+ private Config config;
+
+ private int lastPrintedNumUniqueTexts = 0;
+ private long lastPrintedNumUniqueBytes = 0;
+ private int printNum = 0;
+
+ protected bool m_forever;
+ protected int m_logStep;
+ protected bool m_verbose;
+ protected Encoding m_encoding;
+
+ /// <summary>update count of bytes generated by this source</summary>
+ protected void AddBytes(long numBytes)
+ {
+ lock (this)
+ {
+ bytesCount += numBytes;
+ totalBytesCount += numBytes;
+ }
+ }
+
+ /// <summary>update count of items generated by this source</summary>
+ protected void AddItem()
+ {
+ lock (this)
+ {
+ ++itemCount;
+ ++totalItemCount;
+ }
+ }
+
+ /// <summary>
+ /// A convenience method for collecting all the files of a content source from
+ /// a given directory. The collected <see cref="FileInfo"/> instances are stored in the
+ /// given <paramref name="files"/>.
+ /// </summary>
+ protected void CollectFiles(DirectoryInfo dir, IList<FileInfo> files)
+ {
+ CollectFilesImpl(dir, files);
+ files.Sort(new FileNameComparer());
+ }
+
+ private void CollectFilesImpl(DirectoryInfo dir, IList<FileInfo> files)
+ {
+ foreach (var sub in dir.EnumerateDirectories())
+ {
+ CollectFilesImpl(sub, files);
+ }
+
+ files.AddRange(dir.GetFiles());
+ }
+
+ private class FileNameComparer : IComparer<FileInfo>
+ {
+ public int Compare(FileInfo x, FileInfo y)
+ {
+ return x.FullName.CompareToOrdinal(y.FullName);
+ }
+ }
+
+ /// <summary>
+ /// Returns <c>true</c> whether it's time to log a message (depending on verbose and
+ /// the number of items generated).
+ /// </summary>
+ /// <returns></returns>
+ protected bool ShouldLog()
+ {
+ return m_verbose && m_logStep > 0 && itemCount % m_logStep == 0;
+ }
+
+ /// <summary>Called when reading from this content source is no longer required.</summary>
+ public void Dispose()
+ {
+ Dispose(true);
+ GC.SuppressFinalize(this);
+ }
+
+ /// <summary>Called when reading from this content source is no longer required.</summary>
+ protected abstract void Dispose(bool disposing);
+
+
+ /// <summary>Returns the number of bytes generated since last reset.</summary>
+ public long BytesCount { get { return bytesCount; } }
+
+ /// <summary>Returns the number of generated items since last reset.</summary>
+ public int ItemsCount { get { return itemCount; } }
+
+ public Config Config { get { return config; } }
+
+ /// <summary>Returns the total number of bytes that were generated by this source.</summary>
+ public long TotalBytesCount { get { return totalBytesCount; } }
+
+ /// <summary>Returns the total number of generated items.</summary>
+ public int TotalItemsCount { get { return totalItemCount; } }
+
+ /// <summary>
+ /// Resets the input for this content source, so that the test would behave as
+ /// if it was just started, input-wise.
+ /// <para/>
+ /// <b>NOTE:</b> the default implementation resets the number of bytes and
+ /// items generated since the last reset, so it's important to call
+ /// <c>base.ResetInputs()</c> in case you override this method.
+ /// </summary>
+ public virtual void ResetInputs()
+ {
+ bytesCount = 0;
+ itemCount = 0;
+ }
+
+ /// <summary>
+ /// Sets the <see cref="Utils.Config"/> for this content source. If you override this
+ /// method, you must call <c>base.SetConfig(config)</c>.
+ /// </summary>
+ /// <param name="config"></param>
+ public virtual void SetConfig(Config config)
+ {
+ this.config = config;
+ m_forever = config.Get("content.source.forever", true);
+ m_logStep = config.Get("content.source.log.step", 0);
+ m_verbose = config.Get("content.source.verbose", false);
+ string encodingStr = config.Get("content.source.encoding", null);
+ if (!string.IsNullOrWhiteSpace(encodingStr))
+ {
+ m_encoding = Encoding.GetEncoding(encodingStr);
+ }
+ else
+ {
+ m_encoding = Encoding.GetEncoding(0); // Default system encoding
+ }
+ }
+
+ public virtual void PrintStatistics(string itemsName)
+ {
+ if (!m_verbose)
+ {
+ return;
+ }
+ bool print = false;
+ string col = " ";
+ StringBuilder sb = new StringBuilder();
+ string newline = Environment.NewLine;
+ sb.Append("------------> ").Append(GetType().GetTypeInfo().Name).Append(" statistics (").Append(printNum).Append("): ").Append(newline);
+ int nut = TotalItemsCount;
+ if (nut > lastPrintedNumUniqueTexts)
+ {
+ print = true;
+ sb.Append("total count of " + itemsName + ": ").Append(Formatter.Format(0, nut, col)).Append(newline);
+ lastPrintedNumUniqueTexts = nut;
+ }
+ long nub = TotalBytesCount;
+ if (nub > lastPrintedNumUniqueBytes)
+ {
+ print = true;
+ sb.Append("total bytes of " + itemsName + ": ").Append(Formatter.Format(0, nub, col)).Append(newline);
+ lastPrintedNumUniqueBytes = nub;
+ }
+ if (ItemsCount > 0)
+ {
+ print = true;
+ sb.Append("num " + itemsName + " added since last inputs reset: ").Append(Formatter.Format(0, ItemsCount, col)).Append(newline);
+ sb.Append("total bytes added for " + itemsName + " since last inputs reset: ").Append(Formatter.Format(0, BytesCount, col)).Append(newline);
+ }
+ if (print)
+ {
+ SystemConsole.WriteLine(sb.Append(newline).ToString());
+ printNum++;
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentSource.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentSource.cs
new file mode 100644
index 0000000..a3c39cb
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentSource.cs
@@ -0,0 +1,38 @@
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
    /// <summary>
    /// Represents content from a specified source, such as TREC, Reuters etc. A
    /// <see cref="ContentSource"/> is responsible for creating <see cref="DocData"/> objects for
    /// its documents to be consumed by <see cref="DocMaker"/>. It also keeps track
    /// of various statistics, such as how many documents were generated, size in
    /// bytes etc.
    /// <para/>
    /// For supported configuration parameters see <see cref="ContentItemsSource"/>.
    /// </summary>
    public abstract class ContentSource : ContentItemsSource
    {
        /// <summary>
        /// Returns the next <see cref="DocData"/> from the content source.
        /// Implementations must account for multi-threading, as multiple threads
        /// can call this method simultaneously.
        /// </summary>
        /// <param name="docData">A reusable <see cref="DocData"/> instance to be populated and returned.</param>
        /// <returns>The populated <see cref="DocData"/> for the next document.</returns>
        public abstract DocData GetNextDocData(DocData docData);
    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/DemoHTMLParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/DemoHTMLParser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/DemoHTMLParser.cs
new file mode 100644
index 0000000..0903754
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/DemoHTMLParser.cs
@@ -0,0 +1,259 @@
+// LUCENENET TODO: Use HTML Agility pack instead of SAX ?
+
+using Lucene.Net.Support;
+using Sax.Net;
+using Sax.Net.Helpers;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
    /// <summary>
    /// Simple HTML Parser extracting title, meta tags, and body text
    /// that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>.
    /// <para/>
    /// NOTE(review): the .NET port actually drives the SAX events through
    /// TagSoup.Net (see <see cref="Parser"/>), not NekoHTML as the summary
    /// (carried over from Java) suggests.
    /// </summary>
    public class DemoHTMLParser : IHTMLParser
    {
        /// <summary>The actual parser to read HTML documents.</summary>
        public sealed class Parser
        {
            // Meta tags collected during the parse, keyed by the lowercased value of
            // the "name" (or "http-equiv") attribute, mapped to the "content" value.
            private readonly IDictionary<string, string> metaTags = new Dictionary<string, string>();
            // Extracted on construction; both are fully populated before the ctor returns.
            private readonly string title, body;

            // LUCENENET specific - expose field through property
            public IDictionary<string, string> MetaTags
            {
                get { return metaTags; }
            }

            // LUCENENET specific - expose field through property
            public string Title
            {
                get { return title; }
            }

            // LUCENENET specific - expose field through property
            public string Body
            {
                get { return body; }
            }

            /// <summary>Parses the HTML read from <paramref name="reader"/> eagerly.</summary>
            public Parser(TextReader reader)
                : this(new InputSource(reader))
            {
            }

            /// <summary>
            /// Parses the HTML from <paramref name="source"/> eagerly; title, body
            /// and meta tags are available from the properties once the ctor returns.
            /// </summary>
            /// <exception cref="SAXException">if the document contains a frameset,
            /// or on any other SAX-level parse failure.</exception>
            public Parser(InputSource source)
            {
                TagSoup.Net.Parser parser = new TagSoup.Net.Parser();

                parser.SetFeature(TagSoup.Net.Parser.NAMESPACES_FEATURE, true);

                StringBuilder title = new StringBuilder(), body = new StringBuilder();
                DefaultHandler handler = new DefaultHandlerAnonymousHelper(this, title, body);

                parser.ContentHandler = handler;
                parser.ErrorHandler = handler;
                parser.Parse(source);

                // the javacc-based parser trimmed title (which should be done for HTML in all cases):
                this.title = title.ToString().Trim();

                // assign body text
                this.body = body.ToString();
            }

            /// <summary>
            /// SAX content handler that accumulates title/body text and meta tags.
            /// The int fields are nesting counters: a value &gt; 0 means we are
            /// currently inside at least one element of that kind.
            /// </summary>
            private class DefaultHandlerAnonymousHelper : DefaultHandler
            {
                // suppressed counts open SUPPRESS_ELEMENTS (style/script) whose
                // character content must not be appended to the body.
                private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;

                private readonly Parser outerInstance;
                private readonly StringBuilder title;
                private readonly StringBuilder body;

                public DefaultHandlerAnonymousHelper(Parser outerInstance, StringBuilder title, StringBuilder body)
                {
                    this.outerInstance = outerInstance;
                    this.title = title;
                    this.body = body;
                }

                public override void StartElement(string uri, string localName, string qName, IAttributes atts)
                {
                    if (inHEAD > 0)
                    {
                        if ("title".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            inTITLE++;
                        }
                        else
                        {
                            if ("meta".Equals(localName, StringComparison.OrdinalIgnoreCase))
                            {
                                // Prefer the "name" attribute; fall back to "http-equiv".
                                string name = atts.GetValue("name");
                                if (name == null)
                                {
                                    name = atts.GetValue("http-equiv");
                                }
                                string val = atts.GetValue("content");
                                if (name != null && val != null)
                                {
                                    // Last meta tag with a given name wins.
                                    outerInstance.metaTags[name.ToLowerInvariant()] = val;
                                }
                            }
                        }
                    }
                    else if (inBODY > 0)
                    {
                        if (SUPPRESS_ELEMENTS.Contains(localName))
                        {
                            suppressed++;
                        }
                        else if ("img".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            // the original javacc-based parser preserved <IMG alt="..."/>
                            // attribute as body text in [] parenthesis:
                            string alt = atts.GetValue("alt");
                            if (alt != null)
                            {
                                body.Append('[').Append(alt).Append(']');
                            }
                        }
                    }
                    else if ("body".Equals(localName, StringComparison.OrdinalIgnoreCase))
                    {
                        inBODY++;
                    }
                    else if ("head".Equals(localName, StringComparison.OrdinalIgnoreCase))
                    {
                        inHEAD++;
                    }
                    else if ("frameset".Equals(localName, StringComparison.OrdinalIgnoreCase))
                    {
                        throw new SAXException("This parser does not support HTML framesets.");
                    }
                }

                public override void EndElement(string uri, string localName, string qName)
                {
                    if (inBODY > 0)
                    {
                        if ("body".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            inBODY--;
                        }
                        else if (ENDLINE_ELEMENTS.Contains(localName))
                        {
                            // Closing a block-level element implies a line break in the text.
                            body.Append('\n');
                        }
                        else if (SUPPRESS_ELEMENTS.Contains(localName))
                        {
                            suppressed--;
                        }
                    }
                    else if (inHEAD > 0)
                    {
                        if ("head".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            inHEAD--;
                        }
                        else if (inTITLE > 0 && "title".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            inTITLE--;
                        }
                    }
                }

                public override void Characters(char[] ch, int start, int length)
                {
                    // Body text is captured only outside style/script; title text only
                    // while inside <title> (which, per StartElement, implies inside <head>).
                    if (inBODY > 0 && suppressed == 0)
                    {
                        body.Append(ch, start, length);
                    }
                    else if (inTITLE > 0)
                    {
                        title.Append(ch, start, length);
                    }
                }

                public override InputSource ResolveEntity(string publicId, string systemId)
                {
                    // disable network access caused by DTDs
                    return new InputSource(new StringReader(""));
                }
            }

            /// <summary>Builds a read-only set of element names for the lookups above.</summary>
            private static ISet<string> CreateElementNameSet(params string[] names)
            {
                return Collections.UnmodifiableSet(new HashSet<string>(names));
            }

            /// <summary>HTML elements that cause a line break (they are block-elements).</summary>
            internal static readonly ISet<string> ENDLINE_ELEMENTS = CreateElementNameSet(
                "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
                "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
                "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
            );

            /// <summary>HTML elements with contents that are ignored.</summary>
            internal static readonly ISet<string> SUPPRESS_ELEMENTS = CreateElementNameSet(
                "style", "script"
            );
        }

        /// <summary>
        /// Parses the HTML from <paramref name="reader"/> into <paramref name="docData"/>.
        /// SAX failures are wrapped in <see cref="IOException"/>.
        /// </summary>
        public virtual DocData Parse(DocData docData, string name, DateTime? date, TextReader reader, TrecContentSource trecSrc)
        {
            try
            {
                return Parse(docData, name, date, new InputSource(reader), trecSrc);
            }
            catch (SAXException saxe)
            {
                throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
            }
        }

        /// <summary>
        /// Parses the HTML from <paramref name="source"/>, clearing and repopulating
        /// <paramref name="docData"/> with the extracted name, title, body and meta
        /// properties. A parseable "date" meta tag, when present, overrides the
        /// <paramref name="date"/> argument.
        /// </summary>
        public virtual DocData Parse(DocData docData, string name, DateTime? date, InputSource source, TrecContentSource trecSrc)
        {
            Parser p = new Parser(source);

            // properties
            IDictionary<string, string> props = p.MetaTags;
            string dateStr;
            if (props.TryGetValue("date", out dateStr) && dateStr != null)
            {
                DateTime? newDate = trecSrc.ParseDate(dateStr);
                if (newDate != null)
                {
                    date = newDate;
                }
            }

            docData.Clear();
            docData.Name = name;
            docData.Body = p.Body;
            docData.Title = p.Title;
            docData.Props = props;
            docData.SetDate(date);
            return docData;
        }
    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/DirContentSource.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/DirContentSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/DirContentSource.cs
new file mode 100644
index 0000000..c14d578
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/DirContentSource.cs
@@ -0,0 +1,259 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Support;
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Text;
+
+// LUCENENET TODO: This had to be refactored significantly. We need tests to confirm it works.
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A <see cref="ContentSource"/> using the Dir collection for its input. Supports
+ /// the following configuration parameters (on top of <see cref="ContentSource"/>):
+ /// <list type="bullet">
+ /// <item><term>work.dir</term><description>specifies the working directory. Required if "docs.dir" denotes a relative path (<b>default=work</b>).</description></item>
+ /// <item><term>docs.dir</term><description>specifies the directory the Dir collection. Can be set to a relative path if "work.dir" is also specified (<b>default=dir-out</b>).</description></item>
+ /// </list>
+ /// </summary>
+ public class DirContentSource : ContentSource
+ {
+ /// <summary>
+ /// Iterator over the files in the directory.
+ /// </summary>
+ public class Iterator : IEnumerator<FileInfo>
+ {
+
+ private class Comparer : IComparer<FileInfo>
+ {
+ public int Compare(FileInfo a, FileInfo b)
+ {
+ string a2 = a.ToString();
+ string b2 = b.ToString();
+ int diff = a2.Length - b2.Length;
+
+ if (diff > 0)
+ {
+ while (diff-- > 0)
+ {
+ b2 = "0" + b2;
+ }
+ }
+ else if (diff < 0)
+ {
+ diff = -diff;
+ while (diff-- > 0)
+ {
+ a2 = "0" + a2;
+ }
+ }
+
+ /* note it's reversed because we're going to push,
+ which reverses again */
+ return b2.CompareToOrdinal(a2);
+ }
+ }
+
+ internal int count = 0;
+
+ internal Stack<FileInfo> stack = new Stack<FileInfo>();
+
+ /* this seems silly ... there must be a better way ...
+ not that this is good, but can it matter? */
+
+ private Comparer c = new Comparer();
+
+ private FileInfo current;
+
+ public Iterator(DirectoryInfo f)
+ {
+ Push(f);
+ }
+
+ internal void Push(DirectoryInfo f)
+ {
+ foreach (var dir in f.GetDirectories())
+ {
+ Push(dir);
+ }
+
+ Push(f.GetFiles("*.txt"));
+ }
+
+ internal void Push(FileInfo[] files)
+ {
+ Array.Sort(files, c);
+ for (int i = 0; i < files.Length; i++)
+ {
+ // System.err.println("push " + files[i]);
+ stack.Push(files[i]);
+ }
+ }
+
+ public virtual int Count
+ {
+ get { return count; }
+ }
+
+ public virtual bool MoveNext()
+ {
+ if (stack.Count == 0)
+ {
+ current = null;
+ return false;
+ }
+ count++;
+ current = stack.Pop();
+ // System.err.println("pop " + object);
+ return true;
+ }
+
+ public virtual FileInfo Current
+ {
+ get { return current; }
+ }
+
+ object IEnumerator.Current
+ {
+ get { return current; }
+ }
+
+ public void Dispose()
+ {
+ Dispose(true);
+ GC.SuppressFinalize(this);
+ }
+
+ protected virtual void Dispose(bool disposing)
+ {
+ }
+
+ public virtual void Reset()
+ {
+ }
+ }
+
+ private DirectoryInfo dataDir = null;
+ private int iteration = 0;
+ private Iterator inputFiles = null;
+
+ private DateTime? ParseDate(string dateStr)
+ {
+ DateTime temp;
+ if (DateTime.TryParseExact(dateStr, "dd-MMM-yyyy hh:mm:ss.fff", CultureInfo.InvariantCulture, DateTimeStyles.None, out temp))
+ {
+ return temp;
+ }
+ else if (DateTime.TryParse(dateStr, CultureInfo.InvariantCulture, DateTimeStyles.None, out temp))
+ {
+ return temp;
+ }
+
+ return null;
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ if (disposing)
+ {
+ inputFiles = null;
+ }
+ }
+
+ public override DocData GetNextDocData(DocData docData)
+ {
+ FileInfo f = null;
+ string name = null;
+ lock (this)
+ {
+ if (!inputFiles.MoveNext())
+ {
+ // exhausted files, start a new round, unless forever set to false.
+ if (!m_forever)
+ {
+ throw new NoMoreDataException();
+ }
+ inputFiles = new Iterator(dataDir);
+ iteration++;
+ }
+ f = inputFiles.Current;
+ // System.err.println(f);
+ name = f.FullName + "_" + iteration;
+ }
+
+ string line = null;
+ string dateStr;
+ string title;
+ StringBuilder bodyBuf = new StringBuilder(1024);
+
+ using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
+ {
+ //First line is the date, 3rd is the title, rest is body
+ dateStr = reader.ReadLine();
+ reader.ReadLine();//skip an empty line
+ title = reader.ReadLine();
+ reader.ReadLine();//skip an empty line
+ while ((line = reader.ReadLine()) != null)
+ {
+ bodyBuf.Append(line).Append(' ');
+ }
+ }
+ AddBytes(f.Length);
+
+ DateTime? date = ParseDate(dateStr);
+
+ docData.Clear();
+ docData.Name = name;
+ docData.Body = bodyBuf.ToString();
+ docData.Title = title;
+ docData.SetDate(date);
+ return docData;
+ }
+
+ public override void ResetInputs()
+ {
+ lock (this)
+ {
+ base.ResetInputs();
+ inputFiles = new Iterator(dataDir);
+ iteration = 0;
+ }
+ }
+
+ public override void SetConfig(Config config)
+ {
+ base.SetConfig(config);
+
+ DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
+ string d = config.Get("docs.dir", "dir-out");
+ dataDir = new DirectoryInfo(d);
+
+ inputFiles = new Iterator(dataDir);
+
+ if (inputFiles == null)
+ {
+ throw new Exception("No txt files in dataDir: " + dataDir.FullName);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/DocData.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/DocData.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/DocData.cs
new file mode 100644
index 0000000..9e68a4e
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/DocData.cs
@@ -0,0 +1,73 @@
+using Lucene.Net.Documents;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Output of parsing (e.g. HTML parsing) of an input document.
+ /// </summary>
+ public class DocData
+ {
+ public string Name { get; set; }
+ public string Body { get; set; }
+ public string Title { get; set; }
+ private string date;
+ public int ID { get; set; }
+ public IDictionary<string, string> Props { get; set; }
+
+ public void Clear()
+ {
+ Name = null;
+ Body = null;
+ Title = null;
+ date = null;
+ Props = null;
+ ID = -1;
+ }
+
+ /// <summary>
+ /// Gets the date. If the ctor with <see cref="DateTime"/> was called, then the string
+ /// returned is the output of <see cref="DateTools.DateToString(DateTime, DateTools.Resolution)"/>.
+ /// Otherwise it's the string passed to the other ctor.
+ /// </summary>
+ public virtual string Date
+ {
+ get { return date; }
+ }
+
+ public virtual void SetDate(DateTime? date)
+ {
+ if (date.HasValue)
+ {
+ SetDate(DateTools.DateToString(date.Value, DateTools.Resolution.SECOND));
+ }
+ else
+ {
+ this.date = null;
+ }
+ }
+
+ public virtual void SetDate(string date)
+ {
+ this.date = date;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/DocMaker.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/DocMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/DocMaker.cs
new file mode 100644
index 0000000..8ff3e7b
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/DocMaker.cs
@@ -0,0 +1,511 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Documents;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Text;
+using System.Threading;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
    /// <summary>
    /// Creates <see cref="Document"/> objects. Uses a <see cref="ContentSource"/> to generate
    /// <see cref="DocData"/> objects.
    /// </summary>
    /// <remarks>
    /// Supports the following parameters:
    /// <list type="bullet">
    /// <item><term>content.source</term><description>specifies the <see cref="ContentSource"/> class to use (default <b>SingleDocSource</b>).</description></item>
    /// <item><term>doc.stored</term><description>specifies whether fields should be stored (default <b>false</b>).</description></item>
    /// <item><term>doc.body.stored</term><description>specifies whether the body field should be stored (default = <b>doc.stored</b>).</description></item>
    /// <item><term>doc.tokenized</term><description>specifies whether fields should be tokenized (default <b>true</b>).</description></item>
    /// <item><term>doc.body.tokenized</term><description>specifies whether the body field should be tokenized (default = <b>doc.tokenized</b>).</description></item>
    /// <item><term>doc.tokenized.norms</term><description>specifies whether norms should be stored in the index or not. (default <b>false</b>).</description></item>
    /// <item><term>doc.body.tokenized.norms</term><description>
    /// specifies whether norms should be stored in the index for the body field.
    /// This can be set to true, while <c>doc.tokenized.norms</c> is set to false, to allow norms storing just
    /// for the body field. (default <b>true</b>).
    /// </description></item>
    /// <item><term>doc.term.vector</term><description>specifies whether term vectors should be stored for fields (default <b>false</b>).</description></item>
    /// <item><term>doc.term.vector.positions</term><description>specifies whether term vectors should be stored with positions (default <b>false</b>).</description></item>
    /// <item><term>doc.term.vector.offsets</term><description>specifies whether term vectors should be stored with offsets (default <b>false</b>).</description></item>
    /// <item><term>doc.store.body.bytes</term><description>specifies whether to store the raw bytes of the document's content in the document (default <b>false</b>).</description></item>
    /// <item><term>doc.reuse.fields</term><description>specifies whether <see cref="Field"/> and <see cref="Document"/> objects should be reused (default <b>true</b>).</description></item>
    /// <item><term>doc.index.props</term><description>specifies whether the properties returned by</description></item>
    /// <item><term>doc.random.id.limit</term><description>
    /// if specified, docs will be assigned random
    /// IDs from 0 to this limit. This is useful with UpdateDoc
    /// for testing performance of <see cref="Index.IndexWriter.UpdateDocument(Index.Term, IEnumerable{Index.IIndexableField})"/>.
    /// <see cref="DocData.Props"/> will be indexed. (default <b>false</b>).
    /// </description></item>
    /// </list>
    /// </remarks>
    public class DocMaker : IDisposable
    {
        /// <summary>
        /// Holds a partially-consumed <see cref="DocData"/> (and how many sized
        /// documents were already cut from it) between calls to
        /// <see cref="MakeDocument(int)"/>. Kept per-thread (see <c>leftovr</c>).
        /// </summary>
        private class LeftOver
        {
            public DocData DocData { get; set; }
            public int Count { get; set; }
        }

        // Non-null only when "doc.random.id.limit" is configured; seeded (179) for reproducibility.
        private Random r;
        private int updateDocIDLimit;

        /// <summary>
        /// Document state, supports reuse of field instances
        /// across documents (see <c>reuseFields</c> parameter).
        /// </summary>
        protected class DocState
        {
            private readonly IDictionary<string, Field> fields;
            private readonly IDictionary<string, Field> numericFields;
            private readonly bool reuseFields;
            internal readonly Document doc;
            internal DocData docData = new DocData();

            public DocState(bool reuseFields, FieldType ft, FieldType bodyFt)
            {

                this.reuseFields = reuseFields;

                if (reuseFields)
                {
                    fields = new Dictionary<string, Field>();
                    numericFields = new Dictionary<string, Field>();

                    // Initialize the map with the default fields.
                    fields[BODY_FIELD] = new Field(BODY_FIELD, "", bodyFt);
                    fields[TITLE_FIELD] = new Field(TITLE_FIELD, "", ft);
                    fields[DATE_FIELD] = new Field(DATE_FIELD, "", ft);
                    fields[ID_FIELD] = new StringField(ID_FIELD, "", Field.Store.YES);
                    fields[NAME_FIELD] = new Field(NAME_FIELD, "", ft);

                    numericFields[DATE_MSEC_FIELD] = new Int64Field(DATE_MSEC_FIELD, 0L, Field.Store.NO);
                    numericFields[TIME_SEC_FIELD] = new Int32Field(TIME_SEC_FIELD, 0, Field.Store.NO);

                    doc = new Document();
                }
                else
                {
                    // With reuse disabled, fresh Field/Document instances are
                    // created on demand (see GetField/GetNumericField/CreateDocument).
                    numericFields = null;
                    fields = null;
                    doc = null;
                }
            }

            /// <summary>
            /// Returns a field corresponding to the field name. If
            /// <c>reuseFields</c> was set to <c>true</c>, then it attempts to reuse a
            /// <see cref="Field"/> instance. If such a field does not exist, it creates a new one.
            /// </summary>
            internal Field GetField(string name, FieldType ft)
            {
                if (!reuseFields)
                {
                    return new Field(name, "", ft);
                }

                Field f;
                if (!fields.TryGetValue(name, out f) || f == null)
                {
                    f = new Field(name, "", ft);
                    fields[name] = f;
                }
                return f;
            }

            /// <summary>
            /// Returns a numeric field of the given <see cref="NumericType"/>,
            /// reusing a cached instance when <c>reuseFields</c> is enabled.
            /// </summary>
            internal Field GetNumericField(string name, NumericType type)
            {
                Field f;
                if (reuseFields)
                {
                    numericFields.TryGetValue(name, out f);
                }
                else
                {
                    f = null;
                }

                if (f == null)
                {
                    switch (type)
                    {
                        case NumericType.INT32:
                            f = new Int32Field(name, 0, Field.Store.NO);
                            break;
                        case NumericType.INT64:
                            f = new Int64Field(name, 0L, Field.Store.NO);
                            break;
                        case NumericType.SINGLE:
                            f = new SingleField(name, 0.0F, Field.Store.NO);
                            break;
                        case NumericType.DOUBLE:
                            f = new DoubleField(name, 0.0, Field.Store.NO);
                            break;
                        default:
                            throw new InvalidOperationException("Cannot get here");
                    }
                    if (reuseFields)
                    {
                        numericFields[name] = f;
                    }
                }
                return f;
            }
        }

        private bool storeBytes = false;

        // LUCENENET specific: DateUtil not used

        // leftovers are thread local, because it is unsafe to share residues between threads
        // NOTE(review): these ThreadLocal<T> instances are not disposed in Dispose(bool) - confirm intended.
        private ThreadLocal<LeftOver> leftovr = new ThreadLocal<LeftOver>();
        private ThreadLocal<DocState> docState = new ThreadLocal<DocState>();

        public static readonly string BODY_FIELD = "body";
        public static readonly string TITLE_FIELD = "doctitle";
        public static readonly string DATE_FIELD = "docdate";
        public static readonly string DATE_MSEC_FIELD = "docdatenum";
        public static readonly string TIME_SEC_FIELD = "doctimesecnum";
        public static readonly string ID_FIELD = "docid";
        public static readonly string BYTES_FIELD = "bytes";
        public static readonly string NAME_FIELD = "docname";

        protected Config m_config;

        protected FieldType m_valType;
        protected FieldType m_bodyValType;

        protected ContentSource m_source;
        protected bool m_reuseFields;
        protected bool m_indexProperties;

        // Supplies sequential IDs when DocData carries none and random IDs are not configured.
        private readonly AtomicInt32 numDocsCreated = new AtomicInt32();

        public DocMaker()
        {
        }

        // create a doc
        // use only part of the body, modify it to keep the rest (or use all if size==0).
        // reset the docdata properties so they are not added more than once.
        private Document CreateDocument(DocData docData, int size, int cnt)
        {

            DocState ds = GetDocState();
            Document doc = m_reuseFields ? ds.doc : new Document();
            doc.Fields.Clear();

            // Set ID_FIELD
            FieldType ft = new FieldType(m_valType);
            ft.IsIndexed = true;

            Field idField = ds.GetField(ID_FIELD, ft);
            int id;
            if (r != null)
            {
                // doc.random.id.limit was configured: pick a random ID in [0, limit).
                id = r.Next(updateDocIDLimit);
            }
            else
            {
                id = docData.ID;
                if (id == -1)
                {
                    id = numDocsCreated.GetAndIncrement();
                }
            }
            idField.SetStringValue(Convert.ToString(id, CultureInfo.InvariantCulture));
            doc.Add(idField);

            // Set NAME_FIELD
            string name = docData.Name;
            if (name == null) name = "";
            // cnt >= 0 means this is the cnt-th slice of the same source doc.
            name = cnt < 0 ? name : name + "_" + cnt;
            Field nameField = ds.GetField(NAME_FIELD, m_valType);
            nameField.SetStringValue(name);
            doc.Add(nameField);

            // Set DATE_FIELD
            DateTime? date = null;
            string dateString = docData.Date;
            if (dateString != null)
            {
                // LUCENENET: TryParseExact needs a non-nullable DateTime to work.
                DateTime temp;
                if (DateTime.TryParseExact(dateString, new string[] {
                    // Original format from Java
                    "dd-MMM-yyyy HH:mm:ss",
                    // Actual format from the test files...
                    "yyyyMMddHHmmss"
                }, CultureInfo.InvariantCulture, DateTimeStyles.None, out temp))
                {
                    date = temp;
                }
                // LUCENENET: Hail Mary in case the formats above are not adequate
                else if (DateTime.TryParse(dateString, CultureInfo.InvariantCulture, DateTimeStyles.None, out temp))
                {
                    date = temp;
                }
            }
            else
            {
                dateString = "";
            }
            Field dateStringField = ds.GetField(DATE_FIELD, m_valType);
            dateStringField.SetStringValue(dateString);
            doc.Add(dateStringField);

            if (date == null)
            {
                // just set to right now
                date = DateTime.Now;
            }

            // NOTE(review): the Java version stored epoch milliseconds here; this
            // port stores DateTime.Ticks (100ns units) - confirm consumers of
            // DATE_MSEC_FIELD expect ticks.
            Field dateField = ds.GetNumericField(DATE_MSEC_FIELD, NumericType.INT64);
            dateField.SetInt64Value(date.Value.Ticks);
            doc.Add(dateField);

            //util.cal.setTime(date);
            //int sec = util.cal.get(Calendar.HOUR_OF_DAY) * 3600 + util.cal.get(Calendar.MINUTE) * 60 + util.cal.get(Calendar.SECOND);
            // Seconds since UTC midnight for the document's date.
            int sec = Convert.ToInt32(date.Value.ToUniversalTime().TimeOfDay.TotalSeconds);

            Field timeSecField = ds.GetNumericField(TIME_SEC_FIELD, NumericType.INT32);
            timeSecField.SetInt32Value(sec);
            doc.Add(timeSecField);

            // Set TITLE_FIELD
            string title = docData.Title;
            Field titleField = ds.GetField(TITLE_FIELD, m_valType);
            titleField.SetStringValue(title == null ? "" : title);
            doc.Add(titleField);

            string body = docData.Body;
            if (body != null && body.Length > 0)
            {
                string bdy;
                if (size <= 0 || size >= body.Length)
                {
                    bdy = body; // use all
                    docData.Body = ""; // nothing left
                }
                else
                {
                    // attempt not to break words - if whitespace found within next 20 chars...
                    for (int n = size - 1; n < size + 20 && n < body.Length; n++)
                    {
                        if (char.IsWhiteSpace(body[n]))
                        {
                            size = n;
                            break;
                        }
                    }
                    bdy = body.Substring(0, size - 0); // use part
                    docData.Body = body.Substring(size); // some left
                }
                Field bodyField = ds.GetField(BODY_FIELD, m_bodyValType);
                bodyField.SetStringValue(bdy);
                doc.Add(bodyField);

                if (storeBytes)
                {
                    Field bytesField = ds.GetField(BYTES_FIELD, StringField.TYPE_STORED);
                    bytesField.SetBytesValue(Encoding.UTF8.GetBytes(bdy));
                    doc.Add(bytesField);
                }
            }

            if (m_indexProperties)
            {
                var props = docData.Props;
                if (props != null)
                {
                    foreach (var entry in props)
                    {
                        Field f = ds.GetField((string)entry.Key, m_valType);
                        f.SetStringValue((string)entry.Value);
                        doc.Add(f);
                    }
                    // Null the props so a later slice of the same DocData does not re-add them.
                    docData.Props = null;
                }
            }

            //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
            return doc;
        }

        /// <summary>Discards the current thread's leftover body residue.</summary>
        private void ResetLeftovers()
        {
            leftovr.Value = null;
        }

        /// <summary>
        /// Returns this thread's <see cref="DocState"/>, creating it lazily with
        /// the currently-configured field types.
        /// </summary>
        protected virtual DocState GetDocState()
        {
            DocState ds = docState.Value;
            if (ds == null)
            {
                ds = new DocState(m_reuseFields, m_valType, m_bodyValType);
                docState.Value = ds;
            }
            return ds;
        }

        /// <summary>
        /// Closes the <see cref="DocMaker"/>.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Closes the <see cref="DocMaker"/>. The base implementation closes the
        /// <see cref="ContentSource"/>, and it can be overridden to do more work (but make
        /// sure to call <c>base.Dispose(bool)</c>).
        /// </summary>
        protected virtual void Dispose(bool disposing)
        {
            if (disposing)
            {
                m_source.Dispose();
            }
        }

        /// <summary>
        /// Creates a <see cref="Document"/> object ready for indexing. This method uses the
        /// <see cref="ContentSource"/> to get the next document from the source, and creates
        /// a <see cref="Document"/> object from the returned fields. If
        /// <c>reuseFields</c> was set to <c>true</c>, it will reuse <see cref="Document"/>
        /// and <see cref="Field"/> instances.
        /// </summary>
        /// <returns>A populated <see cref="Document"/> built from the source's next <see cref="DocData"/>.</returns>
        public virtual Document MakeDocument()
        {
            ResetLeftovers();
            DocData docData = m_source.GetNextDocData(GetDocState().docData);
            Document doc = CreateDocument(docData, 0, -1);
            return doc;
        }

        /// <summary>
        /// Same as <see cref="MakeDocument()"/>, only this method creates a document of the
        /// given size input by <paramref name="size"/>. Unused body text is carried
        /// over (per thread) to the next call; multiple source documents are
        /// concatenated when a single one is shorter than <paramref name="size"/>.
        /// </summary>
        public virtual Document MakeDocument(int size)
        {
            LeftOver lvr = leftovr.Value;
            // An exhausted leftover (no body text remaining) is discarded up front.
            if (lvr == null || lvr.DocData == null || lvr.DocData.Body == null
                || lvr.DocData.Body.Length == 0)
            {
                ResetLeftovers();
            }
            DocData docData = GetDocState().docData;
            DocData dd = (lvr == null ? m_source.GetNextDocData(docData) : lvr.DocData);
            int cnt = (lvr == null ? 0 : lvr.Count);
            // Keep appending source documents until we have at least `size` chars of body.
            while (dd.Body == null || dd.Body.Length < size)
            {
                DocData dd2 = dd;
                dd = m_source.GetNextDocData(new DocData());
                cnt = 0;
                dd.Body = (dd2.Body + dd.Body);
            }
            Document doc = CreateDocument(dd, size, cnt);
            if (dd.Body == null || dd.Body.Length == 0)
            {
                ResetLeftovers();
            }
            else
            {
                // Remember the unconsumed remainder and the slice count for next time.
                if (lvr == null)
                {
                    lvr = new LeftOver();
                    leftovr.Value = lvr;
                }
                lvr.DocData = dd;
                lvr.Count = ++cnt;
            }
            return doc;
        }

        /// <summary>Reset inputs so that the test run would behave, input wise, as if it just started.</summary>
        public virtual void ResetInputs()
        {
            m_source.PrintStatistics("docs");
            // re-initiate since properties by round may have changed.
            SetConfig(m_config, m_source);
            m_source.ResetInputs();
            numDocsCreated.Set(0);
            ResetLeftovers();
        }

        /// <summary>Set the configuration parameters of this doc maker.</summary>
        public virtual void SetConfig(Config config, ContentSource source)
        {
            this.m_config = config;
            this.m_source = source;

            bool stored = config.Get("doc.stored", false);
            bool bodyStored = config.Get("doc.body.stored", stored);
            bool tokenized = config.Get("doc.tokenized", true);
            bool bodyTokenized = config.Get("doc.body.tokenized", tokenized);
            bool norms = config.Get("doc.tokenized.norms", false);
            bool bodyNorms = config.Get("doc.body.tokenized.norms", true);
            bool termVec = config.Get("doc.term.vector", false);
            bool termVecPositions = config.Get("doc.term.vector.positions", false);
            bool termVecOffsets = config.Get("doc.term.vector.offsets", false);

            // Field type used for all fields except the body.
            m_valType = new FieldType(TextField.TYPE_NOT_STORED);
            m_valType.IsStored = stored;
            m_valType.IsTokenized = tokenized;
            m_valType.OmitNorms = !norms;
            m_valType.StoreTermVectors = termVec;
            m_valType.StoreTermVectorPositions = termVecPositions;
            m_valType.StoreTermVectorOffsets = termVecOffsets;
            m_valType.Freeze();

            // Field type for the body, which may differ in stored/tokenized/norms settings.
            m_bodyValType = new FieldType(TextField.TYPE_NOT_STORED);
            m_bodyValType.IsStored = bodyStored;
            m_bodyValType.IsTokenized = bodyTokenized;
            m_bodyValType.OmitNorms = !bodyNorms;
            m_bodyValType.StoreTermVectors = termVec;
            m_bodyValType.StoreTermVectorPositions = termVecPositions;
            m_bodyValType.StoreTermVectorOffsets = termVecOffsets;
            m_bodyValType.Freeze();

            storeBytes = config.Get("doc.store.body.bytes", false);

            m_reuseFields = config.Get("doc.reuse.fields", true);

            // In a multi-rounds run, it is important to reset DocState since settings
            // of fields may change between rounds, and this is the only way to reset
            // the cache of all threads.
            docState = new ThreadLocal<DocState>();

            m_indexProperties = config.Get("doc.index.props", false);

            updateDocIDLimit = config.Get("doc.random.id.limit", -1);
            if (updateDocIDLimit != -1)
            {
                // Fixed seed so runs that use random IDs are reproducible.
                r = new Random(179);
            }
        }
    }
+}