You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/08/06 17:59:02 UTC
[04/33] lucenenet git commit: Ported Lucene.Net.Benchmark + tests
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Trec/Trec1MQReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Quality/Trec/Trec1MQReader.cs b/src/Lucene.Net.Benchmark/Quality/Trec/Trec1MQReader.cs
new file mode 100644
index 0000000..85dceda
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Quality/Trec/Trec1MQReader.cs
@@ -0,0 +1,92 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+
+namespace Lucene.Net.Benchmarks.Quality.Trec
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Read topics of TREC 1MQ track.
+ /// <para/>
+ /// Expects this topic format -
+ /// <code>
+ /// qnum:qtext
+ /// </code>
+ /// Comment lines starting with '#' are ignored.
+ /// <para/>
+ /// All topics will have a single name value pair.
+ /// </summary>
+ public class Trec1MQReader
+ {
+     private readonly string name;
+
+     /// <summary>
+     /// Constructor for Trec's 1MQ TopicsReader
+     /// </summary>
+     /// <param name="name">Name of name-value pair to set for all queries.</param>
+     public Trec1MQReader(string name)
+     {
+         this.name = name;
+     }
+
+     /// <summary>
+     /// Read quality queries from trec 1MQ format topics file.
+     /// Blank lines and '#' comment lines are skipped; the reader is disposed before returning.
+     /// </summary>
+     /// <param name="reader">where queries are read from.</param>
+     /// <returns>the result quality queries, sorted.</returns>
+     /// <exception cref="IOException">if cannot read the queries.</exception>
+     /// <exception cref="FormatException">if a topic line has no ':' separator.</exception>
+     public virtual QualityQuery[] ReadQueries(TextReader reader)
+     {
+         IList<QualityQuery> res = new List<QualityQuery>();
+         string line;
+         try
+         {
+             while (null != (line = reader.ReadLine()))
+             {
+                 line = line.Trim();
+                 if (line.Length == 0 || line.StartsWith("#", StringComparison.Ordinal))
+                 {
+                     continue;
+                 }
+                 // id and qtext are separated by the first ':'
+                 int k = line.IndexOf(':');
+                 if (k < 0)
+                 {
+                     throw new FormatException("wrong format, expected qnum:qtext but got: " + line);
+                 }
+                 string id = line.Substring(0, k).Trim();
+                 string qtext = line.Substring(k + 1).Trim();
+                 IDictionary<string, string> fields = new Dictionary<string, string>();
+                 fields[name] = qtext;
+                 res.Add(new QualityQuery(id, fields));
+             }
+         }
+         finally
+         {
+             reader.Dispose();
+         }
+         QualityQuery[] qq = res.ToArray();
+         Array.Sort(qq); // sort result array (by ID)
+         return qq;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Trec/TrecJudge.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Quality/Trec/TrecJudge.cs b/src/Lucene.Net.Benchmark/Quality/Trec/TrecJudge.cs
new file mode 100644
index 0000000..386b130
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Quality/Trec/TrecJudge.cs
@@ -0,0 +1,186 @@
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+
+namespace Lucene.Net.Benchmarks.Quality.Trec
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Judge if given document is relevant to given quality query, based on Trec format for judgements.
+ /// </summary>
+ public class TrecJudge : IJudge
+ {
+ IDictionary<string, QRelJudgement> judgements;
+
+ /// <summary>
+ /// Constructor from a reader.
+ /// </summary>
+ /// <remarks>
+ /// Expected input format:
+ /// <code>
+ /// qnum 0 doc-name is-relevant
+ /// </code>
+ /// Two sample lines:
+ /// <code>
+ /// 19 0 doc303 1
+ /// 19 0 doc7295 0
+ /// </code>
+ /// </remarks>
+ /// <param name="reader">Where judgments are read from.</param>
+ /// <exception cref="IOException">If there is a low-level I/O error.</exception>
+ public TrecJudge(TextReader reader)
+ {
+     judgements = new Dictionary<string, QRelJudgement>();
+     QRelJudgement current = null; // judgement of the most recently seen relevant query
+
+     try
+     {
+         string rawLine;
+         while ((rawLine = reader.ReadLine()) != null)
+         {
+             string entry = rawLine.Trim();
+             // skip blank lines and '#' comments
+             if (entry.Length == 0 || entry[0] == '#')
+             {
+                 continue;
+             }
+             StringTokenizer tokens = new StringTokenizer(entry);
+             string queryID = tokens.NextToken();
+             tokens.NextToken(); // second column is read and discarded
+             string docName = tokens.NextToken();
+             bool irrelevant = "0".Equals(tokens.NextToken(), StringComparison.Ordinal);
+             // LUCENENET: don't call NextToken() unless the condition fails.
+             Debug.Assert(!tokens.HasMoreTokens(), "wrong format: " + entry + " next: " + (tokens.HasMoreTokens() ? tokens.NextToken() : ""));
+             if (irrelevant)
+             {
+                 continue; // only keep relevant docs
+             }
+             // reuse the current judgement while the query ID stays the same
+             bool sameQuery = current != null && current.queryID.Equals(queryID, StringComparison.Ordinal);
+             if (!sameQuery && (!judgements.TryGetValue(queryID, out current) || current == null))
+             {
+                 current = new QRelJudgement(queryID);
+                 judgements[queryID] = current;
+             }
+             current.AddRelevantDoc(docName);
+         }
+     }
+     finally
+     {
+         reader.Dispose();
+     }
+ }
+
+ // inherit javadocs
+ public virtual bool IsRelevant(string docName, QualityQuery query)
+ {
+     QRelJudgement judgement;
+     bool known = judgements.TryGetValue(query.QueryID, out judgement) && judgement != null;
+     return known && judgement.IsRelevant(docName); // unknown queries have no relevant docs
+ }
+
+ /// <summary>
+ /// Single Judgement of a trec quality query.
+ /// </summary>
+ private class QRelJudgement
+ {
+ internal string queryID;
+ private IDictionary<string, string> relevantDocs; // used as a set: every doc name maps to itself
+
+ internal QRelJudgement(string queryID)
+ {
+ this.queryID = queryID;
+ relevantDocs = new HashMap<string, string>();
+ }
+ // Records docName as relevant; adding the same name again overwrites the identical entry.
+ public virtual void AddRelevantDoc(string docName)
+ {
+ relevantDocs[docName] = docName;
+ }
+ // True if docName was recorded through AddRelevantDoc.
+ internal virtual bool IsRelevant(string docName)
+ {
+ return relevantDocs.ContainsKey(docName);
+ }
+ // Number of distinct doc names recorded as relevant.
+ public virtual int MaxRecall
+ {
+ get { return relevantDocs.Count; }
+ }
+ }
+
+ /// <summary>
+ /// Checks that the given queries and the loaded judgements refer to the same query IDs.
+ /// </summary>
+ /// <remarks>
+ /// Each kind of mismatch is reported as a warning on <paramref name="logger"/> when it is not <c>null</c>.
+ /// </remarks>
+ /// <param name="qq">Queries whose IDs are checked against the judgements.</param>
+ /// <param name="logger">Receives the warnings; may be <c>null</c> to suppress them.</param>
+ /// <returns><c>true</c> if every query has a judgement and every judgement has a query.</returns>
+ public virtual bool ValidateData(QualityQuery[] qq, TextWriter logger)
+ {
+     // start from all judged query IDs and cross off each one that has a query
+     IDictionary<string, QRelJudgement> unmatchedJudgements = new Dictionary<string, QRelJudgement>(judgements);
+     IList<string> unjudgedQueries = new List<string>();
+     foreach (QualityQuery query in qq)
+     {
+         string id = query.QueryID;
+         // Remove returns false when no judgement is left for this ID
+         if (!unmatchedJudgements.Remove(id))
+         {
+             unjudgedQueries.Add(id);
+         }
+     }
+
+     if (unjudgedQueries.Count > 0 && logger != null)
+     {
+         logger.WriteLine("WARNING: " + unjudgedQueries.Count + " queries have no judgments! - ");
+         foreach (string id in unjudgedQueries)
+         {
+             logger.WriteLine(" " + id);
+         }
+     }
+
+     if (unmatchedJudgements.Count > 0 && logger != null)
+     {
+         logger.WriteLine("WARNING: " + unmatchedJudgements.Count + " judgments match no query! - ");
+         foreach (string id in unmatchedJudgements.Keys)
+         {
+             logger.WriteLine(" " + id);
+         }
+     }
+     // valid only when nothing is left over on either side
+     return unjudgedQueries.Count == 0 && unmatchedJudgements.Count == 0;
+ }
+
+ // inherit javadocs
+ public virtual int MaxRecall(QualityQuery query)
+ {
+     QRelJudgement judgement;
+     bool known = judgements.TryGetValue(query.QueryID, out judgement) && judgement != null;
+     // a query with no recorded judgement has a max recall of 0
+     return known
+         ? judgement.MaxRecall
+         : 0;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Trec/TrecTopicsReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Quality/Trec/TrecTopicsReader.cs b/src/Lucene.Net.Benchmark/Quality/Trec/TrecTopicsReader.cs
new file mode 100644
index 0000000..158386f
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Quality/Trec/TrecTopicsReader.cs
@@ -0,0 +1,154 @@
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.Quality.Trec
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Read TREC topics.
+ /// </summary>
+ /// <remarks>
+ /// Expects this topic format -
+ /// <code>
+ /// &lt;top&gt;
+ /// &lt;num&gt; Number: nnn
+ ///
+ /// &lt;title&gt; title of the topic
+ ///
+ /// &lt;desc&gt; Description:
+ /// description of the topic
+ ///
+ /// &lt;narr&gt; Narrative:
+ /// "story" composed by assessors.
+ ///
+ /// &lt;/top&gt;
+ /// </code>
+ /// Comment lines starting with '#' are ignored.
+ /// </remarks>
+ public class TrecTopicsReader
+ {
+ private static readonly string newline = Environment.NewLine;
+
+ /// <summary>
+ /// Constructor for Trec's TopicsReader
+ /// </summary>
+ public TrecTopicsReader()
+ {
+     // nothing to initialize
+ }
+
+ /// <summary>
+ /// Read quality queries from trec format topics file.
+ /// </summary>
+ /// <param name="reader">where queries are read from.</param>
+ /// <returns>the result quality queries.</returns>
+ /// <exception cref="IOException">if cannot read the queries.</exception>
+ public virtual QualityQuery[] ReadQueries(TextReader reader)
+ {
+     IList<QualityQuery> res = new List<QualityQuery>();
+     try
+     {
+         while (null != Read(reader, "<top>", null, false, false))
+         {
+             // id: the text following ':' on the <num> line
+             StringBuilder sb = Expect(Read(reader, "<num>", null, true, false), "<num>");
+             int k = sb.IndexOf(":");
+             string id = sb.ToString(k + 1, sb.Length - (k + 1)).Trim();
+             // title: the text following the first '>' on the <title> line
+             sb = Expect(Read(reader, "<title>", null, true, false), "<title>");
+             k = sb.IndexOf(">");
+             string title = sb.ToString(k + 1, sb.Length - (k + 1)).Trim();
+             Read(reader, "<desc>", null, false, false); // skip ahead to the description
+             string description = ReadJoinedUntil(reader, "<narr>");
+             string narrative = ReadJoinedUntil(reader, "</top>");
+             IDictionary<string, string> fields = new Dictionary<string, string>();
+             fields["title"] = title;
+             fields["description"] = description;
+             fields["narrative"] = narrative;
+             res.Add(new QualityQuery(id, fields));
+         }
+     }
+     finally
+     {
+         reader.Dispose();
+     }
+     QualityQuery[] qq = res.ToArray();
+     Array.Sort(qq); // sort result array (by ID)
+     return qq;
+ }
+
+ // Turns a premature end of input into a descriptive error instead of a NullReferenceException.
+ private static StringBuilder Expect(StringBuilder sb, string tag)
+ {
+     if (sb == null)
+     {
+         throw new EndOfStreamException("Unexpected end of topics input while looking for " + tag);
+     }
+     return sb;
+ }
+
+ // Joins the following lines with single spaces, up to end of input or a line starting with endPrefix.
+ private static string ReadJoinedUntil(TextReader reader, string endPrefix)
+ {
+     StringBuilder sb = new StringBuilder();
+     string line;
+     while ((line = reader.ReadLine()) != null && !line.StartsWith(endPrefix, StringComparison.Ordinal))
+     {
+         if (sb.Length > 0) sb.Append(' ');
+         sb.Append(line);
+     }
+     return sb.ToString().Trim();
+ }
+
+ // read until finding a line that starts with the specified prefix
+ private StringBuilder Read(TextReader reader, string prefix, StringBuilder sb, bool collectMatchLine, bool collectAll)
+ {
+     // append to the caller's builder when one is supplied, otherwise start a new one
+     StringBuilder collected = sb ?? new StringBuilder();
+     bool first = true; // no separator is written before the first collected line
+     string current;
+     // scan line by line until a line starts with the prefix
+     while ((current = reader.ReadLine()) != null)
+     {
+         bool isMatch = current.StartsWith(prefix, StringComparison.Ordinal);
+         // the matching line is kept only on request; earlier lines only when collecting all
+         if (isMatch ? collectMatchLine : collectAll)
+         {
+             if (!first)
+             {
+                 collected.Append(newline);
+             }
+             collected.Append(current);
+             first = false;
+         }
+         // stop at the first matching line
+         if (isMatch)
+         {
+             return collected;
+         }
+     }
+     // end of input reached before any line started with the prefix
+     return null;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Utils/DocNameExtractor.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Quality/Utils/DocNameExtractor.cs b/src/Lucene.Net.Benchmark/Quality/Utils/DocNameExtractor.cs
new file mode 100644
index 0000000..6e5cc0f
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Quality/Utils/DocNameExtractor.cs
@@ -0,0 +1,89 @@
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Lucene.Net.Benchmarks.Quality.Utils
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Utility: extract doc names from an index
+ /// </summary>
+ public class DocNameExtractor
+ {
+     private readonly string docNameField;
+
+     /// <summary>
+     /// Creates an extractor that reads doc names from the given stored field.
+     /// </summary>
+     /// <param name="docNameField">name of the stored field containing the doc name.</param>
+     public DocNameExtractor(string docNameField)
+     {
+         this.docNameField = docNameField;
+     }
+
+     /// <summary>
+     /// Looks up the name of a doc by visiting its stored fields in the index.
+     /// </summary>
+     /// <param name="searcher">access to the index.</param>
+     /// <param name="docid">ID of doc whose name is needed.</param>
+     /// <returns>the first value collected for the doc name field, or <c>null</c> if none was collected.</returns>
+     /// <exception cref="System.IO.IOException">if cannot extract the doc name from the index.</exception>
+     public virtual string DocName(IndexSearcher searcher, int docid)
+     {
+         List<string> collected = new List<string>();
+         StoredFieldVisitor visitor = new DocNameVisitor(docNameField, collected);
+         // the visitor answers STOP for every field once one value has been collected
+         searcher.IndexReader.Document(docid, visitor);
+         return collected.Count > 0 ? collected[0] : null;
+     }
+
+     /// <summary>
+     /// Visitor that requests only the doc name field and collects its string value.
+     /// </summary>
+     private class DocNameVisitor : StoredFieldVisitor
+     {
+         private readonly string wantedField;
+         private readonly IList<string> sink;
+
+         public DocNameVisitor(string wantedField, IList<string> sink)
+         {
+             this.wantedField = wantedField;
+             this.sink = sink;
+         }
+
+         public override void StringField(FieldInfo fieldInfo, string value)
+         {
+             sink.Add(value);
+         }
+
+         public override Status NeedsField(FieldInfo fieldInfo)
+         {
+             // once a name has been collected there is nothing more to load
+             if (sink.Count > 0)
+             {
+                 return Status.STOP;
+             }
+             bool wanted = fieldInfo.Name.Equals(wantedField, StringComparison.Ordinal);
+             return wanted ? Status.YES : Status.NO;
+         }
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Utils/QualityQueriesFinder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Quality/Utils/QualityQueriesFinder.cs b/src/Lucene.Net.Benchmark/Quality/Utils/QualityQueriesFinder.cs
new file mode 100644
index 0000000..062263a
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Quality/Utils/QualityQueriesFinder.cs
@@ -0,0 +1,152 @@
+using Lucene.Net.Index;
+using Lucene.Net.Store;
+using Lucene.Net.Support;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Benchmarks.Quality.Utils
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Suggest Quality queries based on an index contents.
+ /// Utility class, used for making quality test benchmarks.
+ /// </summary>
+ public class QualityQueriesFinder
+ {
+ private static readonly string newline = Environment.NewLine;
+ private Store.Directory dir;
+
+ /// <summary>
+ /// Constructor over a directory containing the index.
+ /// </summary>
+ /// <param name="dir">Directory containing the index we search for the quality test.</param>
+ private QualityQueriesFinder(Store.Directory dir)
+ {
+ this.dir = dir; // opened later by BestTerms to read the index
+ }
+
+ /// <summary>
+ ///
+ /// </summary>
+ /// <param name="args">{index-dir}</param>
+ /// <exception cref="IOException">if cannot access the index.</exception>
+ public static void Main(string[] args)
+ {
+     if (args.Length < 1)
+     {
+         SystemConsole.Error.WriteLine("Usage: java QualityQueriesFinder <index-dir>");
+         Environment.Exit(1);
+     }
+     // print up to 20 suggested queries for the "body" field, each formatted as a TREC topic
+     string[] titles = new QualityQueriesFinder(FSDirectory.Open(new DirectoryInfo(args[0]))).BestQueries("body", 20);
+     for (int qnum = 0; qnum < titles.Length; qnum++)
+     {
+         SystemConsole.WriteLine(newline + FormatQueryAsTrecTopic(qnum, titles[qnum], null, null));
+     }
+ }
+
+ private string[] BestQueries(string field, int numQueries)
+ {
+     // take four candidate words per requested query from the requested field
+     string[] words = BestTerms(field, 4 * numQueries);
+     int n = words.Length;
+     int m = n / 4; // number of four-word queries that can be built
+     string[] res = new string[m];
+     for (int i = 0; i < res.Length; i++)
+     {
+         res[i] = words[i] + " " + words[m + i] + " " + words[n - 1 - m - i] + " " + words[n - 1 - i];
+     }
+     return res;
+ }
+
+ private static string FormatQueryAsTrecTopic(int qnum, string title, string description, string narrative)
+ {
+     // one array entry per output line; empty entries become the blank separator lines
+     string[] topicLines =
+     {
+         "<top>", "<num> Number: " + qnum, "",
+         "<title> " + (title ?? ""), "",
+         "<desc> Description:", description ?? "", "",
+         "<narr> Narrative:", narrative ?? "", "", "</top>"
+     };
+     return string.Join(newline, topicLines);
+ }
+
+ private string[] BestTerms(string field, int numTerms)
+ {
+     Util.PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
+     // the reader is only needed while the terms are enumerated
+     using (IndexReader ir = DirectoryReader.Open(dir))
+     {
+         int threshold = ir.MaxDoc / 10; // ignore words too common.
+         Terms terms = MultiFields.GetTerms(ir, field);
+         // no terms for the field means no candidates
+         if (terms != null)
+         {
+             TermsEnum termsEnum = terms.GetIterator(null);
+             while (termsEnum.Next() != null)
+             {
+                 // keep only terms below the document-frequency threshold
+                 int df = termsEnum.DocFreq;
+                 if (df >= threshold)
+                 {
+                     continue;
+                 }
+                 string ttxt = termsEnum.Term.Utf8ToString();
+                 pq.InsertWithOverflow(new TermDf(ttxt, df));
+             }
+         }
+     }
+
+     // drain the queue, printing each term as it is popped
+     string[] res = new string[pq.Count];
+     for (int i = 0; pq.Count > 0; i++)
+     {
+         TermDf tdf = pq.Pop();
+         res[i] = tdf.word;
+         SystemConsole.WriteLine((i + 1) + ". word: " + tdf.df + " " + tdf.word);
+     }
+     return res;
+ }
+
+ private class TermDf
+ {
+ internal string word; // term text
+ internal int df; // document frequency passed in for the term
+ internal TermDf(string word, int freq)
+ {
+ this.word = word;
+ this.df = freq;
+ }
+ }
+
+ private class TermsDfQueue : Util.PriorityQueue<TermDf>
+ {
+ internal TermsDfQueue(int maxSize)
+ : base(maxSize)
+ {
+ }
+ // An entry is "less than" another when its document frequency is lower.
+ protected override bool LessThan(TermDf tf1, TermDf tf2)
+ {
+ return tf1.df < tf2.df;
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Utils/SimpleQQParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Quality/Utils/SimpleQQParser.cs b/src/Lucene.Net.Benchmark/Quality/Utils/SimpleQQParser.cs
new file mode 100644
index 0000000..0711e86
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Quality/Utils/SimpleQQParser.cs
@@ -0,0 +1,76 @@
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.QueryParsers.Classic;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using System.Threading;
+
+namespace Lucene.Net.Benchmarks.Quality.Utils
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Simplistic quality query parser. A Lucene query is created by passing
+ /// the value of the specified <see cref="QualityQuery"/> name-value pair(s) into
+ /// a Lucene's <see cref="QueryParser"/> using <see cref="StandardAnalyzer"/>.
+ /// </summary>
+ public class SimpleQQParser : IQualityQueryParser
+ {
+     private string[] qqNames;
+     private string indexField;
+     ThreadLocal<QueryParser> queryParser = new ThreadLocal<QueryParser>();
+
+     /// <summary>
+     /// Creates a parser that combines several name-value pairs of a quality query.
+     /// </summary>
+     /// <param name="qqNames">Name-value pairs of quality query to use for creating the query.</param>
+     /// <param name="indexField">Corresponding index field.</param>
+     public SimpleQQParser(string[] qqNames, string indexField)
+     {
+         this.indexField = indexField;
+         this.qqNames = qqNames;
+     }
+
+     /// <summary>Creates a parser that uses a single name-value pair of a quality query.</summary>
+     /// <param name="qqName">Name-value pair of quality query to use for creating the query.</param>
+     /// <param name="indexField">Corresponding index field.</param>
+     public SimpleQQParser(string qqName, string indexField)
+         : this(new string[] { qqName }, indexField)
+     {
+     }
+
+     /// <seealso cref="IQualityQueryParser.Parse(QualityQuery)"/>
+     public virtual Query Parse(QualityQuery qq)
+     {
+         QueryParser parser = queryParser.Value; // one parser per thread, created on first use
+         if (parser == null)
+         {
+#pragma warning disable 612, 618
+             parser = new QueryParser(LuceneVersion.LUCENE_CURRENT, indexField, new StandardAnalyzer(LuceneVersion.LUCENE_CURRENT));
+#pragma warning restore 612, 618
+             queryParser.Value = parser;
+         }
+         BooleanQuery combined = new BooleanQuery();
+         foreach (string qqName in qqNames)
+         {
+             // each value is escaped before parsing and added as an optional (SHOULD) clause
+             combined.Add(parser.Parse(QueryParserBase.Escape(qq.GetValue(qqName))), Occur.SHOULD);
+         }
+         return combined;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Utils/SubmissionReport.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Quality/Utils/SubmissionReport.cs b/src/Lucene.Net.Benchmark/Quality/Utils/SubmissionReport.cs
new file mode 100644
index 0000000..c31eddc
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Quality/Utils/SubmissionReport.cs
@@ -0,0 +1,98 @@
+using Lucene.Net.Search;
+using System;
+using System.Globalization;
+using System.IO;
+
+namespace Lucene.Net.Benchmarks.Quality.Utils
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Create a log ready for submission.
+ /// Extend this class and override
+ /// <see cref="Report(QualityQuery, TopDocs, string, IndexSearcher)"/>
+ /// to create different reports.
+ /// </summary>
+ public class SubmissionReport
+ {
+     // Composite format string for scores: fixed-point with four decimals.
+     private readonly string nf;
+     private readonly TextWriter logger;
+     private readonly string name;
+
+     /// <summary>
+     /// Constructor for <see cref="SubmissionReport"/>.
+     /// </summary>
+     /// <param name="logger">If <c>null</c>, no submission data is created.</param>
+     /// <param name="name">Name of this run.</param>
+     public SubmissionReport(TextWriter logger, string name)
+     {
+         this.logger = logger;
+         this.name = name;
+         nf = "{0:F4}";
+     }
+
+     /// <summary>
+     /// Report a search result for a certain quality query.
+     /// One line is written per result: query ID, "Q0", doc name, rank, score and run name.
+     /// </summary>
+     /// <param name="qq">quality query for which the results are reported.</param>
+     /// <param name="td">search results for the query.</param>
+     /// <param name="docNameField">stored field used for fetching the result doc name.</param>
+     /// <param name="searcher">index access for fetching doc name.</param>
+     /// <exception cref="IOException">in case of a problem.</exception>
+     public virtual void Report(QualityQuery qq, TopDocs td, string docNameField, IndexSearcher searcher)
+     {
+         if (logger == null)
+         {
+             return;
+         }
+         ScoreDoc[] sd = td.ScoreDocs;
+         string sep = " \t ";
+         DocNameExtractor xt = new DocNameExtractor(docNameField);
+         for (int i = 0; i < sd.Length; i++)
+         {
+             string docName = xt.DocName(searcher, sd[i].Doc);
+             logger.WriteLine(
+                 qq.QueryID + sep +
+                 "Q0" + sep +
+                 Format(docName, 20) + sep +
+                 Format("" + i, 7) + sep +
+                 // the provider must be the first argument, otherwise the current culture is used
+                 string.Format(CultureInfo.InvariantCulture, nf, sd[i].Score) + sep +
+                 name
+             );
+         }
+     }
+
+     public virtual void Flush()
+     {
+         if (logger != null)
+         {
+             logger.Flush();
+         }
+     }
+
+     /// <summary>Pads s (null counts as empty) with trailing spaces to at least minLen characters.</summary>
+     private string Format(string s, int minLen)
+     {
+         // PadRight never truncates and works for any width, unlike slicing a fixed padding string
+         return (s ?? "").PadRight(minLen);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs b/src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs
new file mode 100644
index 0000000..8727fa0
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs
@@ -0,0 +1,167 @@
+using Lucene.Net.Support;
+using System;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Benchmarks.Utils
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
+ /// </summary>
+ public class ExtractReuters
+ {
+ private DirectoryInfo reutersDir;
+ private DirectoryInfo outputDir;
+ private static readonly string LINE_SEPARATOR = Environment.NewLine;
+
+ public ExtractReuters(DirectoryInfo reutersDir, DirectoryInfo outputDir)
+ {
+     this.reutersDir = reutersDir;
+     this.outputDir = outputDir;
+     SystemConsole.WriteLine("Deleting all files in " + outputDir);
+     foreach (FileInfo f in outputDir.GetFiles()) // snapshot first: do not delete during a lazy enumeration
+     {
+         f.Delete();
+     }
+ }
+
+ public virtual void Extract()
+ {
+     FileInfo[] sgmFiles = reutersDir.GetFiles("*.sgm");
+     if (sgmFiles == null || sgmFiles.Length == 0)
+     {
+         // nothing to extract; report on the error stream and return
+         SystemConsole.Error.WriteLine("No .sgm files in " + reutersDir);
+         return;
+     }
+     // process every matching file in turn
+     foreach (FileInfo sgmFile in sgmFiles)
+     {
+         ExtractFile(sgmFile);
+     }
+ }
+
+ // Each match is one TITLE, DATE or BODY element; its text is in the capture group of that alternative.
+ internal Regex EXTRACTION_PATTERN = new Regex("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>", RegexOptions.Compiled);
+
+ private static string[] META_CHARS = { "&", "<", ">", "\"", "'" };
+ // Entity form of each META_CHARS entry, at the same index.
+ private static string[] META_CHARS_SERIALIZATIONS = { "&amp;", "&lt;", "&gt;", "&quot;", "&apos;" };
+
+ /// <summary>
+ /// Override if you wish to change what is extracted
+ /// </summary>
+ protected virtual void ExtractFile(FileInfo sgmFile)
+ {
+     // A document ends at the line containing the closing reuters tag; one output file is written per document.
+     try
+     {
+         using (TextReader reader = new StreamReader(new FileStream(sgmFile.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
+         {
+             StringBuilder buffer = new StringBuilder(1024);
+             StringBuilder outBuffer = new StringBuilder(1024);
+
+             string line = null;
+             // numbers the output files produced from this input file, starting at 0
+             int docNumber = 0;
+             while ((line = reader.ReadLine()) != null)
+             {
+                 // accumulate the lines of a document until its closing reuters tag is seen
+                 if (line.IndexOf("</REUTERS", StringComparison.Ordinal) == -1)
+                 {
+                     buffer.Append(line).Append(' ');
+                     continue;
+                 }
+
+                 // Extract the relevant pieces and write to a file in the output dir
+                 Match matcher = EXTRACTION_PATTERN.Match(buffer.ToString());
+                 while (matcher.Success)
+                 {
+                     // group 0 is the whole match; only capture groups that took part carry text
+                     for (int i = 1; i < matcher.Groups.Count; i++)
+                     {
+                         if (matcher.Groups[i].Success)
+                         {
+                             outBuffer.Append(matcher.Groups[i].Value);
+                         }
+                     }
+                     outBuffer.Append(LINE_SEPARATOR).Append(LINE_SEPARATOR);
+                     matcher = matcher.NextMatch();
+                 }
+
+                 // Decode last-to-first so the ampersand entry (index 0) is handled last; decoding it
+                 // first could create new entity sequences that the later replacements would decode again.
+                 string @out = outBuffer.ToString();
+                 for (int i = META_CHARS_SERIALIZATIONS.Length - 1; i >= 0; i--)
+                 {
+                     @out = @out.Replace(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
+                 }
+                 string outFile = System.IO.Path.Combine(outputDir.FullName, sgmFile.Name + "-"
+                     + (docNumber++) + ".txt");
+                 // the using block closes the file even if the write fails
+                 using (StreamWriter writer = new StreamWriter(new FileStream(outFile, FileMode.Create, FileAccess.Write), Encoding.UTF8))
+                 {
+                     writer.Write(@out);
+                 }
+                 // reset both buffers for the next document
+                 outBuffer.Length = 0;
+                 buffer.Length = 0;
+             }
+         }
+     }
+     catch (IOException e)
+     {
+         // rethrown as a general Exception, keeping the original as the inner exception
+         throw new Exception(e.ToString(), e);
+     }
+ }
+
+ public static void Main(string[] args)
+ {
+     if (args.Length != 2)
+     {
+         Usage("Wrong number of arguments (" + args.Length + ")");
+         return;
+     }
+     DirectoryInfo sgmDir = new DirectoryInfo(args[0]);
+     if (!sgmDir.Exists)
+     {
+         Usage("Cannot find Path to Reuters SGM files (" + sgmDir + ")");
+         return;
+     }
+
+     // Work in "<output>-tmp" and move it to the requested location only after
+     // the whole extraction has completed without an exception.
+     string requestedOutput = args[1];
+     DirectoryInfo tmpDir = new DirectoryInfo(new DirectoryInfo(requestedOutput).FullName + "-tmp");
+     tmpDir.Create();
+     ExtractReuters extractor = new ExtractReuters(sgmDir, tmpDir);
+     extractor.Extract();
+     // Now rename to requested output dir
+     tmpDir.MoveTo(requestedOutput);
+ }
+
+ private static void Usage(string msg)
+ { // writes the problem description followed by the expected command line to the error stream
+ SystemConsole.Error.WriteLine("Usage: " + msg + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs b/src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs
new file mode 100644
index 0000000..b61fbc5
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs
@@ -0,0 +1,178 @@
+using Lucene.Net.Benchmarks.ByTask.Feeds;
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Documents;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.Utils
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Extract the downloaded Wikipedia dump into separate files for indexing.
+ /// </summary>
+ public class ExtractWikipedia
+ {
+ private DirectoryInfo outputDir;
+
+ public static int count = 0;
+
+ internal static readonly int BASE = 10;
+ protected DocMaker m_docMaker;
+
+ /// <summary>
+ /// Stores the doc maker and output directory, then deletes every file found
+ /// directly inside <paramref name="outputDir"/>.
+ /// </summary>
+ public ExtractWikipedia(DocMaker docMaker, DirectoryInfo outputDir)
+ {
+     this.outputDir = outputDir;
+     this.m_docMaker = docMaker;
+     SystemConsole.WriteLine("Deleting all files in " + outputDir);
+     foreach (FileInfo existing in outputDir.GetFiles())
+     {
+         existing.Delete();
+     }
+ }
+
+ /// <summary>
+ /// Maps a document counter onto a nested directory. Counters below
+ /// <c>BASE</c> stay in <paramref name="directory"/>; larger counters descend
+ /// into two levels named after the largest power of <c>BASE</c> not exceeding
+ /// the counter and the counter's leading digit, then recurse on the remainder.
+ /// </summary>
+ /// <param name="count">Document counter.</param>
+ /// <param name="directory">Starting directory, or <c>null</c> to start from the output directory.</param>
+ public virtual DirectoryInfo Directory(int count, DirectoryInfo directory)
+ {
+     DirectoryInfo current = directory ?? outputDir;
+     if (count < BASE)
+     {
+         return current;
+     }
+
+     int power = BASE;
+     while (power <= count)
+     {
+         power *= BASE;
+     }
+     int magnitude = power / BASE;
+
+     string magnitudeName = magnitude.ToString(CultureInfo.InvariantCulture);
+     current = new DirectoryInfo(System.IO.Path.Combine(current.FullName, magnitudeName));
+
+     string leadingName = (count / magnitude).ToString(CultureInfo.InvariantCulture);
+     current = new DirectoryInfo(System.IO.Path.Combine(current.FullName, leadingName));
+
+     return Directory(count % magnitude, current);
+ }
+
+ /// <summary>
+ /// Writes one document to <c>&lt;id&gt;.txt</c> in the directory chosen by
+ /// <see cref="Directory(int, DirectoryInfo)"/> for the current counter, as
+ /// time, title and body separated by blank lines, encoded as UTF-8.
+ /// </summary>
+ /// <exception cref="Exception">Wraps any <see cref="IOException"/> raised while writing.</exception>
+ public virtual void Create(string id, string title, string time, string body)
+ {
+     DirectoryInfo targetDir = Directory(count++, null);
+     targetDir.Create();
+     FileInfo targetFile = new FileInfo(System.IO.Path.Combine(targetDir.FullName, id + ".txt"));
+
+     string text = new StringBuilder()
+         .Append(time).Append("\n\n")
+         .Append(title).Append("\n\n")
+         .Append(body).Append("\n")
+         .ToString();
+
+     try
+     {
+         FileStream stream = new FileStream(targetFile.FullName, FileMode.Create, FileAccess.Write);
+         using (TextWriter writer = new StreamWriter(stream, Encoding.UTF8))
+         {
+             writer.Write(text);
+         }
+     }
+     catch (IOException ioe)
+     {
+         throw new Exception(ioe.ToString(), ioe);
+     }
+ }
+
+ /// <summary>
+ /// Pulls documents from the doc maker until it returns <c>null</c> or throws
+ /// <c>NoMoreDataException</c>, writing each one via <c>Create</c>, and reports
+ /// the elapsed time in milliseconds.
+ /// </summary>
+ public virtual void Extract()
+ {
+     SystemConsole.WriteLine("Starting Extraction");
+     long startedAt = Support.Time.CurrentTimeMilliseconds();
+     try
+     {
+         for (Document doc = m_docMaker.MakeDocument(); doc != null; doc = m_docMaker.MakeDocument())
+         {
+             string id = doc.Get(DocMaker.ID_FIELD);
+             string title = doc.Get(DocMaker.TITLE_FIELD);
+             string date = doc.Get(DocMaker.DATE_FIELD);
+             string body = doc.Get(DocMaker.BODY_FIELD);
+             Create(id, title, date, body);
+         }
+     }
+     catch (NoMoreDataException)
+     {
+         // The source is exhausted; fall through to the timing report.
+     }
+     long finishedAt = Support.Time.CurrentTimeMilliseconds();
+     SystemConsole.WriteLine("Extraction took " + (finishedAt - startedAt) + " ms");
+ }
+
+ /// <summary>
+ /// Command-line entry point. Recognized options are <c>--input|-i &lt;file&gt;</c>,
+ /// <c>--output|-o &lt;dir&gt;</c> (defaults to <c>./enwiki</c>) and
+ /// <c>--discardImageOnlyDocs|-d</c>. The usage text is printed when the input
+ /// option is missing, when an option lacks its value, or when the input file
+ /// does not exist.
+ /// </summary>
+ /// <param name="args">Command-line arguments.</param>
+ public static void Main(string[] args)
+ {
+     FileInfo wikipedia = null;
+     DirectoryInfo outputDir = new DirectoryInfo("./enwiki");
+     bool keepImageOnlyDocs = true;
+     for (int i = 0; i < args.Length; i++)
+     {
+         string arg = args[i];
+         bool isInput = arg.Equals("--input", StringComparison.Ordinal) || arg.Equals("-i", StringComparison.Ordinal);
+         bool isOutput = arg.Equals("--output", StringComparison.Ordinal) || arg.Equals("-o", StringComparison.Ordinal);
+         if (isInput || isOutput)
+         {
+             // Both options consume the next argument; without this guard a
+             // trailing option indexes past the end of args.
+             if (i + 1 >= args.Length)
+             {
+                 PrintUsage();
+                 return;
+             }
+             i++;
+             if (isInput)
+             {
+                 wikipedia = new FileInfo(args[i]);
+             }
+             else
+             {
+                 outputDir = new DirectoryInfo(args[i]);
+             }
+         }
+         else if (arg.Equals("--discardImageOnlyDocs", StringComparison.Ordinal) || arg.Equals("-d", StringComparison.Ordinal))
+         {
+             keepImageOnlyDocs = false;
+         }
+     }
+
+     // Validate the input before it is dereferenced or handed to the content
+     // source: a missing --input would otherwise fail on wikipedia.FullName.
+     if (wikipedia == null || !wikipedia.Exists)
+     {
+         PrintUsage();
+         return;
+     }
+
+     IDictionary<string, string> properties = new Dictionary<string, string>();
+     properties["docs.file"] = wikipedia.FullName;
+     properties["content.source.forever"] = "false";
+     properties["keep.image.only.docs"] = keepImageOnlyDocs.ToString();
+     Config config = new Config(properties);
+
+     ContentSource source = new EnwikiContentSource();
+     source.SetConfig(config);
+
+     DocMaker docMaker = new DocMaker();
+     docMaker.SetConfig(config, source);
+     try
+     {
+         docMaker.ResetInputs();
+         SystemConsole.WriteLine("Extracting Wikipedia to: " + outputDir + " using EnwikiContentSource");
+         outputDir.Create();
+         ExtractWikipedia extractor = new ExtractWikipedia(docMaker, outputDir);
+         extractor.Extract();
+     }
+     finally
+     {
+         // Release the doc maker whether or not extraction succeeded.
+         docMaker.Dispose();
+     }
+ }
+
+ /// <summary>Writes the two-line command-line help to the <c>SystemConsole.Error</c> writer.</summary>
+ private static void PrintUsage()
+ {
+     string synopsis = "Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia --input|-i <Path to Wikipedia XML file> " +
+         "[--output|-o <Output Path>] [--discardImageOnlyDocs|-d]";
+     string discardHelp = "--discardImageOnlyDocs tells the extractor to skip Wiki docs that contain only images";
+
+     SystemConsole.Error.WriteLine(synopsis);
+     SystemConsole.Error.WriteLine(discardHelp);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/project.json b/src/Lucene.Net.Benchmark/project.json
new file mode 100644
index 0000000..adac6d5
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/project.json
@@ -0,0 +1,53 @@
+{
+ "version": "4.8.0",
+ "title": "Lucene.Net.Benchmark",
+ "description": "System for benchmarking the Lucene.Net full-text search engine library from The Apache Software Foundation.",
+ "authors": [ "The Apache Software Foundation" ],
+ "packOptions": {
+ "projectUrl": "http://lucenenet.apache.org/",
+ "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt",
+ "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true",
+ "owners": [ "The Apache Software Foundation" ],
+ "repository": { "url": "https://github.com/apache/lucenenet" },
+ "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", "apache", "analysis", "index", "query" ]
+ },
+ "buildOptions": {
+ "compile": {
+ "includeFiles": [ "../CommonAssemblyInfo.cs" ]
+ },
+ "nowarn": [ "1591", "1573" ]
+ },
+ "dependencies": {
+ "icu.net": "54.1.1-alpha",
+ "Lucene.Net": "4.8.0",
+ "Lucene.Net.Analysis.Common": "4.8.0",
+ "Lucene.Net.Facet": "4.8.0",
+ "Lucene.Net.Highlighter": "4.8.0",
+ "Lucene.Net.ICU": "4.8.0",
+ "Lucene.Net.Queries": "4.8.0",
+ "Lucene.Net.QueryParser": "4.8.0",
+ "Lucene.Net.Spatial": "4.8.0",
+ "Sax.Net": "2.0.2",
+ "SharpZipLib": "0.86.0",
+ "Spatial4n.Core": "0.4.1-beta00003",
+ "TagSoup.Net": "1.2.1.1"
+ },
+ "frameworks": {
+ "netstandard1.5": {
+ "imports": "dnxcore50",
+ "buildOptions": {
+ "debugType": "portable",
+ "define": [ "NETSTANDARD" ]
+ },
+ "dependencies": {
+ "NETStandard.Library": "1.6.0"
+ }
+ },
+ "net451": {
+ "buildOptions": {
+ "debugType": "full",
+ "define": [ "FEATURE_SERIALIZABLE" ]
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.TestFramework/Util/TestUtil.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.TestFramework/Util/TestUtil.cs b/src/Lucene.Net.TestFramework/Util/TestUtil.cs
index e7eb247..bfc73dd 100644
--- a/src/Lucene.Net.TestFramework/Util/TestUtil.cs
+++ b/src/Lucene.Net.TestFramework/Util/TestUtil.cs
@@ -148,11 +148,20 @@ namespace Lucene.Net.Util
{
foreach (var entry in zip.Entries)
{
+ // Ignore internal folders - these are tacked onto the FullName anyway
+ if (entry.FullName.EndsWith("/", StringComparison.Ordinal) || entry.FullName.EndsWith("\\", StringComparison.Ordinal))
+ {
+ continue;
+ }
using (Stream input = entry.Open())
{
- FileInfo targetFile = new FileInfo(Path.Combine(destDir.FullName, entry.FullName));
+ FileInfo targetFile = new FileInfo(CorrectPath(Path.Combine(destDir.FullName, entry.FullName)));
+ if (!targetFile.Directory.Exists)
+ {
+ targetFile.Directory.Create();
+ }
- using (Stream output = new FileStream(targetFile.FullName, FileMode.OpenOrCreate, FileAccess.Write))
+ using (Stream output = new FileStream(targetFile.FullName, FileMode.Create, FileAccess.Write))
{
input.CopyTo(output);
}
@@ -161,6 +170,15 @@ namespace Lucene.Net.Util
}
}
+ // Rewrites every path separator in the input to the current platform's style.
+ private static string CorrectPath(string input)
+ {
+     bool forwardSlashPlatform = Path.DirectorySeparatorChar == '/';
+     return forwardSlashPlatform
+         ? input.Replace('\\', '/')
+         : input.Replace('/', '\\');
+ }
+
public static void SyncConcurrentMerges(IndexWriter writer)
{
SyncConcurrentMerges(writer.Config.MergeScheduler);
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Tests.Benchmark/BenchmarkTestCase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Benchmark/BenchmarkTestCase.cs b/src/Lucene.Net.Tests.Benchmark/BenchmarkTestCase.cs
new file mode 100644
index 0000000..8981ee0
--- /dev/null
+++ b/src/Lucene.Net.Tests.Benchmark/BenchmarkTestCase.cs
@@ -0,0 +1,129 @@
+using Lucene.Net.Benchmarks.ByTask;
+using Lucene.Net.Util;
+using System;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Base class for all Benchmark unit tests.
+ /// </summary>
+ public abstract class BenchmarkTestCase : LuceneTestCase
+ {
+ private static DirectoryInfo WORKDIR;
+
+ /// <summary>
+ /// Runs the base setup, creates the "benchmark" temp directory and builds
+ /// the property lines that are prepended to every algorithm text.
+ /// </summary>
+ public override void BeforeClass()
+ {
+     base.BeforeClass();
+     WORKDIR = CreateTempDir("benchmark");
+     // LUCENENET: Our directory numbers are sequential. Doing a delete
+     // here will make threads collide.
+     //WORKDIR.Delete();
+     //WORKDIR.Create();
+
+     string workDirLine = "work.dir=" + getWorkDirPath();
+     propLines = new string[] { workDirLine, "directory=RAMDirectory", "print.props=false" };
+ }
+
+ /// <summary>
+ /// Clears the static <c>WORKDIR</c> reference, then runs the base class teardown.
+ /// </summary>
+ public override void AfterClass()
+ {
+ WORKDIR = null;
+ base.AfterClass();
+ }
+
+
+ /// <summary>Returns the directory currently stored in <c>WORKDIR</c>.</summary>
+ public DirectoryInfo getWorkDir()
+ {
+ return WORKDIR;
+ }
+
+ /// <summary>
+ /// Copy a resource into the workdir. Both streams are released even when
+ /// the copy fails part-way.
+ /// </summary>
+ /// <param name="resourceName">Resource name; also used as the file name inside the workdir.</param>
+ public void copyToWorkDir(string resourceName)
+ {
+     string destPath = System.IO.Path.Combine(getWorkDir().FullName, resourceName);
+     using (Stream resource = GetType().getResourceAsStream(resourceName))
+     using (Stream dest = new FileStream(destPath, FileMode.Create, FileAccess.Write))
+     {
+         resource.CopyTo(dest);
+     }
+ }
+
+ /// <summary>
+ /// Return a path, suitable for a .alg config file, for a resource in the workdir.
+ /// Backslashes are rewritten as forward slashes.
+ /// </summary>
+ public String getWorkDirResourcePath(String resourceName)
+ {
+     string combined = System.IO.Path.Combine(getWorkDir().FullName, resourceName);
+     return combined.Replace('\\', '/');
+ }
+
+ /// <summary>
+ /// Return a path, suitable for a .alg config file, for the workdir.
+ /// Backslashes are rewritten as forward slashes.
+ /// </summary>
+ public String getWorkDirPath()
+ {
+     string fullName = getWorkDir().FullName;
+     return fullName.Replace('\\', '/');
+ }
+
+ /// <summary>
+ /// Builds the algorithm text from the shared property lines plus
+ /// <paramref name="algLines"/>, logs it when verbose, then creates a
+ /// benchmark from that text and executes it.
+ /// </summary>
+ /// <returns>The executed benchmark.</returns>
+ public Benchmark execBenchmark(String[] algLines)
+ {
+     String script = algLinesToText(algLines);
+     logTstLogic(script);
+
+     StringReader scriptReader = new StringReader(script);
+     Benchmark runner = new Benchmark(scriptReader);
+     runner.Execute();
+     return runner;
+ }
+
+ // properties in effect in all tests here
+ String[] propLines;
+
+ static readonly String NEW_LINE = Environment.NewLine;
+
+ // Concatenates the shared property lines followed by the given alg lines,
+ // each prefixed with the indent and terminated by NEW_LINE.
+ private String algLinesToText(String[] algLines)
+ {
+     String indent = " ";
+     StringBuilder text = new StringBuilder();
+     foreach (String propLine in propLines)
+     {
+         text.append(indent).append(propLine).append(NEW_LINE);
+     }
+     foreach (String algLine in algLines)
+     {
+         text.append(indent).append(algLine).append(NEW_LINE);
+     }
+     return text.toString();
+ }
+
+ // Prints the algorithm text to the console, but only when VERBOSE is set.
+ private static void logTstLogic(String txt)
+ {
+     if (VERBOSE)
+     {
+         Console.WriteLine("Test logic of:");
+         Console.WriteLine(txt);
+     }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs
new file mode 100644
index 0000000..301c807
--- /dev/null
+++ b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs
@@ -0,0 +1,193 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Benchmarks.ByTask.Tasks;
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests the functionality of <see cref="DocMaker"/>.
+ /// </summary>
+ public class DocMakerTest : BenchmarkTestCase
+ {
+ /// <summary>
+ /// Content source that yields a single fixed document and throws
+ /// <c>NoMoreDataException</c> on every later request.
+ /// </summary>
+ public sealed class OneDocSource : ContentSource
+ {
+     private bool exhausted = false;
+
+     protected override void Dispose(bool disposing)
+     {
+         // This source holds nothing that needs releasing.
+     }
+
+     public override DocData GetNextDocData(DocData docData)
+     {
+         if (exhausted)
+         {
+             throw new NoMoreDataException();
+         }
+
+         docData.Body = "body";
+         docData.SetDate("date");
+         docData.Title = "title";
+         docData.Props = new Dictionary<string, string> { { "key", "value" } };
+         exhausted = true;
+
+         return docData;
+     }
+ }
+
+ /// <summary>
+ /// Indexes the single <c>OneDocSource</c> document into a RAMDirectory and
+ /// asserts how many hits a term query on its "key"/"value" property returns.
+ /// </summary>
+ /// <param name="setIndexProps">Whether to set <c>doc.index.props</c> at all.</param>
+ /// <param name="indexPropsVal">Value used for <c>doc.index.props</c> when it is set.</param>
+ /// <param name="numExpectedResults">Expected total hit count.</param>
+ private void doTestIndexProperties(bool setIndexProps,
+     bool indexPropsVal, int numExpectedResults)
+ {
+     Dictionary<string, string> props = new Dictionary<string, string>();
+
+     // Indexing configuration.
+     props["analyzer"] = typeof(WhitespaceAnalyzer).AssemblyQualifiedName;
+     props["content.source"] = typeof(OneDocSource).AssemblyQualifiedName;
+     props["directory"] = "RAMDirectory";
+     if (setIndexProps)
+     {
+         props["doc.index.props"] = indexPropsVal.ToString();
+     }
+
+     // Create PerfRunData
+     Config config = new Config(props);
+     PerfRunData runData = new PerfRunData(config);
+
+     TaskSequence tasks = new TaskSequence(runData, TestName, null, false);
+     tasks.AddTask(new CreateIndexTask(runData));
+     tasks.AddTask(new AddDocTask(runData));
+     tasks.AddTask(new CloseIndexTask(runData));
+     tasks.DoLogic();
+
+     IndexReader reader = DirectoryReader.Open(runData.Directory);
+     try
+     {
+         IndexSearcher searcher = NewSearcher(reader);
+         TopDocs td = searcher.Search(new TermQuery(new Term("key", "value")), 10);
+         assertEquals(numExpectedResults, td.TotalHits);
+     }
+     finally
+     {
+         // Dispose the reader even when the search or the assertion throws.
+         reader.Dispose();
+     }
+ }
+
+ /// <summary>
+ /// Builds one document through a <c>DocMaker</c> fed by <c>OneDocSource</c>,
+ /// optionally setting <c>doc.tokenized.norms</c> and
+ /// <c>doc.body.tokenized.norms</c> in the configuration first.
+ /// </summary>
+ private Document createTestNormsDocument(bool setNormsProp,
+     bool normsPropVal, bool setBodyNormsProp, bool bodyNormsVal)
+ {
+     // Indexing configuration.
+     Dictionary<string, string> settings = new Dictionary<string, string>();
+     settings["analyzer"] = typeof(WhitespaceAnalyzer).AssemblyQualifiedName;
+     settings["directory"] = "RAMDirectory";
+     if (setNormsProp)
+     {
+         settings["doc.tokenized.norms"] = normsPropVal.ToString(CultureInfo.InvariantCulture);
+     }
+     if (setBodyNormsProp)
+     {
+         settings["doc.body.tokenized.norms"] = bodyNormsVal.ToString(CultureInfo.InvariantCulture);
+     }
+
+     Config config = new Config(settings);
+     DocMaker maker = new DocMaker();
+     maker.SetConfig(config, new OneDocSource());
+     return maker.MakeDocument();
+ }
+
+ /// <summary>
+ /// Tests doc.index.props property: the property term is only expected to
+ /// match when the setting is explicitly true.
+ /// </summary>
+ [Test]
+ public void TestIndexProperties()
+ {
+ // default is to not index properties.
+ doTestIndexProperties(false, false, 0);
+
+ // set doc.index.props to false.
+ doTestIndexProperties(true, false, 0);
+
+ // set doc.index.props to true.
+ doTestIndexProperties(true, true, 1);
+ }
+
+ /// <summary>
+ /// Tests doc.tokenized.norms and doc.body.tokenized.norms properties by
+ /// checking <c>OmitNorms</c> on the title and body fields for each combination.
+ /// </summary>
+ [Test]
+ public void TestNorms()
+ {
+
+ Document doc;
+
+ // Don't set anything, use the defaults
+ doc = createTestNormsDocument(false, false, false, false);
+ assertTrue(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
+ assertFalse(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);
+
+ // Set norms to false
+ doc = createTestNormsDocument(true, false, false, false);
+ assertTrue(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
+ assertFalse(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);
+
+ // Set norms to true
+ doc = createTestNormsDocument(true, true, false, false);
+ assertFalse(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
+ assertFalse(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);
+
+ // Set body norms to false
+ doc = createTestNormsDocument(false, false, true, false);
+ assertTrue(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
+ assertTrue(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);
+
+ // Set body norms to true
+ doc = createTestNormsDocument(false, false, true, true);
+ assertTrue(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
+ assertFalse(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);
+ }
+
+ /// <summary>
+ /// Calls <c>ResetInputs</c> twice on a <c>DocMaker</c> backed by a line file
+ /// and then disposes it.
+ /// </summary>
+ [Test]
+ public void TestDocMakerLeak()
+ {
+     // DocMaker did not close its ContentSource if resetInputs was called twice,
+     // leading to a file handle leak.
+     FileInfo f = new FileInfo(Path.Combine(getWorkDir().FullName, "docMakerLeak.txt"));
+     using (TextWriter ps = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8))
+     {
+         ps.WriteLine("one title\t" + Time.CurrentTimeMilliseconds() + "\tsome content");
+     }
+
+     Dictionary<string, string> props = new Dictionary<string, string>();
+     props["docs.file"] = f.FullName;
+     props["content.source.forever"] = "false";
+     Config config = new Config(props);
+
+     ContentSource source = new LineDocSource();
+     source.SetConfig(config);
+
+     DocMaker dm = new DocMaker();
+     dm.SetConfig(config, source);
+     try
+     {
+         dm.ResetInputs();
+         dm.ResetInputs();
+     }
+     finally
+     {
+         // Dispose even when a reset throws, so the test itself does not leak.
+         dm.Dispose();
+     }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/EnwikiContentSourceTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/EnwikiContentSourceTest.cs b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/EnwikiContentSourceTest.cs
new file mode 100644
index 0000000..95ded38
--- /dev/null
+++ b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/EnwikiContentSourceTest.cs
@@ -0,0 +1,194 @@
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ [Ignore("LUCENENET TODO: Never finishes")]
+ public class EnwikiContentSourceTest : LuceneTestCase
+ {
+ /// <summary>An EnwikiContentSource which works on a String and not files.</summary>
+ private class StringableEnwikiSource : EnwikiContentSource
+ {
+     private readonly String xml;
+
+     public StringableEnwikiSource(String docs)
+     {
+         this.xml = docs;
+     }
+
+     // Serves the UTF-8 bytes of the stored string from memory.
+     protected override Stream OpenInputStream()
+     {
+         byte[] utf8 = Encoding.UTF8.GetBytes(xml);
+         return new MemoryStream(utf8);
+     }
+ }
+
+ /// <summary>
+ /// Asserts that <paramref name="dd"/> is not null and that its name, title,
+ /// body and date equal the expected values.
+ /// </summary>
+ private void assertDocData(DocData dd, String expName, String expTitle, String expBody, String expDate)
+ {
+ assertNotNull(dd);
+ assertEquals(expName, dd.Name);
+ assertEquals(expTitle, dd.Title);
+ assertEquals(expBody, dd.Body);
+ assertEquals(expDate, dd.Date);
+ }
+
+ /// <summary>
+ /// Requests one more document from <paramref name="stdm"/> and fails unless
+ /// the call throws <c>NoMoreDataException</c>.
+ /// </summary>
+ private void assertNoMoreDataException(EnwikiContentSource stdm)
+ {
+     try
+     {
+         stdm.GetNextDocData(null);
+         fail("Expecting NoMoreDataException");
+     }
+     catch (NoMoreDataException)
+     {
+         // expected
+     }
+ }
+
+ private readonly String PAGE1 =
+ " <page>\r\n" +
+ " <title>Title1</title>\r\n" +
+ " <ns>0</ns>\r\n" +
+ " <id>1</id>\r\n" +
+ " <revision>\r\n" +
+ " <id>11</id>\r\n" +
+ " <parentid>111</parentid>\r\n" +
+ " <timestamp>2011-09-14T11:35:09Z</timestamp>\r\n" +
+ " <contributor>\r\n" +
+ " <username>Mister1111</username>\r\n" +
+ " <id>1111</id>\r\n" +
+ " </contributor>\r\n" +
+ " <minor />\r\n" +
+ " <comment>/* Never mind */</comment>\r\n" +
+ " <text>Some text 1 here</text>\r\n" +
+ " </revision>\r\n" +
+ " </page>\r\n";
+
+ private readonly String PAGE2 =
+ " <page>\r\n" +
+ " <title>Title2</title>\r\n" +
+ " <ns>0</ns>\r\n" +
+ " <id>2</id>\r\n" +
+ " <revision>\r\n" +
+ " <id>22</id>\r\n" +
+ " <parentid>222</parentid>\r\n" +
+ " <timestamp>2022-09-14T22:35:09Z</timestamp>\r\n" +
+ " <contributor>\r\n" +
+ " <username>Mister2222</username>\r\n" +
+ " <id>2222</id>\r\n" +
+ " </contributor>\r\n" +
+ " <minor />\r\n" +
+ " <comment>/* Never mind */</comment>\r\n" +
+ " <text>Some text 2 here</text>\r\n" +
+ " </revision>\r\n" +
+ " </page>\r\n";
+
+ /// <summary>A dump with a single page yields that page and then no more data.</summary>
+ [Test]
+ public void TestOneDocument()
+ {
+     String dump = "<mediawiki>\r\n" + PAGE1 + "</mediawiki>";
+     EnwikiContentSource source = createContentSource(dump, false);
+
+     DocData first = source.GetNextDocData(new DocData());
+     assertDocData(first, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");
+
+     assertNoMoreDataException(source);
+ }
+
+ /// <summary>
+ /// Creates a string-backed content source configured with the given
+ /// <c>content.source.forever</c> value and resets its inputs through a doc maker.
+ /// </summary>
+ private EnwikiContentSource createContentSource(String docs, bool forever)
+ {
+     Dictionary<string, string> settings = new Dictionary<string, string>
+     {
+         { "print.props", "false" },
+         { "content.source.forever", forever.ToString(CultureInfo.InvariantCulture) }
+     };
+     Config config = new Config(settings);
+
+     EnwikiContentSource source = new StringableEnwikiSource(docs);
+     source.SetConfig(config);
+
+     // doc-maker just for initiating content source inputs
+     DocMaker initializer = new DocMaker();
+     initializer.SetConfig(config, source);
+     initializer.ResetInputs();
+     return source;
+ }
+
+ /// <summary>A dump with two pages yields both pages in order and then no more data.</summary>
+ [Test]
+ public void TestTwoDocuments()
+ {
+     String dump = "<mediawiki>\r\n" + PAGE1 + PAGE2 + "</mediawiki>";
+     EnwikiContentSource source = createContentSource(dump, false);
+
+     DocData first = source.GetNextDocData(new DocData());
+     assertDocData(first, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");
+
+     DocData second = source.GetNextDocData(new DocData());
+     assertDocData(second, "2", "Title2", "Some text 2 here", "14-SEP-2022 22:35:09.000");
+
+     assertNoMoreDataException(source);
+ }
+
+ /// <summary>
+ /// With <c>content.source.forever</c> enabled, the same two pages are read
+ /// three times in a row before the source is disposed.
+ /// </summary>
+ [Test]
+ public void TestForever()
+ {
+     String dump = "<mediawiki>\r\n" + PAGE1 + PAGE2 + "</mediawiki>";
+     EnwikiContentSource source = createContentSource(dump, true);
+
+     // same documents several times
+     int pass = 0;
+     while (pass < 3)
+     {
+         DocData first = source.GetNextDocData(new DocData());
+         assertDocData(first, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");
+
+         DocData second = source.GetNextDocData(new DocData());
+         assertDocData(second, "2", "Title2", "Some text 2 here", "14-SEP-2022 22:35:09.000");
+         // Don't test that NoMoreDataException is thrown, since the forever flag is turned on.
+         pass++;
+     }
+
+     source.Dispose();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs
new file mode 100644
index 0000000..7cd27f1
--- /dev/null
+++ b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs
@@ -0,0 +1,271 @@
+using ICSharpCode.SharpZipLib.BZip2;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Benchmarks.ByTask.Tasks;
+using Lucene.Net.Benchmarks.ByTask.Utils;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    /// <summary>
+    /// Tests the functionality of <see cref="LineDocSource"/>.
+    /// <para/>
+    /// Each test writes a small line file into the work directory, indexes it
+    /// through a benchmark task sequence and then searches the resulting index.
+    /// </summary>
+    public class LineDocSourceTest : BenchmarkTestCase
+    {
+        // Porting note: the Java test used a CompressorStreamFactory to create the
+        // bzip2 stream; here SharpZipLib's BZip2OutputStream is instantiated directly.
+
+        /// <summary>
+        /// Builds a <see cref="FileInfo"/> for <paramref name="fileName"/> inside the test work directory.
+        /// </summary>
+        private FileInfo newWorkFile(string fileName)
+        {
+            return new FileInfo(Path.Combine(getWorkDir().FullName, fileName));
+        }
+
+        /// <summary>
+        /// Writes a single-document line file, compressed with bzip2, to <paramref name="file"/>.
+        /// </summary>
+        /// <param name="file">File to create (an existing file is overwritten).</param>
+        /// <param name="addHeader">Whether to emit the fields header line first.</param>
+        private void createBZ2LineFile(FileInfo file, bool addHeader)
+        {
+            // Disposing the writer also disposes the streams it wraps, exactly as the
+            // explicit writer.Dispose() did; the using block additionally covers the
+            // case where writing throws.
+            using (TextWriter writer = new StreamWriter(
+                new BZip2OutputStream(new FileStream(file.FullName, FileMode.Create, FileAccess.Write)),
+                Encoding.UTF8))
+            {
+                writeDocsToFile(writer, addHeader, null);
+            }
+        }
+
+        /// <summary>
+        /// Writes an optional header line followed by one document line to <paramref name="writer"/>.
+        /// <para/>
+        /// The document line consists of the literal values <c>title</c> and <c>date</c>
+        /// followed by <see cref="DocMaker.BODY_FIELD"/> as the body text, so the body
+        /// field's value equals its own name.
+        /// </summary>
+        /// <param name="writer">Destination writer; it is not disposed by this method.</param>
+        /// <param name="addHeader">Whether to emit the fields header line first.</param>
+        /// <param name="otherFields">
+        /// Optional extra fields: keys are appended to the header, values to the
+        /// document line. May be <c>null</c>.
+        /// </param>
+        private void writeDocsToFile(TextWriter writer, bool addHeader, IDictionary<string, string> otherFields)
+        {
+            if (addHeader)
+            {
+                writer.Write(WriteLineDocTask.FIELDS_HEADER_INDICATOR);
+                writer.Write(WriteLineDocTask.SEP);
+                writer.Write(DocMaker.TITLE_FIELD);
+                writer.Write(WriteLineDocTask.SEP);
+                writer.Write(DocMaker.DATE_FIELD);
+                writer.Write(WriteLineDocTask.SEP);
+                writer.Write(DocMaker.BODY_FIELD);
+                if (otherFields != null)
+                {
+                    // additional field names in the header
+                    foreach (string fieldName in otherFields.Keys)
+                    {
+                        writer.Write(WriteLineDocTask.SEP);
+                        writer.Write(fieldName);
+                    }
+                }
+                writer.WriteLine();
+            }
+
+            StringBuilder doc = new StringBuilder();
+            doc.Append("title").Append(WriteLineDocTask.SEP)
+               .Append("date").Append(WriteLineDocTask.SEP)
+               .Append(DocMaker.BODY_FIELD);
+            if (otherFields != null)
+            {
+                // additional field values in the doc line
+                foreach (string fieldValue in otherFields.Values)
+                {
+                    doc.Append(WriteLineDocTask.SEP).Append(fieldValue);
+                }
+            }
+            writer.Write(doc.ToString());
+            writer.WriteLine();
+        }
+
+        /// <summary>
+        /// Writes a single-document, uncompressed line file to <paramref name="file"/>.
+        /// </summary>
+        /// <param name="file">File to create (an existing file is overwritten).</param>
+        /// <param name="addHeader">Whether to emit the fields header line first.</param>
+        private void createRegularLineFile(FileInfo file, bool addHeader)
+        {
+            using (TextWriter writer = new StreamWriter(
+                new FileStream(file.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8))
+            {
+                writeDocsToFile(writer, addHeader, null);
+            }
+        }
+
+        /// <summary>
+        /// Writes an uncompressed line file, always with a header, that carries the
+        /// given extra fields. Each extra field's value is set to the field's own name.
+        /// </summary>
+        /// <param name="file">File to create (an existing file is overwritten).</param>
+        /// <param name="extraFields">Names of the additional fields.</param>
+        private void createRegularLineFileWithMoreFields(FileInfo file, params String[] extraFields)
+        {
+            Dictionary<string, string> fields = new Dictionary<string, string>();
+            foreach (string field in extraFields)
+            {
+                fields[field] = field;
+            }
+
+            using (TextWriter writer = new StreamWriter(
+                new FileStream(file.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8))
+            {
+                writeDocsToFile(writer, true, fields);
+            }
+        }
+
+        /// <summary>
+        /// Runs the index-and-search check with 1, 2 and 4 document additions.
+        /// </summary>
+        /// <param name="file">Line file to index.</param>
+        /// <param name="lineParserClass">Line parser type to configure, or <c>null</c> to leave it unset.</param>
+        /// <param name="storedField">Stored field to verify, or <c>null</c> for the body field.</param>
+        private void doIndexAndSearchTest(FileInfo file, Type lineParserClass, String storedField)
+        {
+            doIndexAndSearchTestWithRepeats(file, lineParserClass, 1, storedField); // no extra repetitions
+            doIndexAndSearchTestWithRepeats(file, lineParserClass, 2, storedField); // 1 extra repetition
+            doIndexAndSearchTestWithRepeats(file, lineParserClass, 4, storedField); // 3 extra repetitions
+        }
+
+        /// <summary>
+        /// Indexes <paramref name="file"/> with <paramref name="numAdds"/> add-document
+        /// tasks, then asserts that a term query for <c>body:body</c> hits exactly
+        /// <paramref name="numAdds"/> documents and that document 0 stores
+        /// <paramref name="storedField"/> with a value equal to the field name.
+        /// </summary>
+        /// <param name="file">Line file to index.</param>
+        /// <param name="lineParserClass">Line parser type to configure, or <c>null</c> to leave it unset.</param>
+        /// <param name="numAdds">Number of add-document tasks to run.</param>
+        /// <param name="storedField">Stored field to verify, or <c>null</c> for the body field.</param>
+        private void doIndexAndSearchTestWithRepeats(FileInfo file,
+            Type lineParserClass, int numAdds, String storedField)
+        {
+            IndexReader reader = null;
+            PerfRunData runData = null;
+            try
+            {
+                Dictionary<string, string> props = new Dictionary<string, string>();
+
+                // LineDocSource specific settings.
+                props["docs.file"] = file.FullName;
+                if (lineParserClass != null)
+                {
+                    props["line.parser"] = lineParserClass.AssemblyQualifiedName;
+                }
+
+                // Indexing configuration.
+                props["analyzer"] = typeof(WhitespaceAnalyzer).AssemblyQualifiedName;
+                props["content.source"] = typeof(LineDocSource).AssemblyQualifiedName;
+                props["directory"] = "RAMDirectory";
+                props["doc.stored"] = "true";
+                props["doc.index.props"] = "true";
+
+                // Create PerfRunData
+                Config config = new Config(props);
+                runData = new PerfRunData(config);
+
+                TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
+                tasks.AddTask(new CreateIndexTask(runData));
+                for (int i = 0; i < numAdds; i++)
+                {
+                    tasks.AddTask(new AddDocTask(runData));
+                }
+                tasks.AddTask(new CloseIndexTask(runData));
+                try
+                {
+                    tasks.DoLogic();
+                }
+                finally
+                {
+                    tasks.Dispose();
+                }
+
+                reader = DirectoryReader.Open(runData.Directory);
+                IndexSearcher searcher = NewSearcher(reader);
+                TopDocs td = searcher.Search(new TermQuery(new Term("body", "body")), 10);
+                assertEquals(numAdds, td.TotalHits);
+                assertNotNull(td.ScoreDocs[0]);
+
+                if (storedField == null)
+                {
+                    storedField = DocMaker.BODY_FIELD; // added to all docs and satisfies field-name == value
+                }
+                assertEquals("Wrong field value", storedField, searcher.Doc(0).Get(storedField));
+            }
+            finally
+            {
+                IOUtils.Dispose(reader, runData);
+            }
+        }
+
+        /// <summary>
+        /// Tests <see cref="LineDocSource"/> with a bzip2 input stream.
+        /// </summary>
+        [Test]
+        public void TestBZip2()
+        {
+            FileInfo file = newWorkFile("one-line.bz2");
+            createBZ2LineFile(file, true);
+            doIndexAndSearchTest(file, null, null);
+        }
+
+        /// <summary>
+        /// Tests a bzip2 line file that has no header line.
+        /// </summary>
+        [Test]
+        public void TestBZip2NoHeaderLine()
+        {
+            FileInfo file = newWorkFile("one-line.bz2");
+            createBZ2LineFile(file, false);
+            doIndexAndSearchTest(file, null, null);
+        }
+
+        /// <summary>
+        /// Tests an uncompressed line file with a header line.
+        /// </summary>
+        [Test]
+        public void TestRegularFile()
+        {
+            FileInfo file = newWorkFile("one-line");
+            createRegularLineFile(file, true);
+            doIndexAndSearchTest(file, null, null);
+        }
+
+        /// <summary>
+        /// Tests an uncompressed line file with a header line, explicitly
+        /// configuring <see cref="HeaderLineParser"/> as the line parser.
+        /// </summary>
+        [Test]
+        public void TestRegularFileSpecialHeader()
+        {
+            FileInfo file = newWorkFile("one-line");
+            createRegularLineFile(file, true);
+            doIndexAndSearchTest(file, typeof(HeaderLineParser), null);
+        }
+
+        /// <summary>
+        /// Tests an uncompressed line file that has no header line.
+        /// </summary>
+        [Test]
+        public void TestRegularFileNoHeaderLine()
+        {
+            FileInfo file = newWorkFile("one-line");
+            createRegularLineFile(file, false);
+            doIndexAndSearchTest(file, null, null);
+        }
+
+        /// <summary>
+        /// Verifies that lines with too few fields make indexing throw.
+        /// </summary>
+        [Test]
+        public void TestInvalidFormat()
+        {
+            String[] testCases = new String[] {
+                "", // empty line
+                "title", // just title
+                "title" + WriteLineDocTask.SEP, // title + SEP
+                "title" + WriteLineDocTask.SEP + "body", // title + SEP + body
+                // note that title + SEP + body + SEP is a valid line, which results in an
+                // empty body
+            };
+
+            for (int i = 0; i < testCases.Length; i++)
+            {
+                FileInfo file = newWorkFile("one-line");
+                using (TextWriter writer = new StreamWriter(
+                    new FileStream(file.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8))
+                {
+                    writer.Write(testCases[i]);
+                    writer.WriteLine();
+                }
+
+                bool threw = false;
+                try
+                {
+                    doIndexAndSearchTest(file, null, null);
+                }
+                catch (AssertionException)
+                {
+                    // In the Java original an assertion failure is an Error, not an
+                    // Exception, so it was never treated as the "expected" outcome.
+                    // NUnit's AssertionException derives from Exception, so it has to
+                    // be let through explicitly.
+                    throw;
+                }
+                catch (Exception)
+                {
+                    // expected.
+                    threw = true;
+                }
+
+                // The failure is raised outside the try block so the catch clauses
+                // above cannot swallow it.
+                if (!threw)
+                {
+                    fail("Some exception should have been thrown for: [" + testCases[i] + "]");
+                }
+            }
+        }
+
+        /// <summary>
+        /// Doc Name is not part of the default header.
+        /// </summary>
+        [Test]
+        public void TestWithDocsName()
+        {
+            FileInfo file = newWorkFile("one-line");
+            createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD);
+            doIndexAndSearchTest(file, null, DocMaker.NAME_FIELD);
+        }
+
+        /// <summary>
+        /// Use field names that are not defined in DocMaker and so will go to Properties.
+        /// </summary>
+        [Test]
+        public void TestWithProperties()
+        {
+            FileInfo file = newWorkFile("one-line");
+            String specialField = "mySpecialField";
+            createRegularLineFileWithMoreFields(file, specialField);
+            doIndexAndSearchTest(file, null, specialField);
+        }
+    }
+}