You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ar...@apache.org on 2006/12/27 16:05:37 UTC
svn commit: r490508 [1/3] - in
/incubator/lucene.net/trunk/C#/contrib/Highlighter.Net: ./ Highlighter.Net/
Test/
Author: aroush
Date: Wed Dec 27 07:05:35 2006
New Revision: 490508
URL: http://svn.apache.org/viewvc?view=rev&rev=490508
Log:
Highlighter.Net 2.0.0 project. Port from Java to C#
Added:
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/ABOUT.txt
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/HISTORY.txt
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/AssemblyInfo.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/DefaultEncoder.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Encoder.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Formatter.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Fragmenter.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/GradientFormatter.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net-2.0.0.csproj
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.sln
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.xml
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/NullFragmenter.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Package.html
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/QueryScorer.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/QueryTermExtractor.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Scorer.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/SimpleFragmenter.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/SimpleHTMLEncoder.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/SimpleHTMLFormatter.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/SpanGradientFormatter.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/TextFragment.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/TokenGroup.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/TokenSources.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/WeightedTerm.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/LICENSE.txt
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Test/
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Test/AssemblyInfo.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Test/HighlighterTest.cs
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Test/Test-2.0.0.csproj
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Test/Test.nunit
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Test/Test.sln
incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Test/Test.xml
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/ABOUT.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/ABOUT.txt?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/ABOUT.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/ABOUT.txt Wed Dec 27 07:05:35 2006
@@ -0,0 +1 @@
+Highlighter.Net is a port of the Java Highlighter to C#. The ports of versions 1.4.0 and 2.0 from Java to C# were done primarily by George Aroush. To contact George Aroush, please visit http://www.aroush.net/
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/HISTORY.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/HISTORY.txt?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/HISTORY.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/HISTORY.txt Wed Dec 27 07:05:35 2006
@@ -0,0 +1,11 @@
+Highlighter.Net History
+-----------------------
+
+
+27Dec06:
+ - Release: Highlighter.Net.2.0.0 build 000 "Alpha"
+ - Issues: Not fully tested or validated; many of the NUnit tests are failing.
+
+
+05Jan05:
+ - Release: Highlighter.Net.1.4.0 RC1 build 001
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/AssemblyInfo.cs?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/AssemblyInfo.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/AssemblyInfo.cs Wed Dec 27 07:05:35 2006
@@ -0,0 +1,58 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+
+//
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+//
+// All descriptive attributes below are currently left as empty strings.
+//
+[assembly: AssemblyTitle("")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("")]
+[assembly: AssemblyCopyright("")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+//
+// Version information for an assembly consists of the following four values:
+//
+// Major Version
+// Minor Version
+// Build Number
+// Revision
+//
+// You can specify all the values, or you can default the Revision and Build Numbers
+// by using '*' (for example "2.0.*"). This assembly specifies all four values explicitly:
+
+[assembly: AssemblyVersion("2.0.0.000")]
+
+//
+// In order to sign your assembly you must specify a key to use. Refer to the
+// Microsoft .NET Framework documentation for more information on assembly signing.
+//
+// Use the attributes below to control which key is used for signing.
+//
+// Notes:
+// (*) If no key is specified, the assembly is not signed.
+// (*) KeyName refers to a key that has been installed in the Crypto Service
+// Provider (CSP) on your machine. KeyFile refers to a file which contains
+// a key.
+// (*) If the KeyFile and the KeyName values are both specified, the
+// following processing occurs:
+// (1) If the KeyName can be found in the CSP, that key is used.
+// (2) If the KeyName does not exist and the KeyFile does exist, the key
+// in the KeyFile is installed into the CSP and used.
+// (*) In order to create a KeyFile, you can use the sn.exe (Strong Name) utility.
+// When specifying the KeyFile, the location of the KeyFile should be
+// relative to the project output directory which is
+// %Project Directory%\obj\<configuration>. For example, if your KeyFile is
+// located in the project directory, you would specify the AssemblyKeyFile
+// attribute as [assembly: AssemblyKeyFile("..\\..\\mykey.snk")]
+// (*) Delay Signing is an advanced option - see the Microsoft .NET Framework
+// documentation for more information on this.
+//
+// No key file or key name is given below, so per the notes above the assembly is not signed.
+//
+[assembly: AssemblyDelaySign(false)]
+[assembly: AssemblyKeyFile("")]
+[assembly: AssemblyKeyName("")]
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/DefaultEncoder.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/DefaultEncoder.cs?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/DefaultEncoder.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/DefaultEncoder.cs Wed Dec 27 07:05:35 2006
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Highlight
+{
+ /// <summary> Pass-through <see cref="Encoder"/> implementation: the text handed to it
+ /// is returned without any modification.
+ /// </summary>
+ /// <author> Nicko Cadell
+ /// </author>
+ public class DefaultEncoder : Encoder
+ {
+     /// <summary> Creates the encoder. There is no state to initialise.</summary>
+     public DefaultEncoder()
+     {
+     }
+
+     /// <summary> Returns <paramref name="originalText"/> exactly as received.</summary>
+     /// <param name="originalText">The section of text being output
+     /// </param>
+     /// <returns> The same string that was passed in.</returns>
+     public virtual string EncodeText(string originalText)
+     {
+         return originalText;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Encoder.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/Encoder.cs?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Encoder.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Encoder.cs Wed Dec 27 07:05:35 2006
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Highlight
+{
+ /// <summary> Contract for encoding original text before it is written to the output.
+ /// An Encoder works together with a <see cref="Formatter"/> to generate the output.
+ /// </summary>
+ /// <author> Nicko Cadell
+ /// </author>
+ public interface Encoder
+ {
+     /// <summary> Encodes one section of the original text.</summary>
+     /// <param name="originalText">The section of text being output
+     /// </param>
+     /// <returns> The encoded form of <paramref name="originalText"/>.</returns>
+     string EncodeText(string originalText);
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Formatter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/Formatter.cs?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Formatter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Formatter.cs Wed Dec 27 07:05:35 2006
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Highlight
+{
+ /// <summary> Contract for processing terms found in the original text, typically by
+ /// wrapping them in some form of mark-up so that they stand out in HTML search
+ /// result pages.
+ /// </summary>
+ public interface Formatter
+ {
+     /// <summary> Produces the output for one section of the original text.</summary>
+     /// <param name="originalText">The section of text being considered for markup
+     /// </param>
+     /// <param name="tokenGroup">Contains one or several overlapping Tokens along with
+     /// their scores and positions.
+     /// </param>
+     /// <returns> The text to emit in place of <paramref name="originalText"/>.</returns>
+     string HighlightTerm(string originalText, TokenGroup tokenGroup);
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Fragmenter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/Fragmenter.cs?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Fragmenter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Fragmenter.cs Wed Dec 27 07:05:35 2006
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Token = Lucene.Net.Analysis.Token;
+
+namespace Lucene.Net.Highlight
+{
+
+ /// <summary> Implements the policy for breaking text into multiple fragments for consideration
+ /// by the <see cref="Highlighter"/> class. A sophisticated implementation may do this on the basis
+ /// of detecting end of sentences in the text.
+ /// </summary>
+ /// <author> mark@searcharea.co.uk
+ /// </author>
+ public interface Fragmenter
+ {
+     /// <summary> Initializes the Fragmenter</summary>
+     /// <param name="originalText">The original text
+     /// </param>
+     void Start(System.String originalText);
+
+     /// <summary> Test to see if this token from the stream should be held in a new TextFragment</summary>
+     /// <param name="nextToken">The token from the stream to test
+     /// </param>
+     /// <returns> true if the token should be held in a new TextFragment</returns>
+     bool IsNewFragment(Token nextToken);
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/GradientFormatter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/GradientFormatter.cs?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/GradientFormatter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/GradientFormatter.cs Wed Dec 27 07:05:35 2006
@@ -0,0 +1,221 @@
+/*
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Highlight
+{
+ /// <summary> Formats text with different color intensity depending on the score of the
+ /// term. The score is mapped linearly onto a foreground and/or background color
+ /// range and the text is wrapped in an HTML &lt;font&gt; element.
+ /// </summary>
+ /// <author> maharwood
+ /// </author>
+ public class GradientFormatter : Formatter
+ {
+     // Score (and above) that maps to the top of the color range.
+     private float maxScore;
+
+     // Red/green/blue components parsed from the two-digit hex pairs of the
+     // min/max foreground colors. Only set when highlightForeground is true.
+     internal int fgRMin, fgGMin, fgBMin;
+
+     internal int fgRMax, fgGMax, fgBMax;
+
+     // True when both foreground colors were supplied (non-null).
+     protected internal bool highlightForeground;
+
+     // Same as above, for the background colors.
+     internal int bgRMin, bgGMin, bgBMin;
+
+     internal int bgRMax, bgGMax, bgBMax;
+
+     // True when both background colors were supplied (non-null).
+     protected internal bool highlightBackground;
+
+     /// <summary> Sets the color range for the IDF scores
+     /// </summary>
+     /// <param name="maxScore">
+     /// The score (and above) displayed as maxColor (See QueryScorer.getMaxWeight
+     /// which can be used to calibrate scoring scale)
+     /// </param>
+     /// <param name="minForegroundColor">
+     /// The hex color used for representing IDF scores of zero eg
+     /// #FFFFFF (white) or null if no foreground color required
+     /// </param>
+     /// <param name="maxForegroundColor">
+     /// The largest hex color used for representing IDF scores eg
+     /// #000000 (black) or null if no foreground color required
+     /// </param>
+     /// <param name="minBackgroundColor">
+     /// The hex color used for representing IDF scores of zero eg
+     /// #FFFFFF (white) or null if no background color required
+     /// </param>
+     /// <param name="maxBackgroundColor">
+     /// The largest hex color used for representing IDF scores eg
+     /// #000000 (black) or null if no background color required
+     /// </param>
+     /// <exception cref="System.ArgumentException">
+     /// if a supplied color pair contains a value that is not exactly 7 characters long
+     /// </exception>
+     /// <exception cref="System.FormatException">
+     /// if a supplied color contains a character that is not a hex digit
+     /// </exception>
+     public GradientFormatter(float maxScore, System.String minForegroundColor, System.String maxForegroundColor, System.String minBackgroundColor, System.String maxBackgroundColor)
+     {
+         // A color pair is only used when BOTH of its ends are supplied.
+         highlightForeground = (minForegroundColor != null) && (maxForegroundColor != null);
+         if (highlightForeground)
+         {
+             // Both lengths are validated before either value is parsed.
+             CheckColorLength(minForegroundColor, "minForegroundColor");
+             CheckColorLength(maxForegroundColor, "maxForegroundColor");
+             ParseRgb(minForegroundColor, out fgRMin, out fgGMin, out fgBMin);
+             ParseRgb(maxForegroundColor, out fgRMax, out fgGMax, out fgBMax);
+         }
+
+         highlightBackground = (minBackgroundColor != null) && (maxBackgroundColor != null);
+         if (highlightBackground)
+         {
+             CheckColorLength(minBackgroundColor, "minBackgroundColor");
+             CheckColorLength(maxBackgroundColor, "maxBackgroundColor");
+             ParseRgb(minBackgroundColor, out bgRMin, out bgGMin, out bgBMin);
+             ParseRgb(maxBackgroundColor, out bgRMax, out bgGMax, out bgBMax);
+         }
+
+         this.maxScore = maxScore;
+     }
+
+     // Throws ArgumentException, naming the offending parameter, unless the color is 7 characters long.
+     private static void CheckColorLength(System.String color, System.String parameterName)
+     {
+         if (color.Length != 7)
+         {
+             throw new System.ArgumentException(parameterName + " is not 7 bytes long eg a hex RGB value such as #FFFFFF");
+         }
+     }
+
+     // Splits a 7-character color into its three two-digit hex components (character 0 is skipped).
+     private static void ParseRgb(System.String color, out int red, out int green, out int blue)
+     {
+         red = HexToInt(color.Substring(1, 2));
+         green = HexToInt(color.Substring(3, 2));
+         blue = HexToInt(color.Substring(5, 2));
+     }
+
+     /// <summary> Wraps the text in a &lt;font&gt; element whose color and/or bgcolor
+     /// attribute reflects the total score of the token group.
+     /// </summary>
+     /// <param name="originalText">The section of text being considered for markup
+     /// </param>
+     /// <param name="tokenGroup">The token group whose total score selects the color
+     /// </param>
+     /// <returns> originalText unchanged when the total score is zero, otherwise the
+     /// text wrapped in a &lt;font&gt; element
+     /// </returns>
+     public virtual System.String HighlightTerm(System.String originalText, TokenGroup tokenGroup)
+     {
+         float score = tokenGroup.GetTotalScore();
+         if (score == 0)
+         {
+             // Nothing matched in this group: emit the text without mark-up.
+             return originalText;
+         }
+         System.Text.StringBuilder sb = new System.Text.StringBuilder();
+         sb.Append("<font ");
+         if (highlightForeground)
+         {
+             sb.Append("color=\"");
+             sb.Append(GetForegroundColorString(score));
+             sb.Append("\" ");
+         }
+         if (highlightBackground)
+         {
+             sb.Append("bgcolor=\"");
+             sb.Append(GetBackgroundColorString(score));
+             sb.Append("\" ");
+         }
+         sb.Append(">");
+         sb.Append(originalText);
+         sb.Append("</font>");
+         return sb.ToString();
+     }
+
+     /// <summary> Computes the foreground color for a score.</summary>
+     /// <param name="score">The score to map onto the foreground color range
+     /// </param>
+     /// <returns> The color as '#' followed by six upper-case hex digits</returns>
+     protected internal virtual System.String GetForegroundColorString(float score)
+     {
+         return BuildColorString(GetColorVal(fgRMin, fgRMax, score), GetColorVal(fgGMin, fgGMax, score), GetColorVal(fgBMin, fgBMax, score));
+     }
+
+     /// <summary> Computes the background color for a score.</summary>
+     /// <param name="score">The score to map onto the background color range
+     /// </param>
+     /// <returns> The color as '#' followed by six upper-case hex digits</returns>
+     protected internal virtual System.String GetBackgroundColorString(float score)
+     {
+         return BuildColorString(GetColorVal(bgRMin, bgRMax, score), GetColorVal(bgGMin, bgGMax, score), GetColorVal(bgBMin, bgBMax, score));
+     }
+
+     // Formats three color components as "#RRGGBB".
+     private static System.String BuildColorString(int rVal, int gVal, int bVal)
+     {
+         System.Text.StringBuilder sb = new System.Text.StringBuilder();
+         sb.Append("#");
+         sb.Append(IntToHex(rVal));
+         sb.Append(IntToHex(gVal));
+         sb.Append(IntToHex(bVal));
+         return sb.ToString();
+     }
+
+     // Interpolates one color component. The score is capped at maxScore and scaled
+     // onto the distance between the two bounds.
+     // NOTE(review): the result is always offset from the SMALLER of the two bounds, so
+     // when colorMin > colorMax a zero score yields colorMax rather than colorMin.
+     // Left unchanged here; confirm whether that is the intended direction.
+     private int GetColorVal(int colorMin, int colorMax, float score)
+     {
+         if (colorMin == colorMax)
+         {
+             return colorMin;
+         }
+         float scale = System.Math.Abs(colorMin - colorMax);
+         float relScorePercent = System.Math.Min(maxScore, score) / maxScore;
+         float colScore = scale * relScorePercent;
+         return System.Math.Min(colorMin, colorMax) + (int) colScore;
+     }
+
+     private static readonly char[] hexDigits = new char[]{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
+
+     // Formats the low 8 bits of i as two upper-case hex digits.
+     private static System.String IntToHex(int i)
+     {
+         return "" + hexDigits[(i & 0xF0) >> 4] + hexDigits[i & 0x0F];
+     }
+
+     /// <summary> Converts a hex string into an int. Digits are accumulated four bits at a
+     /// time with no sign handling, so an 8 digit input is read as twos complement
+     /// (eg "FFFFFFFF" yields -1). Because the result is a 32 bit int, only the last
+     /// 8 digits of a longer input contribute to the result. An empty string yields 0.
+     /// </summary>
+     /// <param name="hex">
+     /// A string in capital or lower case hex, of no more than 16
+     /// characters.
+     /// </param>
+     /// <returns> The value of the hex digits as an int</returns>
+     /// <exception cref="System.ArgumentNullException"> if hex is null </exception>
+     /// <exception cref="System.FormatException">
+     /// if the string is more than 16 characters long, or if any
+     /// character is not in the set [0-9a-fA-F]
+     /// </exception>
+     public static int HexToInt(System.String hex)
+     {
+         if (hex == null)
+         {
+             throw new System.ArgumentNullException("hex");
+         }
+         int len = hex.Length;
+         if (len > 16)
+         {
+             throw new System.FormatException();
+         }
+
+         int result = 0;
+         for (int i = 0; i < len; i++)
+         {
+             // Char.GetNumericValue cannot be used here: it reports -1 for the
+             // letters A-F, so each hex digit is decoded explicitly.
+             char ch = hex[i];
+             int digit;
+             if (ch >= '0' && ch <= '9')
+             {
+                 digit = ch - '0';
+             }
+             else if (ch >= 'a' && ch <= 'f')
+             {
+                 digit = ch - 'a' + 10;
+             }
+             else if (ch >= 'A' && ch <= 'F')
+             {
+                 digit = ch - 'A' + 10;
+             }
+             else
+             {
+                 throw new System.FormatException();
+             }
+             result = (result << 4) | digit;
+         }
+         return result;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net-2.0.0.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net-2.0.0.csproj?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net-2.0.0.csproj (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net-2.0.0.csproj Wed Dec 27 07:05:35 2006
@@ -0,0 +1,159 @@
+<VisualStudioProject>
+ <CSHARP
+ ProjectType = "Local"
+ ProductVersion = "7.10.3077"
+ SchemaVersion = "2.0"
+ ProjectGuid = "{901D5415-383C-4AA6-A256-879558841BEA}"
+ >
+ <Build>
+ <Settings
+ ApplicationIcon = ""
+ AssemblyKeyContainerName = ""
+ AssemblyName = "Highlighter.Net"
+ AssemblyOriginatorKeyFile = ""
+ DefaultClientScript = "JScript"
+ DefaultHTMLPageLayout = "Grid"
+ DefaultTargetSchema = "IE50"
+ DelaySign = "false"
+ OutputType = "Library"
+ PreBuildEvent = ""
+ PostBuildEvent = ""
+ RootNamespace = "Lucene.Net.Search.Highlight"
+ RunPostBuildEvent = "OnBuildSuccess"
+ StartupObject = ""
+ >
+ <Config
+ Name = "Debug"
+ AllowUnsafeBlocks = "false"
+ BaseAddress = "285212672"
+ CheckForOverflowUnderflow = "false"
+ ConfigurationOverrideFile = ""
+ DefineConstants = "DEBUG;TRACE"
+ DocumentationFile = "Highlighter.Net.xml"
+ DebugSymbols = "true"
+ FileAlignment = "4096"
+ IncrementalBuild = "true"
+ NoStdLib = "false"
+ NoWarn = ""
+ Optimize = "false"
+ OutputPath = "..\bin\Debug\"
+ RegisterForComInterop = "false"
+ RemoveIntegerChecks = "false"
+ TreatWarningsAsErrors = "false"
+ WarningLevel = "4"
+ />
+ <Config
+ Name = "Release"
+ AllowUnsafeBlocks = "false"
+ BaseAddress = "285212672"
+ CheckForOverflowUnderflow = "false"
+ ConfigurationOverrideFile = ""
+ DefineConstants = "TRACE"
+ DocumentationFile = ""
+ DebugSymbols = "false"
+ FileAlignment = "4096"
+ IncrementalBuild = "false"
+ NoStdLib = "false"
+ NoWarn = ""
+ Optimize = "true"
+ OutputPath = "..\bin\Release\"
+ RegisterForComInterop = "false"
+ RemoveIntegerChecks = "false"
+ TreatWarningsAsErrors = "false"
+ WarningLevel = "4"
+ />
+ </Settings>
+ <References>
+ <Reference
+ Name = "System"
+ AssemblyName = "System"
+ HintPath = "..\..\..\..\WINDOWS\Microsoft.NET\Framework\v1.0.3705\System.dll"
+ />
+ <Reference
+ Name = "System.Data"
+ AssemblyName = "System.Data"
+ HintPath = "..\..\..\..\WINDOWS\Microsoft.NET\Framework\v1.0.3705\System.Data.dll"
+ />
+ <Reference
+ Name = "System.XML"
+ AssemblyName = "System.Xml"
+ HintPath = "..\..\..\..\WINDOWS\Microsoft.NET\Framework\v1.0.3705\System.XML.dll"
+ />
+ <Reference
+ Name = "Lucene.Net"
+ AssemblyName = "Lucene.Net"
+ HintPath = "..\Lucene.Net.dll"
+ />
+ </References>
+ </Build>
+ <Files>
+     <!-- Lists every source file added alongside this project, in alphabetical order. -->
+     <Include>
+         <File
+             RelPath = "AssemblyInfo.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "DefaultEncoder.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "Encoder.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "Formatter.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "Fragmenter.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "GradientFormatter.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "Highlighter.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "NullFragmenter.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "Package.html"
+             BuildAction = "Content"
+         />
+         <File
+             RelPath = "QueryScorer.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "QueryTermExtractor.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "Scorer.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "SimpleFragmenter.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "SimpleHTMLEncoder.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "SimpleHTMLFormatter.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "SpanGradientFormatter.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "TextFragment.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "TokenGroup.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "TokenSources.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+         <File
+             RelPath = "WeightedTerm.cs"
+             SubType = "Code"
+             BuildAction = "Compile"
+         />
+     </Include>
+ </Files>
+ </CSHARP>
+</VisualStudioProject>
+
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.sln
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.sln?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.sln (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.sln Wed Dec 27 07:05:35 2006
@@ -0,0 +1,24 @@
+Microsoft Visual Studio Solution File, Format Version 8.00
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Highlighter.Net-2.0.0", "Highlighter.Net-2.0.0.csproj", "{901D5415-383C-4AA6-A256-879558841BEA}"
+ ProjectSection(ProjectDependencies) = postProject
+ EndProjectSection
+EndProject
+Global
+ GlobalSection(DPCodeReviewSolutionGUID) = preSolution
+ DPCodeReviewSolutionGUID = {00000000-0000-0000-0000-000000000000}
+ EndGlobalSection
+ GlobalSection(SolutionConfiguration) = preSolution
+ Debug = Debug
+ Release = Release
+ EndGlobalSection
+ GlobalSection(ProjectConfiguration) = postSolution
+ {901D5415-383C-4AA6-A256-879558841BEA}.Debug.ActiveCfg = Debug|.NET
+ {901D5415-383C-4AA6-A256-879558841BEA}.Debug.Build.0 = Debug|.NET
+ {901D5415-383C-4AA6-A256-879558841BEA}.Release.ActiveCfg = Release|.NET
+ {901D5415-383C-4AA6-A256-879558841BEA}.Release.Build.0 = Release|.NET
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ EndGlobalSection
+ GlobalSection(ExtensibilityAddIns) = postSolution
+ EndGlobalSection
+EndGlobal
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.xml
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.xml?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.xml (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.Net.xml Wed Dec 27 07:05:35 2006
@@ -0,0 +1,580 @@
+<?xml version="1.0"?>
+<doc>
+ <assembly>
+ <name>Highlighter.Net</name>
+ </assembly>
+ <members>
+ <member name="T:Lucene.Net.Highlight.Formatter">
+ <summary> Processes terms found in the original text, typically by applying some form
+ of mark-up to highlight terms in HTML search results pages.
+
+ </summary>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Formatter.HighlightTerm(System.String,Lucene.Net.Highlight.TokenGroup)">
+ <param name="originalText">The section of text being considered for markup
+ </param>
+ <param name="tokenGroup">contains one or several overlapping Tokens along with
+ their scores and positions.
+ </param>
+ </member>
+ <member name="T:Lucene.Net.Highlight.Fragmenter">
+ <summary> Implements the policy for breaking text into multiple fragments for consideration
+ by the {@link Highlighter} class. A sophisticated implementation may do this on the basis
+ of detecting end of sentences in the text.
+ </summary>
+ <author> mark@searcharea.co.uk
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Fragmenter.Start(System.String)">
+ <summary> Initializes the Fragmenter</summary>
+ <param name="originalText">
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Fragmenter.IsNewFragment(Lucene.Net.Analysis.Token)">
+ <summary> Test to see if this token from the stream should be held in a new TextFragment</summary>
+ <param name="nextToken">
+ </param>
+ </member>
+ <member name="T:Lucene.Net.Highlight.Highlighter">
+ <summary> Class used to markup highlighted terms found in the best sections of a
+ text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
+ {@link Encoder} and tokenizers.
+ </summary>
+ <author> mark@searcharea.co.uk
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.GetBestFragment(Lucene.Net.Analysis.Analyzer,System.String,System.String)">
+ <summary> Highlights chosen terms in a text, extracting the most relevant section.
+ This is a convenience method that calls
+ {@link #GetBestFragment(TokenStream, String)}
+
+ </summary>
+ <param name="analyzer"> the analyzer that will be used to split <code>text</code>
+ into chunks
+ </param>
+ <param name="text">text to highlight terms in
+ </param>
+ <param name="fieldName">Name of field used to influence analyzer's tokenization policy
+
+ </param>
+ <returns> highlighted text fragment or null if no terms found
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.GetBestFragment(Lucene.Net.Analysis.TokenStream,System.String)">
+ <summary> Highlights chosen terms in a text, extracting the most relevant section.
+ The document text is analysed in chunks to record hit statistics
+ across the document. After accumulating stats, the fragment with the highest score
+ is returned
+
+ </summary>
+ <param name="tokenStream"> a stream of tokens identified in the text parameter, including offset information.
+ This is typically produced by an analyzer re-parsing a document's
+ text. Some work may be done on retrieving TokenStreams more efficiently
+ by adding support for storing original text position data in the Lucene
+ index but this support is not currently available (as of Lucene 1.4 rc2).
+ </param>
+ <param name="text">text to highlight terms in
+
+ </param>
+ <returns> highlighted text fragment or null if no terms found
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.GetBestFragments(Lucene.Net.Analysis.Analyzer,System.String,System.Int32)">
+ <summary> Highlights chosen terms in a text, extracting the most relevant sections.
+ This is a convenience method that calls
+ {@link #GetBestFragments(TokenStream, String, int)}
+
+ </summary>
+ <param name="analyzer"> the analyzer that will be used to split <code>text</code>
+ into chunks
+ </param>
+ <param name="text"> text to highlight terms in
+ </param>
+ <param name="maxNumFragments"> the maximum number of fragments.
+ </param>
+ <deprecated> This method incorrectly hardcodes the choice of fieldname. Use the
+ method of the same name that takes a fieldname.
+ </deprecated>
+ <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.GetBestFragments(Lucene.Net.Analysis.Analyzer,System.String,System.String,System.Int32)">
+ <summary> Highlights chosen terms in a text, extracting the most relevant sections.
+ This is a convenience method that calls
+ {@link #GetBestFragments(TokenStream, String, int)}
+
+ </summary>
+ <param name="analyzer"> the analyzer that will be used to split <code>text</code>
+ into chunks
+ </param>
+ <param name="fieldName"> the name of the field being highlighted (used by analyzer)
+ </param>
+ <param name="text"> text to highlight terms in
+ </param>
+ <param name="maxNumFragments"> the maximum number of fragments.
+
+ </param>
+ <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.GetBestFragments(Lucene.Net.Analysis.TokenStream,System.String,System.Int32)">
+ <summary> Highlights chosen terms in a text, extracting the most relevant sections.
+ The document text is analysed in chunks to record hit statistics
+ across the document. After accumulating stats, the fragments with the highest scores
+ are returned as an array of strings in order of score (contiguous fragments are merged into
+ one in their original order to improve readability)
+
+ </summary>
+ <param name="text"> text to highlight terms in
+ </param>
+ <param name="maxNumFragments"> the maximum number of fragments.
+
+ </param>
+ <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.GetBestTextFragments(Lucene.Net.Analysis.TokenStream,System.String,System.Boolean,System.Int32)">
+ <summary> Low level api to get the most relevant (formatted) sections of the document.
+ This method has been made public to allow visibility of score information held in TextFragment objects.
+ Thanks to Jason Calabrese for help in redefining the interface.
+ </summary>
+ <param name="tokenStream">
+ </param>
+ <param name="text">
+ </param>
+ <param name="maxNumFragments">
+ </param>
+ <param name="mergeContiguousFragments">
+ </param>
+ <throws> IOException </throws>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.MergeContiguousFragments(Lucene.Net.Highlight.TextFragment[])">
+ <summary>Improves readability of a score-sorted list of TextFragments by merging any fragments
+ that were contiguous in the original text into one larger fragment with the correct order.
+ This will leave a "null" in the array entry for the lesser scored fragment.
+
+ </summary>
+ <param name="frag">An array of document fragments in descending score
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.GetBestFragments(Lucene.Net.Analysis.TokenStream,System.String,System.Int32,System.String)">
+ <summary> Highlights terms in the text , extracting the most relevant sections
+ and concatenating the chosen fragments with a separator (typically "...").
+ The document text is analysed in chunks to record hit statistics
+ across the document. After accumulating stats, the fragments with the highest scores
+ are returned in order as "separator" delimited strings.
+
+ </summary>
+ <param name="text"> text to highlight terms in
+ </param>
+ <param name="maxNumFragments"> the maximum number of fragments.
+ </param>
+ <param name="separator"> the separator used to intersperse the document fragments (typically "...")
+
+ </param>
+ <returns> highlighted text
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.GetMaxDocBytesToAnalyze">
+ <returns> the maximum number of bytes to be tokenized per doc
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.SetMaxDocBytesToAnalyze(System.Int32)">
+ <param name="byteCount">the maximum number of bytes to be tokenized per doc
+ (This can improve performance with large documents)
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.SetTextFragmenter(Lucene.Net.Highlight.Fragmenter)">
+ <param name="fragmenter">
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.GetFragmentScorer">
+ <returns> Object used to score each text fragment
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Highlighter.SetFragmentScorer(Lucene.Net.Highlight.Scorer)">
+ <param name="scorer">
+ </param>
+ </member>
+ <member name="T:Lucene.Net.Highlight.QueryScorer">
+ <summary> {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
+ This class uses the {@link QueryTermExtractor} class to determine the query terms and
+ their boosts to be used.
+ </summary>
+ <author> mark@searcharea.co.uk
+ </author>
+ </member>
+ <member name="T:Lucene.Net.Highlight.Scorer">
+ <summary> Adds to the score for a fragment based on its tokens</summary>
+ <author> mark@searcharea.co.uk
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Scorer.StartFragment(Lucene.Net.Highlight.TextFragment)">
+ <summary> called when a new fragment is started for consideration</summary>
+ <param name="newFragment">
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Scorer.GetTokenScore(Lucene.Net.Analysis.Token)">
+ <summary> Called for each token in the current fragment</summary>
+ <param name="token">The token to be scored
+ </param>
+ <returns> a score which is passed to the Highlighter class to influence the mark-up of the text
+ (this return value is NOT used to score the fragment)
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Scorer.GetFragmentScore">
+ <summary> Called when the highlighter has no more tokens for the current fragment - the scorer returns
+ the weighting it has derived for the most recent fragment, typically based on the tokens
+ passed to getTokenScore().
+
+ </summary>
+ </member>
+ <member name="M:Lucene.Net.Highlight.QueryScorer.#ctor(Lucene.Net.Search.Query)">
+ <summary> </summary>
+ <param name="query">a Lucene query (ideally rewritten using query.rewrite
+ before being passed to this class and the searcher)
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.QueryScorer.#ctor(Lucene.Net.Search.Query,System.String)">
+ <summary> </summary>
+ <param name="query">a Lucene query (ideally rewritten using query.rewrite
+ before being passed to this class and the searcher)
+ </param>
+ <param name="fieldName">the Field name which is used to match Query terms
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.QueryScorer.#ctor(Lucene.Net.Search.Query,Lucene.Net.Index.IndexReader,System.String)">
+ <summary> </summary>
+ <param name="query">a Lucene query (ideally rewritten using query.rewrite
+ before being passed to this class and the searcher)
+ </param>
+ <param name="reader">used to compute IDF which can be used to a) score selected fragments better
+ b) use graded highlights eg set font color intensity
+ </param>
+ <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.QueryScorer.GetMaxTermWeight">
+ <summary> </summary>
+ <returns> The highest weighted term (useful for passing to GradientFormatter to set
+ top end of coloring scale.
+ </returns>
+ </member>
+ <member name="T:Lucene.Net.Highlight.QueryTermExtractor">
+ <summary> Utility class used to extract the terms used in a query, plus any weights.
+ This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes
+ so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of
+ expanded terms.
+
+ </summary>
+ </member>
+ <member name="M:Lucene.Net.Highlight.QueryTermExtractor.GetTerms(Lucene.Net.Search.Query)">
+ <summary> Extracts all terms texts of a given Query into an array of WeightedTerms
+
+ </summary>
+ <param name="query"> Query to extract term texts from
+ </param>
+ <returns> an array of the terms used in a query, plus their weights.
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.QueryTermExtractor.GetIdfWeightedTerms(Lucene.Net.Search.Query,Lucene.Net.Index.IndexReader,System.String)">
+ <summary> Extracts all terms texts of a given Query into an array of WeightedTerms
+
+ </summary>
+ <param name="query"> Query to extract term texts from
+ </param>
+ <param name="reader">used to compute IDF which can be used to a) score selected fragments better
+ b) use graded highlights eg changing intensity of font color
+ </param>
+ <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based
+ </param>
+ <returns> an array of the terms used in a query, plus their weights.
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.QueryTermExtractor.GetTerms(Lucene.Net.Search.Query,System.Boolean,System.String)">
+ <summary> Extracts all terms texts of a given Query into an array of WeightedTerms
+
+ </summary>
+ <param name="query"> Query to extract term texts from
+ </param>
+ <param name="prohibited"><code>true</code> to extract "prohibited" terms, too
+ </param>
+ <param name="fieldName"> The fieldName used to filter query terms
+ </param>
+ <returns> an array of the terms used in a query, plus their weights.
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.QueryTermExtractor.GetTerms(Lucene.Net.Search.Query,System.Boolean)">
+ <summary> Extracts all terms texts of a given Query into an array of WeightedTerms
+
+ </summary>
+ <param name="query"> Query to extract term texts from
+ </param>
+ <param name="prohibited"><code>true</code> to extract "prohibited" terms, too
+ </param>
+ <returns> an array of the terms used in a query, plus their weights.
+ </returns>
+ </member>
+ <member name="T:Lucene.Net.Highlight.SimpleFragmenter">
+ <summary> {@link Fragmenter} implementation which breaks text up into same-size
+ fragments with no concerns over spotting sentence boundaries.
+ </summary>
+ <author> mark@searcharea.co.uk
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.SimpleFragmenter.#ctor(System.Int32)">
+ <summary> </summary>
+ <param name="fragmentSize">size in bytes of each fragment
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.SimpleFragmenter.GetFragmentSize">
+ <returns> size in bytes of each fragment
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.SimpleFragmenter.SetFragmentSize(System.Int32)">
+ <param name="size">size in bytes of each fragment
+ </param>
+ </member>
+ <member name="T:Lucene.Net.Highlight.SimpleHTMLFormatter">
+ <summary> Simple {@link Formatter} implementation to highlight terms with a pre and post tag</summary>
+ <author> MAHarwood
+
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.SimpleHTMLFormatter.#ctor">
+ <summary> Default constructor uses HTML: &lt;B&gt; tags to markup terms
+
+
+ </summary>
+ </member>
+ <member name="T:Lucene.Net.Highlight.TextFragment">
+ <summary> Low-level class used to record information about a section of a document
+ with a score.
+ </summary>
+ <author> MAHarwood
+
+
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TextFragment.Merge(Lucene.Net.Highlight.TextFragment)">
+ <param name="frag2">Fragment to be merged into this one
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TextFragment.Follows(Lucene.Net.Highlight.TextFragment)">
+ <param name="fragment">
+ </param>
+ <returns> true if this fragment follows the one passed
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TextFragment.GetFragNum">
+ <returns> the fragment sequence number
+ </returns>
+ </member>
+ <member name="T:Lucene.Net.Highlight.TokenGroup">
+ <summary> One, or several overlapping tokens, along with the score(s) and the
+ scope of the original text
+ </summary>
+ <author> MAHarwood
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TokenGroup.GetToken(System.Int32)">
+ <summary> </summary>
+ <param name="index">a value between 0 and numTokens -1
+ </param>
+ <returns> the "n"th token
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TokenGroup.GetScore(System.Int32)">
+ <summary> </summary>
+ <param name="index">a value between 0 and numTokens -1
+ </param>
+ <returns> the "n"th score
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TokenGroup.GetEndOffset">
+ <returns> the end position in the original text
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TokenGroup.GetNumTokens">
+ <returns> the number of tokens in this group
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TokenGroup.GetStartOffset">
+ <returns> the start position in the original text
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TokenGroup.GetTotalScore">
+ <returns> all tokens' scores summed up
+ </returns>
+ </member>
+ <member name="T:Lucene.Net.Highlight.WeightedTerm">
+ <summary>Lightweight class to hold term and a weight value used for scoring this term </summary>
+ <author> Mark Harwood
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.WeightedTerm.GetTerm">
+ <returns> the term value (stemmed)
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.WeightedTerm.GetWeight">
+ <returns> the weight associated with this term
+ </returns>
+ </member>
+ <member name="M:Lucene.Net.Highlight.WeightedTerm.SetTerm(System.String)">
+ <param name="term">the term value (stemmed)
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.WeightedTerm.SetWeight(System.Single)">
+ <param name="weight">the weight associated with this term
+ </param>
+ </member>
+ <member name="T:Lucene.Net.Highlight.DefaultEncoder">
+ <summary> Simple {@link Encoder} implementation that does not modify the output</summary>
+ <author> Nicko Cadell
+
+ </author>
+ </member>
+ <member name="T:Lucene.Net.Highlight.Encoder">
+ <summary> Encodes original text. The Encoder works with the Formatter to generate the output.
+
+ </summary>
+ <author> Nicko Cadell
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.Encoder.EncodeText(System.String)">
+ <param name="originalText">The section of text being output
+ </param>
+ </member>
+ <member name="T:Lucene.Net.Highlight.GradientFormatter">
+ <summary> Formats text with different color intensity depending on the score of the
+ term.
+
+ </summary>
+ <author> maharwood
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.GradientFormatter.#ctor(System.Single,System.String,System.String,System.String,System.String)">
+ <summary> Sets the color range for the IDF scores
+
+ </summary>
+ <param name="maxScore">
+ The score (and above) displayed as maxColor (See QueryScorer.GetMaxTermWeight
+ which can be used to calibrate scoring scale)
+ </param>
+ <param name="minForegroundColor">
+ The hex color used for representing IDF scores of zero eg
+ #FFFFFF (white) or null if no foreground color required
+ </param>
+ <param name="maxForegroundColor">
+ The largest hex color used for representing IDF scores eg
+ #000000 (black) or null if no foreground color required
+ </param>
+ <param name="minBackgroundColor">
+ The hex color used for representing IDF scores of zero eg
+ #FFFFFF (white) or null if no background color required
+ </param>
+ <param name="maxBackgroundColor">
+ The largest hex color used for representing IDF scores eg
+ #000000 (black) or null if no background color required
+ </param>
+ </member>
+ <member name="M:Lucene.Net.Highlight.GradientFormatter.HexToInt(System.String)">
+ <summary> Converts a hex string into an int. Integer.parseInt(hex, 16) assumes the
+ input is nonnegative unless there is a preceding minus sign. This method
+ reads the input as twos complement instead, so if the input is 8 bytes
+ long, it will correctly restore a negative int produced by
+ Integer.toHexString() but not necessarily one produced by
+ Integer.toString(x,16) since that method will produce a string like '-FF'
+ for negative integer values.
+
+ </summary>
+ <param name="hex">
+ A string in capital or lower case hex, of no more than 16
+ characters.
+ </param>
+ <throws> NumberFormatException </throws>
+ <summary> if the string is more than 16 characters long, or if any
+ character is not in the set [0-9a-fA-F]
+ </summary>
+ </member>
+ <member name="T:Lucene.Net.Highlight.NullFragmenter">
+ <summary> {@link Fragmenter} implementation which does not fragment the text.
+ This is useful for highlighting the entire content of a document or field.
+ </summary>
+ </member>
+ <member name="T:Lucene.Net.Highlight.SimpleHTMLEncoder">
+ <summary> Simple {@link Encoder} implementation to escape text for HTML output</summary>
+ <author> Nicko Cadell
+
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.SimpleHTMLEncoder.HtmlEncode(System.String)">
+ <summary> Encode string into HTML</summary>
+ </member>
+ <member name="T:Lucene.Net.Highlight.SpanGradientFormatter">
+ <summary> Formats text with different color intensity depending on the score of the
+ term using the span tag. GradientFormatter uses a bgcolor argument to the font tag which
+ doesn't work in Mozilla, thus this class.
+
+ </summary>
+ <seealso cref="T:Lucene.Net.Highlight.GradientFormatter">
+ </seealso>
+ <author> David Spencer dave@searchmorph.com
+ </author>
+ </member>
+ <member name="T:Lucene.Net.Highlight.TokenSources">
+ <summary> Hides implementation issues associated with obtaining a TokenStream for use with
+ the highlighter - can obtain from TermFreqVectors with offsets and (optionally) positions or
+ from Analyzer class reparsing the stored content.
+ </summary>
+ <author> maharwood
+ </author>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TokenSources.GetAnyTokenStream(Lucene.Net.Index.IndexReader,System.Int32,System.String,Lucene.Net.Analysis.Analyzer)">
+ <summary> A convenience method that tries a number of approaches to getting a token stream.
+ The cost of finding there are no termVectors in the index is minimal (1000 invocations still
+ registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable
+ </summary>
+ <param name="reader">
+ </param>
+ <param name="docId">
+ </param>
+ <param name="field">
+ </param>
+ <param name="analyzer">
+ </param>
+ <returns> null if field not stored correctly
+ </returns>
+ <throws> IOException </throws>
+ </member>
+ <member name="M:Lucene.Net.Highlight.TokenSources.GetTokenStream(Lucene.Net.Index.TermPositionVector,System.Boolean)">
+ <summary> Low level api.
+ Returns a token stream or null if no offset info available in index.
+ This can be used to feed the highlighter with a pre-parsed token stream
+
+ In my tests the speeds to recreate 1000 token streams using this method are:
+ - with TermVector offset only data stored - 420 milliseconds
+ - with TermVector offset AND position data stored - 271 milliseconds
+ (nb timings for TermVector with position data are based on a tokenizer with contiguous
+ positions - no overlaps or gaps)
+ The cost of not using TermPositionVector to store
+ pre-parsed content and using an analyzer to re-parse the original content:
+ - reanalyzing the original content - 980 milliseconds
+
+ The re-analyze timings will typically vary depending on -
+ 1) The complexity of the analyzer code (timings above were using a
+ stemmer/lowercaser/stopword combo)
+ 2) The number of other fields (Lucene reads ALL fields off the disk
+ when accessing just one document field - can cost dear!)
+ 3) Use of compression on field storage - could be faster cos of compression (less disk IO)
+ or slower (more CPU burn) depending on the content.
+
+ </summary>
+ <param name="tpv">
+ </param>
+ <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
+ to eke out the last drops of performance, set to true. If in doubt, set to false.
+ </param>
+ </member>
+ </members>
+</doc>
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/Highlighter.cs?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Highlighter.cs Wed Dec 27 07:05:35 2006
@@ -0,0 +1,522 @@
+/*
+ * Copyright 2002-2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Analyzer = Lucene.Net.Analysis.Analyzer;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
+using PriorityQueue = Lucene.Net.Util.PriorityQueue;
+
+namespace Lucene.Net.Highlight
+{
+
+ /// <summary> Class used to markup highlighted terms found in the best sections of a
+ /// text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
+ /// {@link Encoder} and tokenizers.
+ /// </summary>
+ /// <author> mark@searcharea.co.uk
+ /// </author>
+ public class Highlighter
+ {
+
+ public const int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE = 50 * 1024; // 51200; initial value of maxDocBytesToAnalyze
+ private int maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
+ private Formatter formatter; // assigned by the three-argument constructor
+ private Encoder encoder; // assigned by the three-argument constructor
+ private Fragmenter textFragmenter = new SimpleFragmenter(); // initialised to a new SimpleFragmenter
+ private Scorer fragmentScorer = null; // assigned by the three-argument constructor
+ /// <summary> Creates a highlighter for the given scorer; chains to the (Formatter, Scorer) constructor with a new SimpleHTMLFormatter.</summary>
+ public Highlighter(Scorer fragmentScorer) : this(new SimpleHTMLFormatter(), fragmentScorer)
+ {
+ }
+
+ /// <summary> Creates a highlighter for the given formatter and scorer; chains to the three-argument constructor with a new DefaultEncoder.</summary>
+ public Highlighter(Formatter formatter, Scorer fragmentScorer) : this(formatter, new DefaultEncoder(), fragmentScorer)
+ {
+ }
+
+ /// <summary> Creates a highlighter that stores the supplied formatter, encoder and fragment scorer.</summary>
+ public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
+ {
+     this.fragmentScorer = fragmentScorer;
+     this.encoder = encoder;
+     this.formatter = formatter;
+ }
+
+ /// <summary> Convenience overload: tokenizes <code>text</code> with the supplied
+ /// analyzer and returns the result of
+ /// <see cref="GetBestFragment(TokenStream, System.String)"/> for the
+ /// resulting token stream.
+ /// </summary>
+ /// <param name="analyzer"> the analyzer whose TokenStream method is used to split
+ /// <code>text</code> into tokens
+ /// </param>
+ /// <param name="fieldName">field name handed to the analyzer together with the text
+ /// (second argument; it precedes <code>text</code>)
+ /// </param>
+ /// <param name="text">text to highlight terms in
+ /// </param>
+ /// <returns> whatever the TokenStream overload returns: a highlighted fragment or null
+ /// </returns>
+ public System.String GetBestFragment(Analyzer analyzer, System.String fieldName, System.String text)
+ {
+     System.IO.StringReader textReader = new System.IO.StringReader(text);
+     return GetBestFragment(analyzer.TokenStream(fieldName, textReader), text);
+ }
+
+ /// <summary> Highlights chosen terms in a text, extracting the most relevant section.
+ /// Requests a single fragment from
+ /// <see cref="GetBestFragments(TokenStream, System.String, int)"/> and returns
+ /// the first entry of the array that call produces.
+ ///
+ /// </summary>
+ /// <param name="tokenStream"> a stream of tokens identified in the text parameter, including offset information.
+ /// This is typically produced by an analyzer re-parsing a document's
+ /// text. Some work may be done on retrieving TokenStreams more efficiently
+ /// by adding support for storing original text position data in the Lucene
+ /// index but this support is not currently available (as of Lucene 1.4 rc2).
+ /// </param>
+ /// <param name="text">text to highlight terms in
+ ///
+ /// </param>
+ /// <returns> the first fragment returned, or null if no fragment was returned
+ /// </returns>
+ public System.String GetBestFragment(TokenStream tokenStream, System.String text)
+ {
+     System.String[] best = GetBestFragments(tokenStream, text, 1);
+     if (best.Length == 0)
+     {
+         return null;
+     }
+     return best[0];
+ }
+
+ /// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
+ /// Convenience overload equivalent to calling
+ /// <see cref="GetBestFragments(Analyzer, System.String, System.String, int)"/>
+ /// with the field name "field".
+ /// </summary>
+ /// <param name="analyzer"> the analyzer that will be used to split <code>text</code>
+ /// into chunks
+ /// </param>
+ /// <param name="text"> text to highlight terms in
+ /// </param>
+ /// <param name="maxNumFragments"> the maximum number of fragments.
+ /// </param>
+ /// <deprecated> This method incorrectly hardcodes the choice of fieldname. Use the
+ /// method of the same name that takes a fieldname.
+ /// </deprecated>
+ /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
+ /// </returns>
+ public System.String[] GetBestFragments(Analyzer analyzer, System.String text, int maxNumFragments)
+ {
+     // "field" is the hard-coded field name this deprecated overload passes to the analyzer
+     return GetBestFragments(analyzer, "field", text, maxNumFragments);
+ }
+ /// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
+ /// Convenience overload: obtains a token stream for <code>text</code> from the analyzer and
+ /// passes it to <see cref="GetBestFragments(TokenStream, System.String, int)"/>.
+ ///
+ /// </summary>
+ /// <param name="analyzer"> the analyzer whose TokenStream method is used to split
+ /// <code>text</code> into tokens
+ /// </param>
+ /// <param name="fieldName"> the field name handed to the analyzer together with the text
+ /// </param>
+ /// <param name="text"> text to highlight terms in
+ /// </param>
+ /// <param name="maxNumFragments"> the maximum number of fragments.
+ ///
+ /// </param>
+ /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
+ /// </returns>
+ public System.String[] GetBestFragments(Analyzer analyzer, System.String fieldName, System.String text, int maxNumFragments)
+ {
+     System.IO.StringReader textReader = new System.IO.StringReader(text);
+     return GetBestFragments(analyzer.TokenStream(fieldName, textReader), text, maxNumFragments);
+ }
+
+ /// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
+ /// Calls GetBestTextFragments with contiguous-fragment merging enabled and returns the
+ /// string form (ToString) of every resulting fragment that is non-null and has a
+ /// score greater than zero, in the order GetBestTextFragments returned them.
+ ///
+ /// </summary>
+ /// <param name="tokenStream"> a stream of tokens identified in the text parameter
+ /// </param>
+ /// <param name="text"> text to highlight terms in
+ /// </param>
+ /// <param name="maxNumFragments"> the maximum number of fragments; values below 1 are treated as 1
+ /// </param>
+ /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
+ /// </returns>
+ public System.String[] GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments)
+ {
+     // sanity check: at least one fragment is always requested
+     int fragmentLimit = (maxNumFragments < 1) ? 1 : maxNumFragments;
+     TextFragment[] candidates = GetBestTextFragments(tokenStream, text, true, fragmentLimit);
+     System.Collections.ArrayList highlighted = new System.Collections.ArrayList();
+     foreach (TextFragment candidate in candidates)
+     {
+         // skip null array entries and fragments whose score is not greater than zero
+         if ((candidate == null) || !(candidate.GetScore() > 0))
+         {
+             continue;
+         }
+         highlighted.Add(candidate.ToString());
+     }
+     return (System.String[]) highlighted.ToArray(typeof(System.String));
+ }
+
+
+ /// <summary> Low level api to get the most relevant (formatted) sections of the document.
+ /// This method has been made public to allow visibility of score information held in TextFragment objects.
+ /// Thanks to Jason Calabrese for help in redefining the interface.
+ /// </summary>
+ /// <param name="tokenStream">tokens produced from <c>text</c>; closed (best effort) in a finally block before returning </param>
+ /// <param name="text">the original text; token start/end offsets are used as indexes into it </param>
+ /// <param name="mergeContiguousFragments">if true, fragments that are contiguous in the text are merged and
+ /// entries that are null or whose score is not above zero are removed from the result </param>
+ /// <param name="maxNumFragments">capacity of the FragmentQueue used to select the best-scoring fragments </param>
+ /// <returns>the fragments taken from the queue; the array is filled from its last slot backwards as entries
+ /// are popped (intended to yield descending score order)
+ /// </returns>
+ /// <exception cref="System.IO.IOException">declared by the Java original; presumably raised when reading from tokenStream fails </exception>
+ public TextFragment[] GetBestTextFragments(TokenStream tokenStream, System.String text, bool mergeContiguousFragments, int maxNumFragments)
+ {
+ System.Collections.ArrayList docFrags = new System.Collections.ArrayList();
+ System.Text.StringBuilder newText = new System.Text.StringBuilder();
+
+ TextFragment currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
+ fragmentScorer.StartFragment(currentFrag);
+ docFrags.Add(currentFrag);
+
+ FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
+
+ try
+ {
+ Lucene.Net.Analysis.Token token;
+ System.String tokenText;
+ int startOffset;
+ int endOffset;
+ int lastEndOffset = 0;
+ textFragmenter.Start(text);
+
+ TokenGroup tokenGroup = new TokenGroup();
+
+ while ((token = tokenStream.Next()) != null)
+ {
+ if ((tokenGroup.numTokens > 0) && (tokenGroup.IsDistinct(token)))
+ {
+ //the current token is distinct from previous tokens -
+ // markup the cached token group info
+ startOffset = tokenGroup.startOffset;
+ endOffset = tokenGroup.endOffset;
+ tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
+ System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
+ //store any whitespace etc from between this and last group
+ if (startOffset > lastEndOffset)
+ newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
+ newText.Append(markedUpText);
+ lastEndOffset = endOffset;
+ tokenGroup.Clear();
+
+ //check if current token marks the start of a new fragment
+ if (textFragmenter.IsNewFragment(token))
+ {
+ currentFrag.SetScore(fragmentScorer.GetFragmentScore());
+ //close the current fragment at the present output length and open the next one there
+ currentFrag.textEndPos = newText.Length;
+ currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
+ fragmentScorer.StartFragment(currentFrag);
+ docFrags.Add(currentFrag);
+ }
+ }
+
+ tokenGroup.AddToken(token, fragmentScorer.GetTokenScore(token));
+
+ if (lastEndOffset > maxDocBytesToAnalyze) //stop once the last marked-up end offset passes the configured limit
+ {
+ break;
+ }
+ }
+ currentFrag.SetScore(fragmentScorer.GetFragmentScore());
+
+ if (tokenGroup.numTokens > 0)
+ {
+ //flush the accumulated text (same code as in above loop)
+ startOffset = tokenGroup.startOffset;
+ endOffset = tokenGroup.endOffset;
+ tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
+ System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
+ //store any whitespace etc from between this and last group
+ if (startOffset > lastEndOffset)
+ newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
+ newText.Append(markedUpText);
+ lastEndOffset = endOffset;
+ }
+
+ // append text after end of last token
+ // if (lastEndOffset < text.length())
+ // newText.append(encoder.encodeText(text.substring(lastEndOffset)));
+
+ currentFrag.textEndPos = newText.Length;
+
+ //offer every fragment to the queue so that the most relevant sections are kept
+ for (System.Collections.IEnumerator i = docFrags.GetEnumerator(); i.MoveNext(); )
+ {
+ currentFrag = (TextFragment) i.Current;
+
+ //If you are running with a version of Lucene before 11th Sept 03
+ // you do not have PriorityQueue.insert() - so uncomment the code below
+ /*
+ if (currentFrag.getScore() >= minScore)
+ {
+ fragQueue.put(currentFrag);
+ if (fragQueue.size() > maxNumFragments)
+ { // if hit queue overfull
+ fragQueue.pop(); // remove lowest in hit queue
+ minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
+ }
+
+
+ }
+ */
+ //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
+ //fix to PriorityQueue. The correct method to use here is the new "insert" method
+ // USE ABOVE CODE IF THIS DOES NOT COMPILE!
+ fragQueue.Insert(currentFrag);
+ }
+
+ //return the most relevant fragments (array is filled back to front as the queue is emptied)
+ TextFragment[] frag = new TextFragment[fragQueue.Size()];
+ for (int i = frag.Length - 1; i >= 0; i--)
+ {
+ frag[i] = (TextFragment) fragQueue.Pop();
+ }
+
+ //merge any contiguous fragments to improve readability
+ if (mergeContiguousFragments)
+ {
+ MergeContiguousFragments(frag);
+ System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
+ for (int i = 0; i < frag.Length; i++)
+ {
+ if ((frag[i] != null) && (frag[i].GetScore() > 0)) //drop slots nulled by merging and fragments without a positive score
+ {
+ fragTexts.Add(frag[i]);
+ }
+ }
+ frag = (TextFragment[]) fragTexts.ToArray(typeof(TextFragment));
+ }
+
+ return frag;
+ }
+ finally
+ {
+ if (tokenStream != null)
+ {
+ try
+ {
+ tokenStream.Close();
+ }
+ catch (System.Exception e) //best-effort close: a failure while closing the stream is deliberately ignored
+ {
+ }
+ }
+ }
+ }
+
+
+ /// <summary>Improves readability of a score-sorted list of TextFragments by merging any fragments
+ /// that were contiguous in the original text into one larger fragment with the correct order.
+ /// This will leave a "null" in the array entry for the lesser scored fragment.
+ /// Passes over the array are repeated until a complete pass performs no merge.
+ /// </summary>
+ /// <param name="frag">An array of document fragments in descending score
+ /// </param>
+ private void MergeContiguousFragments(TextFragment[] frag)
+ {
+ bool mergingStillBeingDone;
+ if (frag.Length > 1)
+ do
+ {
+ mergingStillBeingDone = false; //initialise loop control flag
+ //for each fragment, scan other frags looking for contiguous blocks
+ for (int i = 0; i < frag.Length; i++)
+ {
+ if (frag[i] == null)
+ {
+ continue;
+ }
+ //merge any contiguous blocks
+ for (int x = 0; x < frag.Length; x++)
+ {
+ if (frag[x] == null)
+ {
+ continue;
+ }
+ if (frag[i] == null) //slot i may have been nulled by a merge earlier in this inner loop
+ {
+ break;
+ }
+ TextFragment frag1 = null;
+ TextFragment frag2 = null;
+ int frag1Num = 0;
+ int frag2Num = 0;
+ int bestScoringFragNum;
+ int worstScoringFragNum;
+ //if blocks are contiguous, frag1 becomes the earlier fragment and frag2 the one that follows it
+ if (frag[i].Follows(frag[x]))
+ {
+ frag1 = frag[x];
+ frag1Num = x;
+ frag2 = frag[i];
+ frag2Num = i;
+ }
+ else if (frag[x].Follows(frag[i]))
+ {
+ frag1 = frag[i];
+ frag1Num = i;
+ frag2 = frag[x];
+ frag2Num = x;
+ }
+ //merging required..
+ if (frag1 != null)
+ {
+ if (frag1.GetScore() > frag2.GetScore())
+ {
+ bestScoringFragNum = frag1Num;
+ worstScoringFragNum = frag2Num;
+ }
+ else
+ {
+ bestScoringFragNum = frag2Num;
+ worstScoringFragNum = frag1Num;
+ }
+ frag1.Merge(frag2);
+ frag[worstScoringFragNum] = null;
+ mergingStillBeingDone = true;
+ frag[bestScoringFragNum] = frag1; //the merged fragment takes the slot of the better-scoring one
+ }
+ }
+ }
+ }
+ while (mergingStillBeingDone);
+ }
+
+
+ /// <summary> Highlights terms in the text, extracting the most relevant sections
+ /// and concatenating the chosen fragments with a separator (typically "...").
+ /// The document text is analysed in chunks to record hit statistics
+ /// across the document. After accumulating stats, the fragments with the highest scores
+ /// are returned in order as "separator" delimited strings.
+ /// </summary>
+ /// <param name="tokenStream"> tokens produced from <c>text</c>
+ /// </param>
+ /// <param name="text"> text to highlight terms in
+ /// </param>
+ /// <param name="maxNumFragments"> the maximum number of fragments.
+ /// </param>
+ /// <param name="separator"> the separator used to intersperse the document fragments (typically "...")
+ /// </param>
+ /// <returns> highlighted text (an empty string when no fragment is returned)
+ /// </returns>
+ public System.String GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments, System.String separator)
+ {
+ System.String[] sections = GetBestFragments(tokenStream, text, maxNumFragments);
+ System.Text.StringBuilder result = new System.Text.StringBuilder();
+ for (int i = 0; i < sections.Length; i++)
+ {
+ if (i > 0) // separator goes between sections only, never before the first one
+ {
+ result.Append(separator);
+ }
+ result.Append(sections[i]);
+ }
+ return result.ToString();
+ }
+
+ /// <returns> the maximum number of bytes to be tokenized per doc
+ /// (the value GetBestTextFragments compares token end offsets against) </returns>
+ public virtual int GetMaxDocBytesToAnalyze()
+ {
+ return maxDocBytesToAnalyze;
+ }
+
+ /// <param name="byteCount">the maximum number of bytes to be tokenized per doc
+ /// (This can improve performance with large documents); GetBestTextFragments stops reading
+ /// tokens once the end offset of the last marked-up token group exceeds this value </param>
+ public virtual void SetMaxDocBytesToAnalyze(int byteCount)
+ {
+ maxDocBytesToAnalyze = byteCount;
+ }
+
+
+ public virtual Fragmenter GetTextFragmenter() // returns the Fragmenter consulted to decide where a new fragment starts
+ {
+ return textFragmenter;
+ }
+
+ /// <param name="fragmenter">the Fragmenter consulted to decide where a new fragment starts
+ /// </param>
+ public virtual void SetTextFragmenter(Fragmenter fragmenter)
+ {
+ textFragmenter = fragmenter;
+ }
+
+ /// <returns> Object used to score each token and each text fragment
+ /// </returns>
+ public virtual Scorer GetFragmentScorer()
+ {
+ return fragmentScorer;
+ }
+
+
+ /// <param name="scorer">the Scorer used to score each token and each text fragment
+ /// </param>
+ public virtual void SetFragmentScorer(Scorer scorer)
+ {
+ fragmentScorer = scorer;
+ }
+
+ public virtual Encoder GetEncoder() // returns the Encoder whose EncodeText is applied to text before it is added to the output
+ {
+ return encoder;
+ }
+ public virtual void SetEncoder(Encoder encoder) // replaces the Encoder whose EncodeText is applied to text before it is added to the output
+ {
+ this.encoder = encoder; // "this." is required because the parameter shadows the field
+ }
+ }
+
+ class FragmentQueue : PriorityQueue // priority queue of TextFragments ordered by score, used to pick the best fragments
+ {
+ public FragmentQueue(int size)
+ {
+ Initialize(size); // hand the requested capacity to the base PriorityQueue
+ }
+
+ public override bool LessThan(System.Object a, System.Object b)
+ {
+ TextFragment fragA = (TextFragment) a;
+ TextFragment fragB = (TextFragment) b;
+ if (fragA.GetScore() == fragB.GetScore())
+ return fragA.fragNum > fragB.fragNum; // equal scores: the fragment with the higher fragNum ranks lower
+ else
+ return fragA.GetScore() < fragB.GetScore(); // otherwise the lower score ranks lower
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/NullFragmenter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/NullFragmenter.cs?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/NullFragmenter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/NullFragmenter.cs Wed Dec 27 07:05:35 2006
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using Token = Lucene.Net.Analysis.Token;
+
+namespace Lucene.Net.Highlight
+{
+
+ /// <summary> <see cref="Fragmenter"/> implementation which does not fragment the text.
+ /// This is useful for highlighting the entire content of a document or field.
+ /// </summary>
+ public class NullFragmenter : Fragmenter
+ {
+ public virtual void Start(System.String s) // intentionally empty: nothing is prepared for the incoming text
+ {
+ }
+
+ public virtual bool IsNewFragment(Token token)
+ {
+ return false; // never signal a fragment boundary, whatever the token
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Package.html
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Highlighter.Net/Highlighter.Net/Package.html?view=auto&rev=490508
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Package.html (added)
+++ incubator/lucene.net/trunk/C#/contrib/Highlighter.Net/Highlighter.Net/Package.html Wed Dec 27 07:05:35 2006
@@ -0,0 +1,60 @@
+<html>
+ <body>
+ The highlight package contains classes to provide "keyword in context" features
+ typically used to highlight search terms in the text of results pages. The
+ Highlighter class is the central component and can be used to extract the most
+ interesting sections of a piece of text and highlight them, with the help of
+ Fragmenter, Scorer and Formatter classes.
+ <h2>Example Usage</h2>
+ <pre>
+ IndexSearcher searcher = new IndexSearcher(ramDir);
+ Query query = QueryParser.Parse("Kenne*", FIELD_NAME, analyzer);
+ query = query.Rewrite(reader); //required to expand search terms
+ Hits hits = searcher.Search(query);
+
+ Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
+ for (int i = 0; i &lt; hits.Length(); i++)
+ {
+ String text = hits.Doc(i).Get(FIELD_NAME);
+ TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));
+ // Get 3 best fragments and separate with a "..."
+ String result = highlighter.GetBestFragments(tokenStream, text, 3, "...");
+ System.Console.WriteLine(result);
+ }
+</pre>
+ <h2>New features 06/02/2005</h2>
+ This release adds options for encoding (thanks to Nicko Cadell). An "Encoder"
+ implementation such as the new SimpleHTMLEncoder class can be passed to the
+ highlighter to encode all those non-xhtml standard characters such as &amp;
+ into legal values. This simple class may not suffice for some languages -
+ Commons Lang has an implementation that could be used: escapeHtml(String) in
+ http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup
+ <h2>New features 22/12/2004</h2>
+ This release adds some new capabilities:
+ <ol>
+ <li>
+ Faster highlighting using Term vector support</li>
+ <li>
+ New formatting options to use color intensity to show informational value</li>
+ <li>
+ Options for better summarization by using term IDF scores to influence fragment
+ selection</li>
+ </ol>
+ <p>
+ The highlighter takes a TokenStream as input. Until now these streams have
+ typically been produced using an Analyzer but the new class TokenSources
+ provides helper methods for obtaining TokenStreams from the new TermVector
+ position support (see latest CVS version).</p>
+ <p>The new class GradientFormatter can use a scale of colors to highlight terms
+ according to their score. A subtle use of color can help emphasise the reasons
+ for matching (useful when doing "MoreLikeThis" queries and you want to see what
+ the basis of the similarities is).</p>
+ <p>The QueryScorer class has a new constructor which can use an IndexReader to
+ derive the IDF (inverse document frequency) for each term in order to
+ influence the score. This is useful for helping to extract the most
+ significant sections of a document and in supplying scores used by the new
+ GradientFormatter to color significant words more strongly. The
+ QueryScorer.getMaxWeight method is useful when passed to the GradientFormatter
+ constructor to define the top score which is associated with the top color.</p>
+ </body>
+</html>