You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2012/04/07 01:37:54 UTC
svn commit: r1310635 [2/8] - in /incubator/lucene.net/trunk:
build/vs2010/contrib/ build/vs2010/test/ src/contrib/FastVectorHighlighter/
src/contrib/Highlighter/ src/contrib/Memory/ src/contrib/Memory/Properties/
src/contrib/Queries/ src/contrib/Querie...
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/Highlighter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/Highlighter.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/Highlighter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/Highlighter.cs Fri Apr 6 23:37:48 2012
@@ -16,511 +16,462 @@
*/
using System;
-using Lucene.Net.Search.Highlight;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
-using Analyzer = Lucene.Net.Analysis.Analyzer;
-using Token = Lucene.Net.Analysis.Token;
-using TokenStream = Lucene.Net.Analysis.TokenStream;
-namespace Lucene.Net.Highlight
+namespace Lucene.Net.Search.Highlight
{
- /// <summary> Class used to markup highlighted terms found in the best sections of a
- /// text, using configurable <see cref="IFragmenter"/>, <see cref="IScorer"/>, <see cref="IFormatter"/>,
- /// <see cref="IEncoder"/> and tokenizers.
- /// </summary>
- /// <author> mark@searcharea.co.uk
- /// </author>
- public class Highlighter
- {
-
- public const int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE = 50 * 1024;
- private int maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
- private IFormatter formatter;
- private IEncoder encoder;
- private IFragmenter textFragmenter = new SimpleFragmenter();
- private IScorer fragmentScorer = null;
-
- public Highlighter(IScorer fragmentScorer) : this(new SimpleHTMLFormatter(), fragmentScorer)
- {
- }
-
-
- public Highlighter(IFormatter formatter, IScorer fragmentScorer) : this(formatter, new DefaultEncoder(), fragmentScorer)
- {
- }
-
-
- public Highlighter(IFormatter formatter, IEncoder encoder, IScorer fragmentScorer)
- {
- this.formatter = formatter;
- this.encoder = encoder;
- this.fragmentScorer = fragmentScorer;
- }
-
- /// <summary> Highlights chosen terms in a text, extracting the most relevant section.
- /// This is a convenience method that calls
- /// <see cref="GetBestFragment(TokenStream, String)"/>
- ///
- /// </summary>
- /// <param name="analyzer"> the analyzer that will be used to split <c>text</c>
- /// into chunks
- /// </param>
- /// <param name="text">text to highlight terms in
- /// </param>
- /// <param name="fieldName">Name of field used to influence analyzer's tokenization policy
- ///
- /// </param>
- /// <returns> highlighted text fragment or null if no terms found
- /// </returns>
- public System.String GetBestFragment(Analyzer analyzer, System.String fieldName, System.String text)
- {
- TokenStream tokenStream = analyzer.TokenStream(fieldName, new System.IO.StringReader(text));
- return GetBestFragment(tokenStream, text);
- }
-
- /// <summary> Highlights chosen terms in a text, extracting the most relevant section.
- /// The document text is analysed in chunks to record hit statistics
- /// across the document. After accumulating stats, the fragment with the highest score
- /// is returned
- ///
- /// </summary>
- /// <param name="tokenStream"> a stream of tokens identified in the text parameter, including offset information.
- /// This is typically produced by an analyzer re-parsing a document's
- /// text. Some work may be done on retrieving TokenStreams more efficently
- /// by adding support for storing original text position data in the Lucene
- /// index but this support is not currently available (as of Lucene 1.4 rc2).
- /// </param>
- /// <param name="text">text to highlight terms in
- ///
- /// </param>
- /// <returns> highlighted text fragment or null if no terms found
- /// </returns>
- public System.String GetBestFragment(TokenStream tokenStream, System.String text)
- {
- System.String[] results = GetBestFragments(tokenStream, text, 1);
- if (results.Length > 0)
- {
- return results[0];
- }
- return null;
- }
-
- /// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
- /// This is a convenience method that calls
- /// <see cref="GetBestFragments(TokenStream, String, int)"/>
- ///
- /// </summary>
- /// <param name="analyzer"> the analyzer that will be used to split <c>text</c>
- /// into chunks
- /// </param>
- /// <param name="text"> text to highlight terms in
- /// </param>
- /// <param name="maxNumFragments"> the maximum number of fragments.
- /// </param>
- /// <deprecated> This method incorrectly hardcodes the choice of fieldname. Use the
- /// method of the same name that takes a fieldname.
- /// </deprecated>
- /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
- /// </returns>
- public System.String[] GetBestFragments(Analyzer analyzer, System.String text, int maxNumFragments)
- {
- TokenStream tokenStream = analyzer.TokenStream("field", new System.IO.StringReader(text));
- return GetBestFragments(tokenStream, text, maxNumFragments);
- }
- /// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
- /// This is a convenience method that calls
- /// <see cref="GetBestFragments(TokenStream, String, int)"/>
- ///
- /// </summary>
- /// <param name="analyzer"> the analyzer that will be used to split <c>text</c>
- /// into chunks
- /// </param>
- /// <param name="fieldName"> the name of the field being highlighted (used by analyzer)
- /// </param>
- /// <param name="text"> text to highlight terms in
- /// </param>
- /// <param name="maxNumFragments"> the maximum number of fragments.
- ///
- /// </param>
- /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
- /// </returns>
- public System.String[] GetBestFragments(Analyzer analyzer, System.String fieldName, System.String text, int maxNumFragments)
- {
- TokenStream tokenStream = analyzer.TokenStream(fieldName, new System.IO.StringReader(text));
- return GetBestFragments(tokenStream, text, maxNumFragments);
- }
-
- /// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
- /// The document text is analysed in chunks to record hit statistics
- /// across the document. After accumulating stats, the fragments with the highest scores
- /// are returned as an array of strings in order of score (contiguous fragments are merged into
- /// one in their original order to improve readability)
- ///
- /// </summary>
- /// <param name="text"> text to highlight terms in
- /// </param>
- /// <param name="maxNumFragments"> the maximum number of fragments.
- ///
- /// </param>
- /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
- /// </returns>
- public System.String[] GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments)
- {
- maxNumFragments = System.Math.Max(1, maxNumFragments); //sanity check
-
- TextFragment[] frag = GetBestTextFragments(tokenStream, text, true, maxNumFragments);
-
- //Get text
- System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
- for (int i = 0; i < frag.Length; i++)
- {
- if ((frag[i] != null) && (frag[i].GetScore() > 0))
- {
- fragTexts.Add(frag[i].ToString());
- }
- }
- return (System.String[]) fragTexts.ToArray(typeof(System.String));
- }
-
-
- /// <summary> Low level api to get the most relevant (formatted) sections of the document.
- /// This method has been made public to allow visibility of score information held in TextFragment objects.
- /// Thanks to Jason Calabrese for help in redefining the interface.
- /// </summary>
+ /// <summary>
+ /// Class used to markup highlighted terms found in the best sections of a
+    /// text, using configurable <see cref="IFragmenter"/>, <see cref="IScorer"/>, <see cref="IFormatter"/>,
+ /// <see cref="IEncoder"/> and tokenizers.
+ /// </summary>
+ public class Highlighter
+ {
+ public static readonly int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
+
+ private int _maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
+ private IFormatter _formatter;
+ private IEncoder _encoder;
+ private IFragmenter _textFragmenter = new SimpleFragmenter();
+ private IScorer _fragmentScorer = null;
+
+ public Highlighter(IScorer fragmentScorer)
+ : this(new SimpleHTMLFormatter(), fragmentScorer)
+ {
+ }
+
+
+ public Highlighter(IFormatter formatter, IScorer fragmentScorer)
+ : this(formatter, new DefaultEncoder(), fragmentScorer)
+ {
+ }
+
+
+ public Highlighter(IFormatter formatter, IEncoder encoder, IScorer fragmentScorer)
+ {
+ _formatter = formatter;
+ _encoder = encoder;
+ _fragmentScorer = fragmentScorer;
+ }
+
+ /// <summary>
+ /// Highlights chosen terms in a text, extracting the most relevant section.
+ /// This is a convenience method that calls <see cref="GetBestFragment(TokenStream, string)"/>
+ /// </summary>
+ /// <param name="analyzer">the analyzer that will be used to split <code>text</code> into chunks</param>
+ /// <param name="fieldName">Name of field used to influence analyzer's tokenization policy</param>
+ /// <param name="text">text to highlight terms in</param>
+ /// <returns>highlighted text fragment or null if no terms found</returns>
+ /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
+ public String GetBestFragment(Analyzer analyzer, String fieldName, String text)
+ {
+ TokenStream tokenStream = analyzer.TokenStream(fieldName, new StringReader(text));
+ return GetBestFragment(tokenStream, text);
+ }
+
+ /// <summary>
+ /// Highlights chosen terms in a text, extracting the most relevant section.
+ /// The document text is analysed in chunks to record hit statistics
+ /// across the document. After accumulating stats, the fragment with the highest score
+ /// is returned
+ /// </summary>
/// <param name="tokenStream">
- /// </param>
- /// <param name="text">
- /// </param>
- /// <param name="maxNumFragments">
- /// </param>
- /// <param name="mergeContiguousFragments">
- /// </param>
- /// <throws> IOException </throws>
- public TextFragment[] GetBestTextFragments(TokenStream tokenStream, System.String text, bool mergeContiguousFragments, int maxNumFragments)
- {
- //System.Collections.ArrayList docFrags = new System.Collections.ArrayList();
- //System.Text.StringBuilder newText = new System.Text.StringBuilder();
-
- //TextFragment currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
- //fragmentScorer.StartFragment(currentFrag);
- //docFrags.Add(currentFrag);
-
- //FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
-
- //try
- //{
- // Lucene.Net.Analysis.Token token;
- // System.String tokenText;
- // int startOffset;
- // int endOffset;
- // int lastEndOffset = 0;
- // textFragmenter.Start(text);
-
- // TokenGroup tokenGroup = new TokenGroup();
- // token = tokenStream.Next();
- // while ((token != null) && (token.StartOffset() < maxDocBytesToAnalyze))
- // {
- // if ((tokenGroup.numTokens > 0) && (tokenGroup.IsDistinct(token)))
- // {
- // //the current token is distinct from previous tokens -
- // // markup the cached token group info
- // startOffset = tokenGroup.matchStartOffset;
- // endOffset = tokenGroup.matchEndOffset;
- // tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
- // System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
- // //store any whitespace etc from between this and last group
- // if (startOffset > lastEndOffset)
- // newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
- // newText.Append(markedUpText);
- // lastEndOffset = System.Math.Max(endOffset, lastEndOffset);
- // tokenGroup.Clear();
-
- // //check if current token marks the start of a new fragment
- // if (textFragmenter.IsNewFragment(token))
- // {
- // currentFrag.SetScore(fragmentScorer.GetFragmentScore());
- // //record stats for a new fragment
- // currentFrag.textEndPos = newText.Length;
- // currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
- // fragmentScorer.StartFragment(currentFrag);
- // docFrags.Add(currentFrag);
- // }
- // }
-
- // tokenGroup.AddToken(token, fragmentScorer.GetTokenScore(token));
-
- // // if(lastEndOffset>maxDocBytesToAnalyze)
- // // {
- // // break;
- // // }
- // token = tokenStream.Next();
- // }
- // currentFrag.SetScore(fragmentScorer.GetFragmentScore());
-
- // if (tokenGroup.numTokens > 0)
- // {
- // //flush the accumulated text (same code as in above loop)
- // startOffset = tokenGroup.matchStartOffset;
- // endOffset = tokenGroup.matchEndOffset;
- // tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
- // System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
- // //store any whitespace etc from between this and last group
- // if (startOffset > lastEndOffset)
- // newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
- // newText.Append(markedUpText);
- // lastEndOffset = System.Math.Max(lastEndOffset, endOffset);
- // }
-
- // //Test what remains of the original text beyond the point where we stopped analyzing
- // if ((lastEndOffset < text.Length) && (text.Length < maxDocBytesToAnalyze))
- // {
- // //append it to the last fragment
- // newText.Append(encoder.EncodeText(text.Substring(lastEndOffset)));
- // }
-
- // currentFrag.textEndPos = newText.Length;
-
- // //sort the most relevant sections of the text
- // for (System.Collections.IEnumerator i = docFrags.GetEnumerator(); i.MoveNext(); )
- // {
- // currentFrag = (TextFragment) i.Current;
-
- // //If you are running with a version of Lucene before 11th Sept 03
- // // you do not have PriorityQueue.insert() - so uncomment the code below
- // /*
- // if (currentFrag.getScore() >= minScore)
- // {
- // fragQueue.put(currentFrag);
- // if (fragQueue.size() > maxNumFragments)
- // { // if hit queue overfull
- // fragQueue.pop(); // remove lowest in hit queue
- // minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
- // }
-
-
- // }
- // */
- // //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
- // //fix to PriorityQueue. The correct method to use here is the new "insert" method
- // // USE ABOVE CODE IF THIS DOES NOT COMPILE!
- // fragQueue.Insert(currentFrag);
- // }
-
- // //return the most relevant fragments
- // TextFragment[] frag = new TextFragment[fragQueue.Size()];
- // for (int i = frag.Length - 1; i >= 0; i--)
- // {
- // frag[i] = (TextFragment) fragQueue.Pop();
- // }
-
- // //merge any contiguous fragments to improve readability
- // if (mergeContiguousFragments)
- // {
- // MergeContiguousFragments(frag);
- // System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
- // for (int i = 0; i < frag.Length; i++)
- // {
- // if ((frag[i] != null) && (frag[i].GetScore() > 0))
- // {
- // fragTexts.Add(frag[i]);
- // }
- // }
- // frag = (TextFragment[]) fragTexts.ToArray(typeof(TextFragment));
- // }
-
- // return frag;
- //}
- //finally
- //{
- // if (tokenStream != null)
- // {
- // try
- // {
- // tokenStream.Close();
- // }
- // catch (System.Exception e)
- // {
- // }
- // }
- //}
- throw new NotImplementedException("Not yet ported to 3.0.3");
- }
-
-
- /// <summary>Improves readability of a score-sorted list of TextFragments by merging any fragments
- /// that were contiguous in the original text into one larger fragment with the correct order.
- /// This will leave a "null" in the array entry for the lesser scored fragment.
- ///
- /// </summary>
- /// <param name="frag">An array of document fragments in descending score
- /// </param>
- private void MergeContiguousFragments(TextFragment[] frag)
- {
- bool mergingStillBeingDone;
- if (frag.Length > 1)
- do
- {
- mergingStillBeingDone = false; //initialise loop control flag
- //for each fragment, scan other frags looking for contiguous blocks
- for (int i = 0; i < frag.Length; i++)
- {
- if (frag[i] == null)
- {
- continue;
- }
- //merge any contiguous blocks
- for (int x = 0; x < frag.Length; x++)
- {
- if (frag[x] == null)
- {
- continue;
- }
- if (frag[i] == null)
- {
- break;
- }
- TextFragment frag1 = null;
- TextFragment frag2 = null;
- int frag1Num = 0;
- int frag2Num = 0;
- int bestScoringFragNum;
- int worstScoringFragNum;
- //if blocks are contiguous....
- if (frag[i].Follows(frag[x]))
- {
- frag1 = frag[x];
- frag1Num = x;
- frag2 = frag[i];
- frag2Num = i;
- }
- else if (frag[x].Follows(frag[i]))
- {
- frag1 = frag[i];
- frag1Num = i;
- frag2 = frag[x];
- frag2Num = x;
- }
- //merging required..
- if (frag1 != null)
- {
- if (frag1.GetScore() > frag2.GetScore())
- {
- bestScoringFragNum = frag1Num;
- worstScoringFragNum = frag2Num;
- }
- else
- {
- bestScoringFragNum = frag2Num;
- worstScoringFragNum = frag1Num;
- }
- frag1.Merge(frag2);
- frag[worstScoringFragNum] = null;
- mergingStillBeingDone = true;
- frag[bestScoringFragNum] = frag1;
- }
- }
- }
- }
- while (mergingStillBeingDone);
- }
-
-
- /// <summary> Highlights terms in the text , extracting the most relevant sections
- /// and concatenating the chosen fragments with a separator (typically "...").
- /// The document text is analysed in chunks to record hit statistics
- /// across the document. After accumulating stats, the fragments with the highest scores
- /// are returned in order as "separator" delimited strings.
- ///
- /// </summary>
- /// <param name="text"> text to highlight terms in
- /// </param>
- /// <param name="maxNumFragments"> the maximum number of fragments.
- /// </param>
- /// <param name="separator"> the separator used to intersperse the document fragments (typically "...")
- ///
- /// </param>
- /// <returns> highlighted text
- /// </returns>
- public System.String GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments, System.String separator)
- {
- System.String[] sections = GetBestFragments(tokenStream, text, maxNumFragments);
- System.Text.StringBuilder result = new System.Text.StringBuilder();
- for (int i = 0; i < sections.Length; i++)
- {
- if (i > 0)
- {
- result.Append(separator);
- }
- result.Append(sections[i]);
- }
- return result.ToString();
- }
-
- /// <returns> the maximum number of bytes to be tokenized per doc
- /// </returns>
- public virtual int GetMaxDocBytesToAnalyze()
- {
- return maxDocBytesToAnalyze;
- }
-
- /// <param name="byteCount">the maximum number of bytes to be tokenized per doc
- /// (This can improve performance with large documents)
- /// </param>
- public virtual void SetMaxDocBytesToAnalyze(int byteCount)
- {
- maxDocBytesToAnalyze = byteCount;
- }
-
-
- public virtual IFragmenter GetTextFragmenter()
- {
- return textFragmenter;
- }
-
- /// <param name="fragmenter"> </param>
- public virtual void SetTextFragmenter(IFragmenter fragmenter)
- {
- textFragmenter = fragmenter;
- }
-
- /// <returns> Object used to score each text fragment
- /// </returns>
- public virtual IScorer GetFragmentScorer()
- {
- return fragmentScorer;
- }
-
-
- /// <param name="scorer">
- /// </param>
- public virtual void SetFragmentScorer(IScorer scorer)
- {
- fragmentScorer = scorer;
- }
-
- public virtual IEncoder GetEncoder()
- {
- return encoder;
- }
- public virtual void SetEncoder(IEncoder encoder)
- {
- this.encoder = encoder;
- }
- }
-
- class FragmentQueue : PriorityQueue<TextFragment>
- {
- public FragmentQueue(int size)
- {
- Initialize(size);
- }
-
- public override bool LessThan(TextFragment a, TextFragment b)
- {
- if (a.GetScore() == b.GetScore())
- return a.fragNum > b.fragNum;
- else
- return a.GetScore() < b.GetScore();
- }
- }
+ /// a stream of tokens identified in the text parameter, including offset information.
+ /// This is typically produced by an analyzer re-parsing a document's
+ /// text. Some work may be done on retrieving TokenStreams more efficiently
+ /// by adding support for storing original text position data in the Lucene
+ /// index but this support is not currently available (as of Lucene 1.4 rc2).
+ /// </param>
+ /// <param name="text">text to highlight terms in</param>
+ /// <returns>highlighted text fragment or null if no terms found</returns>
+ /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
+ public String GetBestFragment(TokenStream tokenStream, String text)
+ {
+ String[] results = GetBestFragments(tokenStream, text, 1);
+ if (results.Length > 0)
+ {
+ return results[0];
+ }
+ return null;
+ }
+
+ /// <summary>
+ /// Highlights chosen terms in a text, extracting the most relevant sections.
+ /// This is a convenience method that calls <see cref="GetBestFragments(TokenStream, string, int)"/>
+ /// </summary>
+ /// <param name="analyzer">the analyzer that will be used to split <code>text</code> into chunks</param>
+ /// <param name="fieldName">the name of the field being highlighted (used by analyzer)</param>
+ /// <param name="text">text to highlight terms in</param>
+ /// <param name="maxNumFragments">the maximum number of fragments.</param>
+ /// <returns>highlighted text fragments (between 0 and maxNumFragments number of fragments)</returns>
+ /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
+ public String[] GetBestFragments(
+ Analyzer analyzer,
+ String fieldName,
+ String text,
+ int maxNumFragments)
+ {
+ TokenStream tokenStream = analyzer.TokenStream(fieldName, new StringReader(text));
+ return GetBestFragments(tokenStream, text, maxNumFragments);
+ }
+
+ /// <summary>
+ /// Highlights chosen terms in a text, extracting the most relevant sections.
+ /// The document text is analysed in chunks to record hit statistics
+ /// across the document. After accumulating stats, the fragments with the highest scores
+ /// are returned as an array of strings in order of score (contiguous fragments are merged into
+ /// one in their original order to improve readability)
+ /// </summary>
+ /// <param name="tokenStream"></param>
+ /// <param name="text">text to highlight terms in</param>
+ /// <param name="maxNumFragments">the maximum number of fragments.</param>
+ /// <returns>highlighted text fragments (between 0 and maxNumFragments number of fragments)</returns>
+ /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
+ public String[] GetBestFragments(TokenStream tokenStream, String text, int maxNumFragments)
+ {
+ maxNumFragments = Math.Max(1, maxNumFragments); //sanity check
+
+ TextFragment[] frag = GetBestTextFragments(tokenStream, text, true, maxNumFragments);
+
+ //Get text
+ var fragTexts = new List<String>();
+ for (int i = 0; i < frag.Length; i++)
+ {
+ if ((frag[i] != null) && (frag[i].Score > 0))
+ {
+ fragTexts.Add(frag[i].ToString());
+ }
+ }
+ return fragTexts.ToArray();
+ }
+
+ /// <summary>
+ /// Low level api to get the most relevant (formatted) sections of the document.
+ /// This method has been made public to allow visibility of score information held in TextFragment objects.
+ /// Thanks to Jason Calabrese for help in redefining the interface.
+ /// </summary>
+ public TextFragment[] GetBestTextFragments(
+ TokenStream tokenStream,
+ String text,
+ bool mergeContiguousFragments,
+ int maxNumFragments)
+ {
+ var docFrags = new List<TextFragment>();
+ var newText = new StringBuilder();
+
+ var termAtt = tokenStream.AddAttribute<TermAttribute>();
+ var offsetAtt = tokenStream.AddAttribute<OffsetAttribute>();
+ tokenStream.AddAttribute<PositionIncrementAttribute>();
+ tokenStream.Reset();
+
+ var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
+ var newStream = _fragmentScorer.Init(tokenStream);
+ if (newStream != null)
+ {
+ tokenStream = newStream;
+ }
+ _fragmentScorer.StartFragment(currentFrag);
+ docFrags.Add(currentFrag);
+
+ var fragQueue = new FragmentQueue(maxNumFragments);
+
+ try
+ {
+
+ String tokenText;
+ int startOffset;
+ int endOffset;
+ int lastEndOffset = 0;
+ _textFragmenter.Start(text, tokenStream);
+
+ var tokenGroup = new TokenGroup(tokenStream);
+
+ for (bool next = tokenStream.IncrementToken();
+ next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
+ next = tokenStream.IncrementToken())
+ {
+ if ((offsetAtt.EndOffset > text.Length)
+ ||
+ (offsetAtt.StartOffset > text.Length)
+ )
+ {
+ throw new InvalidTokenOffsetsException("Token " + termAtt.Term()
+ + " exceeds length of provided text sized " + text.Length);
+ }
+ if ((tokenGroup.GetNumTokens() > 0) && (tokenGroup.IsDistinct()))
+ {
+ //the current token is distinct from previous tokens -
+ // markup the cached token group info
+ startOffset = tokenGroup.MatchStartOffset;
+ endOffset = tokenGroup.MatchEndOffset;
+ tokenText = text.Substring(startOffset, endOffset - startOffset);
+ String markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
+ //store any whitespace etc from between this and last group
+ if (startOffset > lastEndOffset)
+ newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
+ newText.Append(markedUpText);
+ lastEndOffset = Math.Max(endOffset, lastEndOffset);
+ tokenGroup.Clear();
+
+ //check if current token marks the start of a new fragment
+ if (_textFragmenter.IsNewFragment())
+ {
+ currentFrag.Score = _fragmentScorer.GetFragmentScore();
+ //record stats for a new fragment
+ currentFrag.TextEndPos = newText.Length;
+ currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
+ _fragmentScorer.StartFragment(currentFrag);
+ docFrags.Add(currentFrag);
+ }
+ }
+
+ tokenGroup.AddToken(_fragmentScorer.GetTokenScore());
+
+ // if(lastEndOffset>maxDocBytesToAnalyze)
+ // {
+ // break;
+ // }
+ }
+ currentFrag.Score = _fragmentScorer.GetFragmentScore();
+
+ if (tokenGroup.NumTokens > 0)
+ {
+ //flush the accumulated text (same code as in above loop)
+ startOffset = tokenGroup.MatchStartOffset;
+ endOffset = tokenGroup.MatchEndOffset;
+ tokenText = text.Substring(startOffset, endOffset - startOffset);
+ var markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
+ //store any whitespace etc from between this and last group
+ if (startOffset > lastEndOffset)
+ newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
+ newText.Append(markedUpText);
+ lastEndOffset = Math.Max(lastEndOffset, endOffset);
+ }
+
+ //Test what remains of the original text beyond the point where we stopped analyzing
+ if (
+ // if there is text beyond the last token considered..
+ (lastEndOffset < text.Length)
+ &&
+ // and that text is not too large...
+ (text.Length <= _maxDocCharsToAnalyze)
+ )
+ {
+ //append it to the last fragment
+ newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
+ }
+
+ currentFrag.TextEndPos = newText.Length;
+
+ //sort the most relevant sections of the text
+ foreach (var f in docFrags)
+ {
+ currentFrag = f;
+
+ //If you are running with a version of Lucene before 11th Sept 03
+ // you do not have PriorityQueue.insert() - so uncomment the code below
+ /*
+ if (currentFrag.getScore() >= minScore)
+ {
+ fragQueue.put(currentFrag);
+ if (fragQueue.size() > maxNumFragments)
+ { // if hit queue overfull
+ fragQueue.pop(); // remove lowest in hit queue
+ minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
+ }
+
+
+ }
+ */
+ //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
+ //fix to PriorityQueue. The correct method to use here is the new "insert" method
+ // USE ABOVE CODE IF THIS DOES NOT COMPILE!
+ fragQueue.InsertWithOverflow(currentFrag);
+ }
+
+ //return the most relevant fragments
+ var frag = new TextFragment[fragQueue.Size()];
+ for (int i = frag.Length - 1; i >= 0; i--)
+ {
+ frag[i] = fragQueue.Pop();
+ }
+
+ //merge any contiguous fragments to improve readability
+ if (mergeContiguousFragments)
+ {
+ MergeContiguousFragments(frag);
+ frag = frag.Where(t => (t != null) && (t.Score > 0)).ToArray();
+ }
+
+ return frag;
+
+ }
+ finally
+ {
+ if (tokenStream != null)
+ {
+ try
+ {
+ tokenStream.Close();
+ }
+ catch (Exception)
+ {
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Improves readability of a score-sorted list of TextFragments by merging any fragments
+ /// that were contiguous in the original text into one larger fragment with the correct order.
+ /// This will leave a "null" in the array entry for the lesser scored fragment.
+ /// </summary>
+ /// <param name="frag">An array of document fragments in descending score</param>
+ private void MergeContiguousFragments(TextFragment[] frag)
+ {
+ bool mergingStillBeingDone;
+ if (frag.Length > 1)
+ do
+ {
+ mergingStillBeingDone = false; //initialise loop control flag
+ //for each fragment, scan other frags looking for contiguous blocks
+ for (int i = 0; i < frag.Length; i++)
+ {
+ if (frag[i] == null)
+ {
+ continue;
+ }
+ //merge any contiguous blocks
+ for (int x = 0; x < frag.Length; x++)
+ {
+ if (frag[x] == null)
+ {
+ continue;
+ }
+ if (frag[i] == null)
+ {
+ break;
+ }
+ TextFragment frag1 = null;
+ TextFragment frag2 = null;
+ int frag1Num = 0;
+ int frag2Num = 0;
+ int bestScoringFragNum;
+ int worstScoringFragNum;
+ //if blocks are contiguous....
+ if (frag[i].Follows(frag[x]))
+ {
+ frag1 = frag[x];
+ frag1Num = x;
+ frag2 = frag[i];
+ frag2Num = i;
+ }
+ else if (frag[x].Follows(frag[i]))
+ {
+ frag1 = frag[i];
+ frag1Num = i;
+ frag2 = frag[x];
+ frag2Num = x;
+ }
+ //merging required..
+ if (frag1 != null)
+ {
+ if (frag1.Score > frag2.Score)
+ {
+ bestScoringFragNum = frag1Num;
+ worstScoringFragNum = frag2Num;
+ }
+ else
+ {
+ bestScoringFragNum = frag2Num;
+ worstScoringFragNum = frag1Num;
+ }
+ frag1.Merge(frag2);
+ frag[worstScoringFragNum] = null;
+ mergingStillBeingDone = true;
+ frag[bestScoringFragNum] = frag1;
+ }
+ }
+ }
+ } while (mergingStillBeingDone);
+ }
+
+ /// <summary>
+        /// Highlights terms in the text, extracting the most relevant sections
+ /// and concatenating the chosen fragments with a separator (typically "...").
+ /// The document text is analysed in chunks to record hit statistics
+ /// across the document. After accumulating stats, the fragments with the highest scores
+ /// are returned in order as "separator" delimited strings.
+ /// </summary>
+ /// <param name="tokenStream"></param>
+ /// <param name="text">text to highlight terms in</param>
+ /// <param name="maxNumFragments">the maximum number of fragments.</param>
+ /// <param name="separator">the separator used to intersperse the document fragments (typically "...")</param>
+ /// <returns>highlighted text</returns>
+ public String GetBestFragments(
+ TokenStream tokenStream,
+ String text,
+ int maxNumFragments,
+ String separator)
+ {
+ string[] sections = GetBestFragments(tokenStream, text, maxNumFragments);
+ StringBuilder result = new StringBuilder();
+ for (int i = 0; i < sections.Length; i++)
+ {
+ if (i > 0)
+ {
+ result.Append(separator);
+ }
+ result.Append(sections[i]);
+ }
+ return result.ToString();
+ }
+
+ public int MaxDocCharsToAnalyze
+ {
+ get { return _maxDocCharsToAnalyze; }
+ set { this._maxDocCharsToAnalyze = value; }
+ }
+
+
+ public IFragmenter TextFragmenter
+ {
+ get { return _textFragmenter; }
+ set { _textFragmenter = value; }
+ }
+
+ public IScorer FragmentScorer
+ {
+ get { return _fragmentScorer; }
+ set { _fragmentScorer = value; }
+ }
+
+ public IEncoder Encoder
+ {
+ get { return _encoder; }
+ set { this._encoder = value; }
+ }
+ }
+
+ internal class FragmentQueue : PriorityQueue<TextFragment>
+ {
+ public FragmentQueue(int size)
+ {
+ Initialize(size);
+ }
+
+ public override bool LessThan(TextFragment fragA, TextFragment fragB)
+ {
+ if (fragA.Score == fragB.Score)
+ return fragA.FragNum > fragB.FragNum;
+ else
+ return fragA.Score < fragB.Score;
+ }
+ }
}
Added: incubator/lucene.net/trunk/src/contrib/Highlighter/IFormatter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/IFormatter.cs?rev=1310635&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/IFormatter.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/IFormatter.cs Fri Apr 6 23:37:48 2012
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Search.Highlight
+{
+ /// <summary> Processes terms found in the original text, typically by applying some form
+ /// of mark-up to highlight terms in HTML search results pages.</summary>
+ public interface IFormatter
+ {
+ /// <param name="originalText">The section of text being considered for markup</param>
+ /// <param name="tokenGroup">contains one or several overlapping Tokens along with
+ /// their scores and positions.</param>
+ string HighlightTerm(System.String originalText, TokenGroup tokenGroup);
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/src/contrib/Highlighter/IFragmenter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/IFragmenter.cs?rev=1310635&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/IFragmenter.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/IFragmenter.cs Fri Apr 6 23:37:48 2012
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Search.Highlight
+{
+
+ /// <summary> Implements the policy for breaking text into multiple fragments for consideration
+ /// by the <see cref="Highlighter"/> class. A sophisticated implementation may do this on the basis
+ /// of detecting end of sentences in the text.
+ /// </summary>
+ /// <author> mark@searcharea.co.uk
+ /// </author>
+ public interface IFragmenter
+ {
+ /// <summary>
+ /// Initializes the Fragmenter. You can grab references to the Attributes you are
+        /// interested in from tokenStream and then access the values in <see cref="IsNewFragment()"/>.
+ /// </summary>
+ /// <param name="originalText">the original source text</param>
+        ///<param name="tokenStream">the <see cref="TokenStream" /> to be fragmented</param>
+ void Start(string originalText, TokenStream tokenStream);
+
+ /// <summary>
+ /// Test to see if this token from the stream should be held in a new
+ /// TextFragment. Every time this is called, the TokenStream
+        /// passed to <see cref="Start(string, TokenStream)"/> will have been incremented.
+ /// </summary>
+ bool IsNewFragment();
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/src/contrib/Highlighter/IScorer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/IScorer.cs?rev=1310635&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/IScorer.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/IScorer.cs Fri Apr 6 23:37:48 2012
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Search.Highlight
+{
+ /// <summary> Adds to the score for a fragment based on its tokens</summary>
+ public interface IScorer
+ {
+ /// <summary>
+        /// Called to init the Scorer with a <see cref="TokenStream"/>. You can grab references to
+        /// the attributes you are interested in here and access them from <see cref="GetTokenScore()"/>.
+ /// </summary>
+        /// <param name="tokenStream">the <see cref="TokenStream"/> that will be scored.</param>
+ /// <returns>
+        /// either a <see cref="TokenStream"/> that the Highlighter should continue using (eg
+        /// if you read the tokenStream in this method) or null to continue
+        /// using the same <see cref="TokenStream"/> that was passed in.
+ /// </returns>
+ /// <exception cref="IOException"></exception>
+ ///
+ TokenStream Init(TokenStream tokenStream);
+
+ /// <summary>
+ /// Called when a new fragment is started for consideration.
+ /// </summary>
+ /// <param name="newFragment">the fragment that will be scored next</param>
+ void StartFragment(TextFragment newFragment);
+
+ /// <summary>
+        /// Called for each token in the current fragment. The <see cref="Highlighter"/> will
+        /// increment the <see cref="TokenStream"/> passed to <see cref="Init"/> on every call.
+ /// </summary>
+        /// <returns>a score which is passed to the <see cref="Highlighter"/> class to influence the
+ /// mark-up of the text (this return value is NOT used to score the
+ /// fragment)</returns>
+ float GetTokenScore();
+
+ ///<summary>
+        /// Called when the <see cref="Highlighter"/> has no more tokens for the current fragment -
+        /// the Scorer returns the weighting it has derived for the most recent
+        /// fragment, typically based on the results of <see cref="GetTokenScore()"/>.
+ /// </summary>
+ float GetFragmentScore();
+ }
+}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/NullFragmenter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/NullFragmenter.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/NullFragmenter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/NullFragmenter.cs Fri Apr 6 23:37:48 2012
@@ -15,12 +15,9 @@
* limitations under the License.
*/
-using System;
using Lucene.Net.Analysis;
-using Lucene.Net.Search.Highlight;
-using Token = Lucene.Net.Analysis.Token;
-namespace Lucene.Net.Highlight
+namespace Lucene.Net.Search.Highlight
{
/// <summary> <see cref="IFragmenter"/> implementation which does not fragment the text.
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/QueryScorer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/QueryScorer.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/QueryScorer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/QueryScorer.cs Fri Apr 6 23:37:48 2012
@@ -19,7 +19,6 @@ using System;
using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
-using Lucene.Net.Highlight;
using Lucene.Net.Index;
using Lucene.Net.Search.Spans;
using Lucene.Net.Support;
@@ -51,114 +50,102 @@ namespace Lucene.Net.Search.Highlight
private bool skipInitExtractor;
private bool wrapToCaching = true;
- /**
- * @param query Query to use for highlighting
- */
-
+ /// <summary>
+ /// Constructs a new QueryScorer instance
+ /// </summary>
+ /// <param name="query">Query to use for highlighting</param>
public QueryScorer(Query query)
{
- init(query, null, null, true);
+ Init(query, null, null, true);
}
- /**
- * @param query Query to use for highlighting
- * @param field Field to highlight - pass null to ignore fields
- */
-
+ /// <summary>
+ /// Constructs a new QueryScorer instance
+ /// </summary>
+ /// <param name="query">Query to use for highlighting</param>
+ /// <param name="field">Field to highlight - pass null to ignore fields</param>
public QueryScorer(Query query, String field)
{
- init(query, field, null, true);
+ Init(query, field, null, true);
}
- /**
- * @param query Query to use for highlighting
- * @param field Field to highlight - pass null to ignore fields
- * @param reader {@link IndexReader} to use for quasi tf/idf scoring
- */
-
+ /// <summary>
+ /// Constructs a new QueryScorer instance
+ /// </summary>
+ /// <param name="query">Query to use for highlighting</param>
+ /// <param name="reader"><see cref="IndexReader"/> to use for quasi tf/idf scoring</param>
+ /// <param name="field">Field to highlight - pass null to ignore fields</param>
public QueryScorer(Query query, IndexReader reader, String field)
{
- init(query, field, reader, true);
+ Init(query, field, reader, true);
}
-
- /**
- * @param query to use for highlighting
- * @param reader {@link IndexReader} to use for quasi tf/idf scoring
- * @param field to highlight - pass null to ignore fields
- * @param defaultField
- */
-
+ /// <summary>
+ /// Constructs a new QueryScorer instance
+ /// </summary>
+ /// <param name="query">Query to use for highlighting</param>
+ /// <param name="reader"><see cref="IndexReader"/> to use for quasi tf/idf scoring</param>
+ /// <param name="field">Field to highlight - pass null to ignore fields</param>
+ /// <param name="defaultField">The default field for queries with the field name unspecified</param>
public QueryScorer(Query query, IndexReader reader, String field, String defaultField)
{
this.defaultField = StringHelper.Intern(defaultField);
- init(query, field, reader, true);
+ Init(query, field, reader, true);
}
- /**
- * @param defaultField - The default field for queries with the field name unspecified
- */
+ /// <summary>
+ /// Constructs a new QueryScorer instance
+ /// </summary>
+ /// <param name="query">Query to use for highlighting</param>
+ /// <param name="field">Field to highlight - pass null to ignore fields</param>
+ /// <param name="defaultField">The default field for queries with the field name unspecified</param>
public QueryScorer(Query query, String field, String defaultField)
{
this.defaultField = StringHelper.Intern(defaultField);
- init(query, field, null, true);
+ Init(query, field, null, true);
}
- /**
- * @param weightedTerms an array of pre-created {@link WeightedSpanTerm}s
- */
-
+ /// <summary>
+ /// Constructs a new QueryScorer instance
+ /// </summary>
+ /// <param name="weightedTerms">an array of pre-created <see cref="WeightedSpanTerm"/>s</param>
public QueryScorer(WeightedSpanTerm[] weightedTerms)
{
this.fieldWeightedSpanTerms = new HashMap<String, WeightedSpanTerm>(weightedTerms.Length);
- for (int i = 0; i < weightedTerms.Length; i++)
+ foreach (WeightedSpanTerm t in weightedTerms)
{
- WeightedSpanTerm existingTerm = fieldWeightedSpanTerms[weightedTerms[i].term];
+ WeightedSpanTerm existingTerm = fieldWeightedSpanTerms[t.Term];
if ((existingTerm == null) ||
- (existingTerm.weight < weightedTerms[i].weight))
+ (existingTerm.Weight < t.Weight))
{
// if a term is defined more than once, always use the highest
- // scoring weight
- fieldWeightedSpanTerms[weightedTerms[i].term] = weightedTerms[i];
- maxTermWeight = Math.Max(maxTermWeight, weightedTerms[i].GetWeight());
+ // scoring Weight
+ fieldWeightedSpanTerms[t.Term] = t;
+ maxTermWeight = Math.Max(maxTermWeight, t.Weight);
}
}
skipInitExtractor = true;
}
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
- */
-
- public float getFragmentScore()
+ /// <seealso cref="IScorer.GetFragmentScore()"/>
+ public float GetFragmentScore()
{
return totalScore;
}
- /**
- *
- * @return The highest weighted term (useful for passing to
- * GradientFormatter to set top end of coloring scale).
- */
-
- public float getMaxTermWeight()
+ /// <summary>
+ /// The highest weighted term (useful for passing to GradientFormatter to set top end of coloring scale).
+ /// </summary>
+ public float GetMaxTermWeight()
{
return maxTermWeight;
}
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
- * int)
- */
-
- public float getTokenScore()
+ /// <seealso cref="IScorer.GetTokenScore"/>
+ public float GetTokenScore()
{
position += posIncAtt.PositionIncrement;
String termText = termAtt.Term();
@@ -170,13 +157,13 @@ namespace Lucene.Net.Search.Highlight
return 0;
}
- if (weightedSpanTerm.isPositionSensitive() &&
- !weightedSpanTerm.checkPosition(position))
+ if (weightedSpanTerm.IsPositionSensitive() &&
+ !weightedSpanTerm.CheckPosition(position))
{
return 0;
}
- float score = weightedSpanTerm.GetWeight();
+ float score = weightedSpanTerm.Weight;
// found a query term - is it unique in this doc?
if (!foundTerms.Contains(termText))
@@ -188,11 +175,8 @@ namespace Lucene.Net.Search.Highlight
return score;
}
- /* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
- */
-
- public TokenStream init(TokenStream tokenStream)
+ /// <seealso cref="IScorer.Init"/>
+ public TokenStream Init(TokenStream tokenStream)
{
position = -1;
termAtt = tokenStream.AddAttribute<TermAttribute>();
@@ -203,28 +187,23 @@ namespace Lucene.Net.Search.Highlight
{
fieldWeightedSpanTerms.Clear();
}
- return initExtractor(tokenStream);
+ return InitExtractor(tokenStream);
}
return null;
}
- /**
- * Retrieve the {@link WeightedSpanTerm} for the specified token. Useful for passing
- * Span information to a {@link Fragmenter}.
- *
- * @param token to get {@link WeightedSpanTerm} for
- * @return WeightedSpanTerm for token
- */
-
- public WeightedSpanTerm getWeightedSpanTerm(String token)
+ /// <summary>
+ /// Retrieve the <see cref="WeightedSpanTerm"/> for the specified token. Useful for passing
+ /// Span information to a <see cref="IFragmenter"/>.
+ /// </summary>
+        /// <param name="token">token to get <see cref="WeightedSpanTerm"/> for</param>
+ /// <returns>WeightedSpanTerm for token</returns>
+ public WeightedSpanTerm GetWeightedSpanTerm(String token)
{
return fieldWeightedSpanTerms[token];
}
-
- /**
- */
-
- private void init(Query query, String field, IndexReader reader, bool expandMultiTermQuery)
+
+ private void Init(Query query, String field, IndexReader reader, bool expandMultiTermQuery)
{
this.reader = reader;
this.expandMultiTermQuery = expandMultiTermQuery;
@@ -232,76 +211,57 @@ namespace Lucene.Net.Search.Highlight
this.field = field;
}
- private TokenStream initExtractor(TokenStream tokenStream)
+ private TokenStream InitExtractor(TokenStream tokenStream)
{
WeightedSpanTermExtractor qse = defaultField == null
? new WeightedSpanTermExtractor()
: new WeightedSpanTermExtractor(defaultField);
- qse.setExpandMultiTermQuery(expandMultiTermQuery);
- qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
+ qse.SetExpandMultiTermQuery(expandMultiTermQuery);
+ qse.SetWrapIfNotCachingTokenFilter(wrapToCaching);
if (reader == null)
{
- this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
+ this.fieldWeightedSpanTerms = qse.GetWeightedSpanTerms(query,
tokenStream, field);
}
else
{
- this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query,
+ this.fieldWeightedSpanTerms = qse.GetWeightedSpanTermsWithScores(query,
tokenStream, field, reader);
}
- if (qse.isCachedTokenStream())
+ if (qse.IsCachedTokenStream())
{
- return qse.getTokenStream();
+ return qse.GetTokenStream();
}
return null;
}
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
- */
-
- public void startFragment(TextFragment newFragment)
+ /// <seealso cref="IScorer.StartFragment"/>
+ public void StartFragment(TextFragment newFragment)
{
foundTerms = new HashSet<String>();
totalScore = 0;
}
- /**
- * @return true if multi-term queries should be expanded
- */
-
- public bool isExpandMultiTermQuery()
- {
- return expandMultiTermQuery;
- }
-
- /**
- * Controls whether or not multi-term queries are expanded
- * against a {@link MemoryIndex} {@link IndexReader}.
- *
- * @param expandMultiTermQuery true if multi-term queries should be expanded
- */
-
- public void setExpandMultiTermQuery(bool expandMultiTermQuery)
- {
- this.expandMultiTermQuery = expandMultiTermQuery;
- }
-
- /**
- * By default, {@link TokenStream}s that are not of the type
- * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
- * ensure an efficient reset - if you are already using a different caching
- * {@link TokenStream} impl and you don't want it to be wrapped, set this to
- * false.
- *
- * @param wrap
- */
-
- public void setWrapIfNotCachingTokenFilter(bool wrap)
+ /// <summary>
+ /// Controls whether or not multi-term queries are expanded
+ /// against a <see cref="MemoryIndex"/> <see cref="IndexReader"/>.
+ /// </summary>
+ public bool IsExpandMultiTermQuery
+ {
+ get { return expandMultiTermQuery; }
+ set { this.expandMultiTermQuery = value; }
+ }
+
+ /// <summary>
+ /// By default, <see cref="TokenStream"/>s that are not of the type
+ /// <see cref="CachingTokenFilter"/> are wrapped in a <see cref="CachingTokenFilter"/> to
+ /// ensure an efficient reset - if you are already using a different caching
+ /// <see cref="TokenStream"/> impl and you don't want it to be wrapped, set this to
+ /// false.
+ /// </summary>
+ public void SetWrapIfNotCachingTokenFilter(bool wrap)
{
this.wrapToCaching = wrap;
}
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermExtractor.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermExtractor.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermExtractor.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermExtractor.cs Fri Apr 6 23:37:48 2012
@@ -19,7 +19,6 @@ using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
-using Lucene.Net.Highlight;
using Lucene.Net.Index;
using Lucene.Net.Util;
@@ -61,7 +60,7 @@ namespace Lucene.Net.Search.Highlight
{
try
{
- int docFreq = reader.DocFreq(new Term(fieldName, t.term));
+ int docFreq = reader.DocFreq(new Term(fieldName, t.Term));
// docFreq counts deletes
if (totalNumDocs < docFreq)
{
@@ -69,7 +68,7 @@ namespace Lucene.Net.Search.Highlight
}
//IDF algorithm taken from DefaultSimilarity class
var idf = (float)(Math.Log((float)totalNumDocs / (double)(docFreq + 1)) + 1.0);
- t.weight *= idf;
+ t.Weight *= idf;
}
catch (IOException e)
{
@@ -152,8 +151,8 @@ namespace Lucene.Net.Search.Highlight
BooleanClause[] queryClauses = query.GetClauses();
for (int i = 0; i < queryClauses.Length; i++)
{
- if (prohibited || queryClauses[i].GetOccur() != BooleanClause.Occur.MUST_NOT)
- GetTerms(queryClauses[i].GetQuery(), terms, prohibited, fieldName);
+ if (prohibited || queryClauses[i].Occur != Occur.MUST_NOT)
+ GetTerms(queryClauses[i].Query, terms, prohibited, fieldName);
}
}
private static void GetTermsFromFilteredQuery(FilteredQuery query, HashSet<WeightedTerm> terms, bool prohibited, string fieldName)
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermScorer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermScorer.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermScorer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/QueryTermScorer.cs Fri Apr 6 23:37:48 2012
@@ -4,7 +4,6 @@ using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
-using Lucene.Net.Highlight;
using Lucene.Net.Index;
using Lucene.Net.Support;
@@ -72,14 +71,14 @@ namespace Lucene.Net.Search.Highlight
termsToFind = new HashMap<String, WeightedTerm>();
for (int i = 0; i < weightedTerms.Length; i++)
{
- WeightedTerm existingTerm = termsToFind[weightedTerms[i].term];
+ WeightedTerm existingTerm = termsToFind[weightedTerms[i].Term];
if ((existingTerm == null)
- || (existingTerm.weight < weightedTerms[i].weight))
+ || (existingTerm.Weight < weightedTerms[i].Weight))
{
// if a term is defined more than once, always use the highest scoring
- // weight
- termsToFind[weightedTerms[i].term] = weightedTerms[i];
- maxTermWeight = Math.Max(maxTermWeight, weightedTerms[i].GetWeight());
+ // Weight
+ termsToFind[weightedTerms[i].Term] = weightedTerms[i];
+ maxTermWeight = Math.Max(maxTermWeight, weightedTerms[i].Weight);
}
}
}
@@ -88,7 +87,7 @@ namespace Lucene.Net.Search.Highlight
* @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
*/
- public TokenStream init(TokenStream tokenStream)
+ public TokenStream Init(TokenStream tokenStream)
{
termAtt = tokenStream.AddAttribute<TermAttribute>();
return null;
@@ -102,7 +101,7 @@ namespace Lucene.Net.Search.Highlight
* .lucene.search.highlight.TextFragment)
*/
- public void startFragment(TextFragment newFragment)
+ public void StartFragment(TextFragment newFragment)
{
uniqueTermsInFragment = new HashSet<String>();
currentTextFragment = newFragment;
@@ -115,7 +114,7 @@ namespace Lucene.Net.Search.Highlight
* @see org.apache.lucene.search.highlight.Scorer#getTokenScore()
*/
- public float getTokenScore()
+ public float GetTokenScore()
{
String termText = termAtt.Term();
@@ -128,10 +127,10 @@ namespace Lucene.Net.Search.Highlight
// found a query term - is it unique in this doc?
if (!uniqueTermsInFragment.Contains(termText))
{
- totalScore += queryTerm.GetWeight();
+ totalScore += queryTerm.Weight;
uniqueTermsInFragment.Add(termText);
}
- return queryTerm.GetWeight();
+ return queryTerm.Weight;
}
@@ -139,7 +138,7 @@ namespace Lucene.Net.Search.Highlight
* @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
*/
- public float getFragmentScore()
+ public float GetFragmentScore()
{
return totalScore;
}
@@ -151,7 +150,7 @@ namespace Lucene.Net.Search.Highlight
* org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
*/
- public void allFragmentsProcessed()
+ public void AllFragmentsProcessed()
{
// this class has no special operations to perform at end of processing
}
@@ -162,7 +161,7 @@ namespace Lucene.Net.Search.Highlight
* to set top end of coloring scale.
*/
- public float getMaxTermWeight()
+ public float GetMaxTermWeight()
{
return maxTermWeight;
}
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleFragmenter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleFragmenter.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleFragmenter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleFragmenter.cs Fri Apr 6 23:37:48 2012
@@ -18,10 +18,8 @@
using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
-using Lucene.Net.Search.Highlight;
-using Token = Lucene.Net.Analysis.Token;
-namespace Lucene.Net.Highlight
+namespace Lucene.Net.Search.Highlight
{
/// <summary> <see cref="IFragmenter"/> implementation which breaks text up into same-size
@@ -81,7 +79,7 @@ namespace Lucene.Net.Highlight
* @return size in number of characters of each fragment
*/
- public int getFragmentSize()
+ public int GetFragmentSize()
{
return fragmentSize;
}
@@ -90,7 +88,7 @@ namespace Lucene.Net.Highlight
* @param size size in characters of each fragment
*/
- public void setFragmentSize(int size)
+ public void SetFragmentSize(int size)
{
fragmentSize = size;
}
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleHTMLEncoder.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleHTMLEncoder.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleHTMLEncoder.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleHTMLEncoder.cs Fri Apr 6 23:37:48 2012
@@ -17,9 +17,8 @@
using System;
using System.Text;
-using Lucene.Net.Search.Highlight;
-namespace Lucene.Net.Highlight
+namespace Lucene.Net.Search.Highlight
{
/// <summary> Simple <see cref="IEncoder"/> implementation to escape text for HTML output</summary>
public class SimpleHTMLEncoder : IEncoder
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleHTMLFormatter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleHTMLFormatter.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleHTMLFormatter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleHTMLFormatter.cs Fri Apr 6 23:37:48 2012
@@ -15,10 +15,7 @@
* limitations under the License.
*/
-using System;
-using Lucene.Net.Search.Highlight;
-
-namespace Lucene.Net.Highlight
+namespace Lucene.Net.Search.Highlight
{
/// <summary> Simple <see cref="IFormatter"/> implementation to highlight terms with a pre and post tag</summary>
/// <author> MAHarwood
Added: incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleSpanFragmenter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleSpanFragmenter.cs?rev=1310635&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleSpanFragmenter.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/SimpleSpanFragmenter.cs Fri Apr 6 23:37:48 2012
@@ -0,0 +1,91 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Search.Highlight
+{
+ public class SimpleSpanFragmenter : IFragmenter
+ {
+ private static int DEFAULT_FRAGMENT_SIZE = 100;
+ private int fragmentSize;
+ private int currentNumFrags;
+ private int position = -1;
+ private QueryScorer queryScorer;
+ private int waitForPos = -1;
+ private int textSize;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncAtt;
+ private OffsetAttribute offsetAtt;
+
+ /// <param name="queryScorer">QueryScorer that was used to score hits</param>
+ public SimpleSpanFragmenter(QueryScorer queryScorer)
+ : this(queryScorer, DEFAULT_FRAGMENT_SIZE)
+ {
+
+ }
+
+ /// <param name="queryScorer">QueryScorer that was used to score hits</param>
+ /// <param name="fragmentSize">size in bytes of each fragment</param>
+ public SimpleSpanFragmenter(QueryScorer queryScorer, int fragmentSize)
+ {
+ this.fragmentSize = fragmentSize;
+ this.queryScorer = queryScorer;
+ }
+
+ /// <seealso cref="IFragmenter.IsNewFragment"/>
+ public bool IsNewFragment()
+ {
+ position += posIncAtt.PositionIncrement;
+
+ if (waitForPos == position)
+ {
+ waitForPos = -1;
+ }
+ else if (waitForPos != -1)
+ {
+ return false;
+ }
+
+ WeightedSpanTerm wSpanTerm = queryScorer.GetWeightedSpanTerm(termAtt.Term());
+
+ if (wSpanTerm != null)
+ {
+ List<PositionSpan> positionSpans = wSpanTerm.GetPositionSpans();
+
+ for (int i = 0; i < positionSpans.Count; i++)
+ {
+ if (positionSpans[i].Start == position)
+ {
+ waitForPos = positionSpans[i].End + 1;
+ break;
+ }
+ }
+ }
+
+ bool isNewFrag = offsetAtt.EndOffset >= (fragmentSize*currentNumFrags)
+ && (textSize - offsetAtt.EndOffset) >= ((uint) fragmentSize >> 1);
+
+
+ if (isNewFrag)
+ {
+ currentNumFrags++;
+ }
+
+ return isNewFrag;
+ }
+
+ /// <seealso cref="IFragmenter.Start(string, TokenStream)"/>
+ public void Start(String originalText, TokenStream tokenStream)
+ {
+ position = -1;
+ currentNumFrags = 1;
+ textSize = originalText.Length;
+ termAtt = tokenStream.AddAttribute<TermAttribute>();
+ posIncAtt = tokenStream.AddAttribute<PositionIncrementAttribute>();
+ offsetAtt = tokenStream.AddAttribute<OffsetAttribute>();
+ }
+ }
+}
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/SpanGradientFormatter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/SpanGradientFormatter.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/SpanGradientFormatter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/SpanGradientFormatter.cs Fri Apr 6 23:37:48 2012
@@ -16,61 +16,58 @@
*/
using System;
+using System.Text;
-namespace Lucene.Net.Highlight
+namespace Lucene.Net.Search.Highlight
{
- /// <summary> Formats text with different color intensity depending on the score of the
- /// term using the span tag. GradientFormatter uses a bgcolor argument to the font tag which
- /// doesn't work in Mozilla, thus this class.
- ///
- /// </summary>
- /// <seealso cref="GradientFormatter">
- /// </seealso>
- /// <author> David Spencer dave@searchmorph.com
- /// </author>
-
- public class SpanGradientFormatter:GradientFormatter
- {
- public SpanGradientFormatter(float maxScore, System.String minForegroundColor, System.String maxForegroundColor, System.String minBackgroundColor, System.String maxBackgroundColor):base(maxScore, minForegroundColor, maxForegroundColor, minBackgroundColor, maxBackgroundColor)
- {
- }
-
-
-
- public override System.String HighlightTerm(System.String originalText, TokenGroup tokenGroup)
- {
- if (tokenGroup.GetTotalScore() == 0)
- return originalText;
- float score = tokenGroup.GetTotalScore();
- if (score == 0)
- {
- return originalText;
- }
-
- // try to size sb correctly
- System.Text.StringBuilder sb = new System.Text.StringBuilder(originalText.Length + EXTRA);
-
- sb.Append("<span style=\"");
- if (highlightForeground)
- {
- sb.Append("color: ");
- sb.Append(GetForegroundColorString(score));
- sb.Append("; ");
- }
- if (highlightBackground)
- {
- sb.Append("background: ");
- sb.Append(GetBackgroundColorString(score));
- sb.Append("; ");
- }
- sb.Append("\">");
- sb.Append(originalText);
- sb.Append("</span>");
- return sb.ToString();
- }
-
- // guess how much extra text we'll add to the text we're highlighting to try to avoid a StringBuffer resize
- private const System.String TEMPLATE = "<span style=\"background: #EEEEEE; color: #000000;\">...</span>";
- private static readonly int EXTRA = TEMPLATE.Length;
- }
+ /// <summary>
+ /// Formats text with different color intensity depending on the score of the
+ /// term using the span tag. GradientFormatter uses a bgcolor argument to the font tag which
+ /// doesn't work in Mozilla, thus this class.
+ /// </summary>
+ /// <seealso cref="GradientFormatter"/>
+ public class SpanGradientFormatter : GradientFormatter
+ {
+ // guess how much extra text we'll add to the text we're highlighting to try to avoid a StringBuilder resize
+ private static readonly String TEMPLATE = "<span style=\"background: #EEEEEE; color: #000000;\">...</span>";
+ private static readonly int EXTRA = TEMPLATE.Length;
+
+ public SpanGradientFormatter(float maxScore, String minForegroundColor,
+ String maxForegroundColor, String minBackgroundColor,
+ String maxBackgroundColor)
+ : base(maxScore, minForegroundColor, maxForegroundColor, minBackgroundColor, maxBackgroundColor)
+ { }
+
+ public override String HighlightTerm(String originalText, TokenGroup tokenGroup)
+ {
+ if (tokenGroup.GetTotalScore() == 0)
+ return originalText;
+ float score = tokenGroup.GetTotalScore();
+ if (score == 0)
+ {
+ return originalText;
+ }
+
+ // try to size sb correctly
+ var sb = new StringBuilder(originalText.Length + EXTRA);
+
+ sb.Append("<span style=\"");
+ if (highlightForeground)
+ {
+ sb.Append("color: ");
+ sb.Append(GetForegroundColorString(score));
+ sb.Append("; ");
+ }
+ if (highlightBackground)
+ {
+ sb.Append("background: ");
+ sb.Append(GetBackgroundColorString(score));
+ sb.Append("; ");
+ }
+ sb.Append("\">");
+ sb.Append(originalText);
+ sb.Append("</span>");
+ return sb.ToString();
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/TextFragment.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/TextFragment.cs?rev=1310635&r1=1310634&r2=1310635&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/TextFragment.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/TextFragment.cs Fri Apr 6 23:37:48 2012
@@ -16,66 +16,58 @@
*/
using System;
+using System.Text;
-namespace Lucene.Net.Highlight
+namespace Lucene.Net.Search.Highlight
{
- /// <summary> Low-level class used to record information about a section of a document
- /// with a score.
- /// </summary>
- /// <author> MAHarwood
- ///
- ///
- /// </author>
- public class TextFragment
- {
- internal System.Text.StringBuilder markedUpText;
- internal int fragNum;
- internal int textStartPos;
- internal int textEndPos;
- internal float score;
-
- public TextFragment(System.Text.StringBuilder markedUpText, int textStartPos, int fragNum)
- {
- this.markedUpText = markedUpText;
- this.textStartPos = textStartPos;
- this.fragNum = fragNum;
- }
- internal virtual void SetScore(float score)
- {
- this.score = score;
- }
- public virtual float GetScore()
- {
- return score;
- }
- /// <param name="frag2">Fragment to be merged into this one
- /// </param>
- public virtual void Merge(TextFragment frag2)
- {
- textEndPos = frag2.textEndPos;
- score = System.Math.Max(score, frag2.score);
- }
- /// <param name="fragment">
- /// </param>
- /// <returns> true if this fragment follows the one passed
- /// </returns>
- public virtual bool Follows(TextFragment fragment)
- {
- return textStartPos == fragment.textEndPos;
- }
-
- /// <returns> the fragment sequence number
- /// </returns>
- public virtual int GetFragNum()
- {
- return fragNum;
- }
-
- /* Returns the marked-up text for this text fragment
- */
- public override System.String ToString()
- {
- return markedUpText.ToString(textStartPos, textEndPos - textStartPos);
- }
- }
/// <summary> Low-level class used to record information about a section of a document
/// with a score.
/// </summary>
public class TextFragment
{
    // Full marked-up text of the source document; this fragment is the
    // [TextStartPos, TextEndPos) slice of it. Assigned only in the
    // constructor, hence readonly.
    private readonly StringBuilder markedUpText;

    public TextFragment(StringBuilder markedUpText, int textStartPos, int fragNum)
    {
        this.markedUpText = markedUpText;
        this.TextStartPos = textStartPos;
        this.FragNum = fragNum;
    }

    /// <summary>Score for this fragment, as assigned by the highlighter.</summary>
    public float Score { get; protected internal set; }

    /// <summary>End offset (exclusive) of this fragment within the marked-up text.</summary>
    public int TextEndPos { get; protected internal set; }

    /// <summary>Start offset (inclusive) of this fragment within the marked-up text.</summary>
    public int TextStartPos { get; protected internal set; }

    /// <summary>The fragment sequence number.</summary>
    public int FragNum { get; protected internal set; }

    /// <summary>
    /// Merges the given fragment into this one: extends this fragment's end
    /// position and keeps the higher of the two scores.
    /// </summary>
    /// <param name="frag2">Fragment to be merged into this one</param>
    public void Merge(TextFragment frag2)
    {
        TextEndPos = frag2.TextEndPos;
        Score = Math.Max(Score, frag2.Score);
    }

    /// <summary>
    /// True if this fragment starts exactly where the given fragment ends.
    /// </summary>
    public bool Follows(TextFragment fragment)
    {
        return TextStartPos == fragment.TextEndPos;
    }

    /// <summary>
    /// Returns the marked-up text for this text fragment.
    /// </summary>
    public override String ToString()
    {
        return markedUpText.ToString(TextStartPos, TextEndPos - TextStartPos);
    }
}
}
\ No newline at end of file