You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2013/09/06 00:11:23 UTC
[lucy-commits] [4/4] git commit: refs/heads/cfish-string-prep1 - Rework Highlighter
to use StringIterators
Rework Highlighter to use StringIterators
Don't scan the whole document for sentences but find sentence or word
boundaries directly from the start and end of a fragment using string
iterators. Merge Find_Best_Fragment into Raw_Excerpt.
As a side effect, this should fix LUCY-199 in an acceptable way. It
would still be nice to have a third class of boundaries which breaks on
punctuation and symbols, so we'd find boundaries like this:
* Try to break on sentence boundaries.
* If no sentence boundary can be found, try to break on whitespace.
* If no whitespace can be found, try to break on punctuation and
symbols.
We'd need a way to look up Unicode general categories for this to work,
though.
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/1f51cae0
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/1f51cae0
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/1f51cae0
Branch: refs/heads/cfish-string-prep1
Commit: 1f51cae0f9e6df27bcb9c10c902fda5a8c5782a1
Parents: 52bab25
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Thu Sep 5 01:34:40 2013 +0200
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Thu Sep 5 23:53:17 2013 +0200
----------------------------------------------------------------------
core/Lucy/Highlight/Highlighter.c | 577 +++++++++---------------
core/Lucy/Highlight/Highlighter.cfh | 34 +-
core/Lucy/Test/Highlight/TestHighlighter.c | 206 ++-------
3 files changed, 253 insertions(+), 564 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucy/blob/1f51cae0/core/Lucy/Highlight/Highlighter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Highlight/Highlighter.c b/core/Lucy/Highlight/Highlighter.c
index d98c027..a4529e7 100644
--- a/core/Lucy/Highlight/Highlighter.c
+++ b/core/Lucy/Highlight/Highlighter.c
@@ -60,7 +60,6 @@ Highlighter_init(Highlighter *self, Searcher *searcher, Obj *query,
ivars->field = Str_Clone(field);
ivars->excerpt_length = excerpt_length;
ivars->slop = excerpt_length / 3;
- ivars->window_width = excerpt_length + (ivars->slop * 2);
ivars->pre_tag = Str_new_from_trusted_utf8("<strong>", 8);
ivars->post_tag = Str_new_from_trusted_utf8("</strong>", 9);
if (Query_Is_A(ivars->query, COMPILER)) {
@@ -165,7 +164,6 @@ Highlighter_Create_Excerpt_IMP(Highlighter *self, HitDoc *hit_doc) {
return Str_new(0);
}
else {
- StackString *fragment = SSTR_WRAP((String*)field_val);
DocVector *doc_vec
= Searcher_Fetch_Doc_Vec(ivars->searcher,
HitDoc_Get_Doc_ID(hit_doc));
@@ -176,22 +174,15 @@ Highlighter_Create_Excerpt_IMP(Highlighter *self, HitDoc *hit_doc) {
VA_Sort(score_spans, NULL, NULL);
HeatMap *heat_map
= HeatMap_new(score_spans, (ivars->excerpt_length * 2) / 3);
- int32_t top
- = Highlighter_Find_Best_Fragment(self, (String*)field_val,
- (ViewCharBuf*)fragment, heat_map);
- VArray *sentences
- = Highlighter_Find_Sentences(self, (String*)field_val, 0,
- top + ivars->window_width);
+ int32_t top;
String *raw_excerpt
- = Highlighter_Raw_Excerpt(self, (String*)field_val,
- (String*)fragment, &top, heat_map,
- sentences);
+ = Highlighter_Raw_Excerpt(self, (String*)field_val, &top,
+ heat_map);
String *highlighted
= Highlighter_Highlight_Excerpt(self, score_spans, raw_excerpt,
top);
- DECREF(sentences);
DECREF(heat_map);
DECREF(score_spans);
DECREF(doc_vec);
@@ -216,294 +207,259 @@ S_hottest(HeatMap *heat_map) {
return retval;
}
-int32_t
-Highlighter_Find_Best_Fragment_IMP(Highlighter *self,
- const String *field_val,
- ViewCharBuf *fragment, HeatMap *heat_map) {
- HighlighterIVARS *const ivars = Highlighter_IVARS(self);
+// Find a starting boundary after the current position given by the iterator.
+// Skip up to max_skip code points plus potential whitespace. Update the
+// iterator and return number of code points skipped. Return true if a
+// starting edge (sentence) was found.
+bool
+S_find_starting_boundary(StringIterator *top, uint32_t max_skip,
+ uint32_t *num_skipped_ptr) {
+ // Keep track of the first word boundary.
+ StringIterator *word = NULL;
+ uint32_t word_offset = 0;
+
+ // Check if we're at a starting boundary already.
+
+ StringIterator *iter = (StringIterator*)StrIter_Clone(top);
+
+ while (true) {
+ uint32_t code_point = StrIter_Prev(iter);
+
+ if (code_point == STRITER_DONE || code_point == '.') {
+ // Skip remaining whitespace.
+ *num_skipped_ptr = StrIter_Skip_Next_Whitespace(top);
+ DECREF(iter);
+ return true;
+ }
+
+ if (StrHelp_is_whitespace(code_point)) {
+ if (word == NULL) { word = (StringIterator*)StrIter_Clone(top); }
+ }
+ else {
+ break;
+ }
+ }
- // Window is 1.66 * excerpt_length, with the loc in the middle.
- int32_t best_location = S_hottest(heat_map);
+ // Try to start on a boundary.
- if (best_location < (int32_t)ivars->slop) {
- // If the beginning of the string falls within the window centered
- // around the hottest point in the field, start the fragment at the
- // beginning.
- ViewCB_Assign(fragment, (String*)field_val);
- int32_t top = ViewCB_Trim_Top(fragment);
- ViewCB_Truncate(fragment, ivars->window_width);
- return top;
+ uint32_t num_skipped = 0;
+ bool found_edge = false;
+
+ StrIter_Assign(iter, top);
+
+ for (uint32_t i = 0; i < max_skip; ++i) {
+ uint32_t code_point = StrIter_Next(iter);
+
+ if (code_point == STRITER_DONE || code_point == '.') {
+ found_edge = true;
+ StrIter_Assign(top, iter);
+ num_skipped = i + 1;
+ break;
+ }
+
+ if (word == NULL && StrHelp_is_whitespace(code_point)) {
+ word = (StringIterator*)StrIter_Clone(iter);
+ word_offset = i + 1;
+ }
}
- else {
- int32_t top = best_location - ivars->slop;
- ViewCB_Assign(fragment, (String*)field_val);
- ViewCB_Nip(fragment, top);
- top += ViewCB_Trim_Top(fragment);
- int32_t chars_left = ViewCB_Truncate(fragment, ivars->excerpt_length);
- int32_t overrun = ivars->excerpt_length - chars_left;
-
- if (!overrun) {
- // We've found an acceptable window.
- ViewCB_Assign(fragment, (String*)field_val);
- ViewCB_Nip(fragment, top);
- top += ViewCB_Trim_Top(fragment);
- ViewCB_Truncate(fragment, ivars->window_width);
- return top;
+
+ // Try to use word boundary if no sentence boundary was found.
+ if (!found_edge && word != NULL) {
+ StrIter_Assign(top, word);
+ num_skipped = word_offset;
+ }
+
+ // Skip remaining whitespace.
+ num_skipped += StrIter_Skip_Next_Whitespace(top);
+ *num_skipped_ptr = num_skipped;
+
+ DECREF(word);
+ DECREF(iter);
+ return found_edge;
+}
+
+// Find an ending boundary before the current position given by the iterator.
+// Skip up to max_skip code points plus potential whitespace. Update the
+// iterator and return number of code points skipped. Return true if an
+// ending edge (sentence) was found.
+bool
+S_find_ending_boundary(StringIterator *tail, uint32_t max_skip,
+ uint32_t *num_skipped_ptr) {
+ uint32_t code_point;
+
+ // Check if we're at an ending boundary already. Don't check for a word
+ // boundary because we need space for a trailing ellipsis.
+
+ StringIterator *iter = (StringIterator*)StrIter_Clone(tail);
+
+ do {
+ code_point = StrIter_Next(iter);
+
+ if (code_point == STRITER_DONE) {
+ // Skip remaining whitespace.
+ *num_skipped_ptr = StrIter_Skip_Prev_Whitespace(tail);
+ DECREF(iter);
+ return true;
}
- else if (overrun > top) {
- // The field is very short, so make the whole field the
- // "fragment".
- ViewCB_Assign(fragment, (String*)field_val);
- return ViewCB_Trim_Top(fragment);
+ } while (StrHelp_is_whitespace(code_point));
+
+ // Keep track of the first word boundary.
+ StringIterator *word = NULL;
+ uint32_t word_offset = 0;
+
+ StrIter_Assign(iter, tail);
+
+ for (uint32_t i = 0;
+ STRITER_DONE != (code_point = StrIter_Prev(iter));
+ ++i)
+ {
+ if (code_point == '.') {
+ StrIter_Assign(tail, iter);
+ StrIter_Advance(tail, 1); // Include period.
+ *num_skipped_ptr = i;
+ DECREF(word);
+ DECREF(iter);
+ return true;
}
- else {
- // The fragment is too close to the end, so slide it back.
- top -= overrun;
- ViewCB_Assign(fragment, (String*)field_val);
- ViewCB_Nip(fragment, top);
- top += ViewCB_Trim_Top(fragment);
- ViewCB_Truncate(fragment, ivars->excerpt_length);
- return top;
+
+ if (StrHelp_is_whitespace(code_point)) {
+ if (word == NULL) {
+ word = (StringIterator*)StrIter_Clone(iter);
+ word_offset = i + 1;
+ }
+ }
+ else if (i >= max_skip) {
+ // Break only at non-whitespace to allow another sentence
+ // boundary to be found.
+ break;
}
}
-}
-// Return true if the window represented by "offset" and "length" overlaps a
-// score span, or if there are no score spans so that no excerpt is measurably
-// superior.
-static bool
-S_has_heat(HeatMap *heat_map, int32_t offset, int32_t length) {
- VArray *spans = HeatMap_Get_Spans(heat_map);
- uint32_t num_spans = VA_Get_Size(spans);
- int32_t end = offset + length;
-
- if (length == 0) { return false; }
- if (num_spans == 0) { return true; }
-
- for (uint32_t i = 0; i < num_spans; i++) {
- Span *span = (Span*)VA_Fetch(spans, i);
- int32_t span_start = Span_Get_Offset(span);
- int32_t span_end = span_start + Span_Get_Length(span);;
- if (offset >= span_start && offset < span_end) { return true; }
- if (end > span_start && end <= span_end) { return true; }
- if (offset <= span_start && end >= span_end) { return true; }
- if (span_start > end) { break; }
+ if (word == NULL) {
+ // Make space for ellipsis.
+ *num_skipped_ptr = StrIter_Recede(tail, 1);
+ }
+ else {
+ // Use word boundary if no sentence boundary was found.
+ StrIter_Assign(tail, word);
+
+ // Strip whitespace and punctuation that collides with an ellipsis.
+ while (STRITER_DONE != (code_point = StrIter_Prev(tail))) {
+ if (!StrHelp_is_whitespace(code_point)
+ && code_point != '.'
+ && code_point != ','
+ && code_point != ';'
+ && code_point != ':'
+ && code_point != ':'
+ && code_point != '?'
+ && code_point != '!'
+ ) {
+ StrIter_Advance(tail, 1); // Back up.
+ break;
+ }
+ ++word_offset;
+ }
+
+ *num_skipped_ptr = word_offset;
}
+ DECREF(word);
+ DECREF(iter);
return false;
}
String*
Highlighter_Raw_Excerpt_IMP(Highlighter *self, const String *field_val,
- const String *fragment, int32_t *top_ptr,
- HeatMap *heat_map, VArray *sentences) {
+ int32_t *start_ptr, HeatMap *heat_map) {
HighlighterIVARS *const ivars = Highlighter_IVARS(self);
- bool found_starting_edge = false;
- bool found_ending_edge = false;
- int32_t top = *top_ptr;
- int32_t start = top;
- int32_t end = 0;
- double field_len = Str_Length(field_val);
- uint32_t min_len = field_len < ivars->excerpt_length * 0.6666
- ? (uint32_t)field_len
- : (uint32_t)(ivars->excerpt_length * 0.6666);
-
- // Try to find a starting sentence boundary.
- const uint32_t num_sentences = VA_Get_Size(sentences);
- if (num_sentences) {
- for (uint32_t i = 0; i < num_sentences; i++) {
- Span *sentence = (Span*)VA_Fetch(sentences, i);
- int32_t candidate = Span_Get_Offset(sentence);;
-
- if (candidate > top + (int32_t)ivars->window_width) {
- break;
- }
- else if (candidate >= top) {
- // Try to start on the first sentence boundary, but only if
- // there's enough relevant material left after it in the
- // fragment.
- StackString *temp = SSTR_WRAP(fragment);
- SStr_Nip(temp, candidate - top);
- uint32_t chars_left = SStr_Truncate(temp, ivars->excerpt_length);
- if (chars_left >= min_len
- && S_has_heat(heat_map, candidate, chars_left)
- ) {
- start = candidate;
- found_starting_edge = true;
- break;
- }
- }
- }
- }
- // Try to end on a sentence boundary (but don't try very hard).
- if (num_sentences) {
- StackString *start_trimmed = SSTR_WRAP(fragment);
- SStr_Nip(start_trimmed, start - top);
+ // Find start of excerpt.
- for (uint32_t i = num_sentences; i--;) {
- Span *sentence = (Span*)VA_Fetch(sentences, i);
- int32_t last_edge = Span_Get_Offset(sentence)
- + Span_Get_Length(sentence);
+ StringIterator *top = Str_Top(field_val);
- if (last_edge <= start) {
- break;
- }
- else if (last_edge - start > (int32_t)ivars->excerpt_length) {
- continue;
- }
- else {
- uint32_t chars_left = last_edge - start;
- if (chars_left > min_len
- && S_has_heat(heat_map, start, chars_left)
- ) {
- found_ending_edge = true;
- end = last_edge;
- break;
- }
- else {
- StackString *temp = SSTR_WRAP((String*)start_trimmed);
- SStr_Nip(temp, chars_left);
- SStr_Trim_Tail(temp);
- if (SStr_Get_Size(temp) == 0) {
- // Short, but ending on a boundary already.
- found_ending_edge = true;
- end = last_edge;
- break;
- }
- }
- }
- }
+ int32_t best_location = S_hottest(heat_map);
+ int32_t start;
+ uint32_t max_skip;
+
+ if (best_location <= ivars->slop) {
+ // If the beginning of the string falls within the window centered
+ // around the hottest point in the field, start the fragment at the
+ // beginning.
+ start = 0;
+ max_skip = best_location;
}
- int32_t this_excerpt_len = found_ending_edge
- ? end - start
- : (int32_t)ivars->excerpt_length;
- if (!this_excerpt_len) {
- *top_ptr = start;
- return Str_new(0);
+ else {
+ start = best_location - ivars->slop;
+ max_skip = ivars->slop;
+ StrIter_Advance(top, start);
}
- StackString *substring = SSTR_WRAP((String*)field_val);
+ uint32_t num_skipped;
+ bool found_starting_edge
+ = S_find_starting_boundary(top, max_skip, &num_skipped);
+ start += num_skipped;
+
+ // Find end of excerpt.
- if (found_starting_edge) {
- SStr_Nip(substring, start);
- SStr_Truncate(substring, this_excerpt_len);
+ StringIterator *tail = (StringIterator*)StrIter_Clone(top);
+
+ uint32_t max_len = ivars->excerpt_length;
+ if (!found_starting_edge) {
+ // Leave space for starting ellipsis and space character.
+ max_len -= 2;
}
- // If not starting on a sentence boundary, prepend an ellipsis.
- else {
- const size_t ELLIPSIS_LEN = 2; // Unicode ellipsis plus a space.
-
- // If the excerpt is already shorter than the spec'd length, we might
- // not need to make room.
- this_excerpt_len += ELLIPSIS_LEN;
-
- // Remember original position
- int32_t orig_start = start;
- int32_t orig_len = this_excerpt_len;
-
- // Move the start back one in case the character right before the
- // excerpt starts is whitespace.
- if (start) {
- this_excerpt_len += 1;
- start -= 1;
- SStr_Nip(substring, start);
- }
- do {
- uint32_t code_point = SStr_Nibble(substring);
- start++;
- this_excerpt_len--;
+ bool found_ending_edge = true;
+ uint32_t excerpt_len = StrIter_Advance(tail, max_len);
- if (StrHelp_is_whitespace(code_point)) {
- if (!found_ending_edge) {
- // If we still need room, we'll lop it off the end since
- // we don't know a solid end point yet.
- break;
- }
- else if (this_excerpt_len <= (int32_t)ivars->excerpt_length) {
- break;
- }
- }
- } while (SStr_Get_Size(substring));
-
- if (SStr_Get_Size(substring) == 0) {
- // Word is longer than excerpt_length. Reset to original position
- // truncating the word.
- SStr_Assign(substring, (String*)field_val);
- start = orig_start;
- this_excerpt_len = orig_len;
- int32_t diff = this_excerpt_len - ivars->excerpt_length;
- if (diff > 0) {
- SStr_Nip(substring, diff);
- start += diff;
- this_excerpt_len -= diff;
- }
+ // Skip up to slop code points but keep at least max_len - slop.
+ if (excerpt_len > max_len - ivars->slop) {
+ max_skip = excerpt_len - (max_len - ivars->slop);
+ found_ending_edge
+ = S_find_ending_boundary(tail, max_skip, &num_skipped);
+ if (num_skipped >= excerpt_len) {
+ excerpt_len = 0;
+ }
+ else {
+ excerpt_len -= num_skipped;
}
-
- SStr_Truncate(substring, ivars->excerpt_length - ELLIPSIS_LEN);
}
- // If excerpt doesn't end on a sentence boundary, tack on an ellipsis.
- if (found_ending_edge) {
- SStr_Truncate(substring, end - start);
- SStr_Trim_Tail(substring);
+ // Extract excerpt.
+
+ String *raw_excerpt;
+
+ if (!excerpt_len) {
+ raw_excerpt = Str_new(0);
}
else {
- // Remember original excerpt
- StackString *orig_substring = SSTR_WRAP((String*)substring);
- // Check for prepended ellipsis
- uint32_t min_size = found_starting_edge ? 0 : 4;
-
- do {
- uint32_t code_point = SStr_Code_Point_From(substring, 1);
- SStr_Chop(substring, 1);
- if (StrHelp_is_whitespace(code_point)) {
- SStr_Trim_Tail(substring);
-
- // Strip punctuation that collides with an ellipsis.
- code_point = SStr_Code_Point_From(substring, 1);
- while (code_point == '.'
- || code_point == ','
- || code_point == ';'
- || code_point == ':'
- || code_point == ':'
- || code_point == '?'
- || code_point == '!'
- ) {
- SStr_Chop(substring, 1);
- code_point = SStr_Code_Point_From(substring, 1);
- }
+ String *substring = StrIter_substring(top, tail);
+ CharBuf *buf = CB_new(Str_Get_Size(substring) + 8);
+
+ // If not starting on a sentence boundary, prepend an ellipsis.
+ if (!found_starting_edge) {
+ CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
+ CB_Cat_Char(buf, ' ');
+ start -= 2;
+ }
- break;
- }
- } while (SStr_Get_Size(substring) > min_size);
+ CB_Cat(buf, substring);
- if (SStr_Get_Size(substring) == min_size) {
- // Word is longer than excerpt_length. Reset to original excerpt
- // truncating the word.
- SStr_Assign(substring, (String*)orig_substring);
- SStr_Chop(substring, 1);
+ // If not ending on a sentence boundary, append an ellipsis.
+ if (!found_ending_edge) {
+ CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
}
- }
- CharBuf *buf = CB_new(SStr_Get_Size(substring) + 8);
+ raw_excerpt = CB_Yield_String(buf);
- if (!found_starting_edge) {
- CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
- CB_Cat_Char(buf, ' ');
- const size_t ELLIPSIS_LEN = 2; // Unicode ellipsis plus a space.
- start -= ELLIPSIS_LEN;
+ DECREF(buf);
+ DECREF(substring);
}
- CB_Cat(buf, (String*)substring);
-
- if (!found_ending_edge) {
- CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
- }
+ *start_ptr = start;
- String *raw_excerpt = CB_Yield_String(buf);
- DECREF(buf);
- *top_ptr = start;
+ DECREF(top);
+ DECREF(tail);
return raw_excerpt;
}
@@ -591,105 +547,6 @@ Highlighter_Highlight_Excerpt_IMP(Highlighter *self, VArray *spans,
return highlighted;
}
-static Span*
-S_start_sentence(int32_t pos) {
- return Span_new(pos, 0, 0.0);
-}
-
-static void
-S_close_sentence(VArray *sentences, Span **sentence_ptr,
- int32_t sentence_end) {
- Span *sentence = *sentence_ptr;
- int32_t length = sentence_end - Span_Get_Offset(sentence);
- const int32_t MIN_SENTENCE_LENGTH = 3; // e.g. "OK.", but not "2."
- if (length >= MIN_SENTENCE_LENGTH) {
- Span_Set_Length(sentence, length);
- VA_Push(sentences, (Obj*)sentence);
- *sentence_ptr = NULL;
- }
-}
-
-VArray*
-Highlighter_Find_Sentences_IMP(Highlighter *self, String *text,
- int32_t offset, int32_t length) {
- /* When [sentence] is NULL, that means a sentence start has not yet been
- * found. When it is a Span object, we have a start, but we haven't found
- * an end. Once we find the end, we add the sentence to the [sentences]
- * array and set [sentence] back to NULL to indicate that we're looking
- * for a start once more.
- */
- Span *sentence = NULL;
- VArray *sentences = VA_new(10);
- int32_t stop = length == 0
- ? INT32_MAX
- : offset + length;
- StackString *fragment = SSTR_WRAP(text);
- int32_t pos = SStr_Trim_Top(fragment);
- UNUSED_VAR(self);
-
- /* Our first task will be to find a sentence that either starts at the top
- * of the fragment, or overlaps its start. Starting at the top of the
- * field is a special case: we define the first non-whitespace character
- * to begin a sentence, rather than look for the first character following
- * a period and whitespace. Everywhere else, we have to define sentence
- * starts based on a sentence end that has just passed by.
- */
- if (offset <= pos) {
- // Assume that first non-whitespace character begins a sentence.
- if (pos < stop && SStr_Get_Size(fragment) > 0) {
- sentence = S_start_sentence(pos);
- }
- }
- else {
- SStr_Nip(fragment, offset - pos);
- pos = offset;
- }
-
- while (1) {
- uint32_t code_point = SStr_Code_Point_At(fragment, 0);
- if (!code_point) {
- // End of fragment. If we have a sentence open, close it,
- // then bail.
- if (sentence) { S_close_sentence(sentences, &sentence, pos); }
- break;
- }
- else if (code_point == '.') {
- uint32_t whitespace_count;
- pos += SStr_Nip(fragment, 1); // advance past "."
-
- if (pos == stop && SStr_Get_Size(fragment) == 0) {
- // Period ending the field string.
- if (sentence) { S_close_sentence(sentences, &sentence, pos); }
- break;
- }
- else if (0 != (whitespace_count = SStr_Trim_Top(fragment))) {
- // We've found a period followed by whitespace. Close out the
- // existing sentence, if there is one. */
- if (sentence) { S_close_sentence(sentences, &sentence, pos); }
-
- // Advance past whitespace.
- pos += whitespace_count;
- if (pos < stop && SStr_Get_Size(fragment) > 0) {
- // Not at the end of the string? Then we've found a
- // sentence start.
- sentence = S_start_sentence(pos);
- }
- }
-
- // We may not have reached the end of the field yet, but it's
- // entirely possible that our last sentence overlapped the end of
- // the fragment -- in which case, it's time to bail.
- if (pos >= stop) { break; }
- }
- else {
- SStr_Nip(fragment, 1);
- pos++;
- }
- }
-
- return sentences;
-}
-
String*
Highlighter_Encode_IMP(Highlighter *self, String *text) {
UNUSED_VAR(self);
http://git-wip-us.apache.org/repos/asf/lucy/blob/1f51cae0/core/Lucy/Highlight/Highlighter.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Highlight/Highlighter.cfh b/core/Lucy/Highlight/Highlighter.cfh
index 79742ad..afc9a8e 100644
--- a/core/Lucy/Highlight/Highlighter.cfh
+++ b/core/Lucy/Highlight/Highlighter.cfh
@@ -29,7 +29,6 @@ public class Lucy::Highlight::Highlighter inherits Clownfish::Obj {
Query *query;
String *field;
uint32_t excerpt_length;
- uint32_t window_width;
uint32_t slop;
String *pre_tag;
String *post_tag;
@@ -67,23 +66,6 @@ public class Lucy::Highlight::Highlighter inherits Clownfish::Obj {
public incremented String*
Encode(Highlighter *self, String *text);
- /** Find sentence boundaries within the specified range, returning them as
- * an array of Spans. The "offset" of each Span indicates the start of
- * the sentence, and is measured from 0, not from <code>offset</code>.
- * The Span's "length" member indicates the sentence length in code
- * points.
- *
- * @param text The string to scan.
- * @param offset The place to start looking for offsets, measured in
- * Unicode code points from the top of <code>text</code>.
- * @param length The number of code points from <code>offset</code> to
- * scan. The default value of 0 is a sentinel which indicates to scan
- * until the end of the string.
- */
- incremented VArray*
- Find_Sentences(Highlighter *self, String *text, int32_t offset = 0,
- int32_t length = 0);
-
/** Highlight a small section of text. By default, prepends pre-tag and
* appends post-tag. This method is called internally by Create_Excerpt()
* when assembling an excerpt.
@@ -138,25 +120,15 @@ public class Lucy::Highlight::Highlighter inherits Clownfish::Obj {
Get_Compiler(Highlighter *self);
/** Decide based on heat map the best fragment of field to concentrate on.
- * Place the result into <code>fragment<code> and return its offset in
- * code points from the top of the field.
- *
- * (Helper function for Create_Excerpt only exposed for testing purposes.)
- */
- int32_t
- Find_Best_Fragment(Highlighter *self, const String *field_val,
- ViewCharBuf *fragment, HeatMap *heat_map);
-
- /** Take the fragment and determine the best edges for it based on
+ * Take the fragment and determine the best edges for it based on
* sentence boundaries when possible. Add ellipses when boundaries cannot
* be found.
*
* (Helper function for Create_Excerpt only exposed for testing purposes.)
*/
String*
- Raw_Excerpt(Highlighter *self, const String *field_val,
- const String *fragment, int32_t *top, HeatMap *heat_map,
- VArray *sentences);
+ Raw_Excerpt(Highlighter *self, const String *field_value, int32_t *top,
+ HeatMap *heat_map);
/** Take the text in raw_excerpt, add highlight tags, encode, and place
* the result into <code>highlighted</code>.
http://git-wip-us.apache.org/repos/asf/lucy/blob/1f51cae0/core/Lucy/Test/Highlight/TestHighlighter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Highlight/TestHighlighter.c b/core/Lucy/Test/Highlight/TestHighlighter.c
index 111d655..f8fabc6 100644
--- a/core/Lucy/Test/Highlight/TestHighlighter.c
+++ b/core/Lucy/Test/Highlight/TestHighlighter.c
@@ -58,69 +58,6 @@ TestHighlighter_new() {
}
static void
-test_Find_Best_Fragment(TestBatchRunner *runner, Searcher *searcher, Obj *query) {
- String *content = (String*)SSTR_WRAP_STR("content", 7);
- Highlighter *highlighter = Highlighter_new(searcher, query, content, 3);
- ViewCharBuf *target = (ViewCharBuf*)SStr_BLANK();
-
- VArray *spans = VA_new(1);
- VA_Push(spans, (Obj*)Span_new(2, 1, 1.0f));
- HeatMap *heat_map = HeatMap_new(spans, 133);
- DECREF(spans);
- String *field_val = (String *)SSTR_WRAP_STR("a " PHI " " PHI " b c", 11);
- int32_t top = Highlighter_Find_Best_Fragment(highlighter, field_val,
- target, heat_map);
- TEST_TRUE(runner,
- Str_Equals_Str((String *)target, PHI " " PHI " b", 7),
- "Find_Best_Fragment");
- TEST_TRUE(runner,
- top == 2,
- "correct offset returned by Find_Best_Fragment");
- field_val = (String *)SSTR_WRAP_STR("aa" PHI, 4);
- top = Highlighter_Find_Best_Fragment(highlighter, field_val,
- target, heat_map);
- TEST_TRUE(runner,
- Str_Equals_Str((String *)target, "aa" PHI, 4),
- "Find_Best_Fragment returns whole field when field is short");
- TEST_TRUE(runner,
- top == 0,
- "correct offset");
- DECREF(heat_map);
-
- spans = VA_new(1);
- VA_Push(spans, (Obj*)Span_new(6, 2, 1.0f));
- heat_map = HeatMap_new(spans, 133);
- DECREF(spans);
- field_val = (String *)SSTR_WRAP_STR("aaaab" PHI PHI, 9);
- top = Highlighter_Find_Best_Fragment(highlighter, field_val,
- target, heat_map);
- TEST_TRUE(runner,
- Str_Equals_Str((String *)target, "b" PHI PHI, 5),
- "Find_Best_Fragment shifts left to deal with overrun");
- TEST_TRUE(runner,
- top == 4,
- "correct offset");
- DECREF(heat_map);
-
- spans = VA_new(1);
- VA_Push(spans, (Obj*)Span_new(0, 1, 1.0f));
- heat_map = HeatMap_new(spans, 133);
- DECREF(spans);
- field_val = (String *)SSTR_WRAP_STR("a" PHI "bcde", 7);
- top = Highlighter_Find_Best_Fragment(highlighter, field_val,
- target, heat_map);
- TEST_TRUE(runner,
- Str_Equals_Str((String *)target, "a" PHI "bcd", 6),
- "Find_Best_Fragment start at field beginning");
- TEST_TRUE(runner,
- top == 0,
- "correct offset");
- DECREF(heat_map);
-
- DECREF(highlighter);
-}
-
-static void
test_Raw_Excerpt(TestBatchRunner *runner, Searcher *searcher, Obj *query) {
String *content = (String*)SSTR_WRAP_STR("content", 7);
Highlighter *highlighter = Highlighter_new(searcher, query, content, 6);
@@ -128,119 +65,96 @@ test_Raw_Excerpt(TestBatchRunner *runner, Searcher *searcher, Obj *query) {
String *raw_excerpt;
String *field_val = (String *)SSTR_WRAP_STR("Ook. Urk. Ick. ", 18);
- String *fragment = (String *)SSTR_WRAP_STR("Ook. Urk.", 10);
VArray *spans = VA_new(1);
VA_Push(spans, (Obj*)Span_new(0, 18, 1.0f));
HeatMap *heat_map = HeatMap_new(spans, 133);
DECREF(spans);
- VArray *sentences = VA_new(2);
- VA_Push(sentences, (Obj*)Span_new(0, 4, 0.0f));
- VA_Push(sentences, (Obj*)Span_new(6, 4, 0.0f));
- top = 0;
- raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, fragment,
- &top, heat_map, sentences);
+ raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+ heat_map);
TEST_TRUE(runner,
Str_Equals_Str(raw_excerpt, "Ook.", 4),
- "Raw_Excerpt at top");
+ "Raw_Excerpt at top %s", Str_Get_Ptr8(raw_excerpt));
TEST_TRUE(runner,
top == 0,
- "top still 0");
- DECREF(sentences);
+ "top is 0");
DECREF(raw_excerpt);
+ DECREF(heat_map);
- fragment = (String *)SSTR_WRAP_STR(". Urk. I", 10);
- sentences = VA_new(2);
- VA_Push(sentences, (Obj*)Span_new(6, 4, 0.0f));
- VA_Push(sentences, (Obj*)Span_new(12, 4, 0.0f));
- top = 3;
- raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, fragment,
- &top, heat_map, sentences);
+ spans = VA_new(1);
+ VA_Push(spans, (Obj*)Span_new(6, 12, 1.0f));
+ heat_map = HeatMap_new(spans, 133);
+ DECREF(spans);
+ raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+ heat_map);
TEST_TRUE(runner,
Str_Equals_Str(raw_excerpt, "Urk.", 4),
"Raw_Excerpt in middle, with 2 bounds");
TEST_TRUE(runner,
top == 6,
"top in the middle modified by Raw_Excerpt");
- DECREF(sentences);
- DECREF(heat_map);
DECREF(raw_excerpt);
+ DECREF(heat_map);
- field_val = (String *)SSTR_WRAP_STR("Ook urk ick i.", 14);
- fragment = (String *)SSTR_WRAP_STR("ick i.", 6);
- spans = VA_new(1);
- VA_Push(spans, (Obj*)Span_new(0, 14, 1.0f));
+ field_val = (String *)SSTR_WRAP_STR("Ook urk ick i.", 14);
+ spans = VA_new(1);
+ VA_Push(spans, (Obj*)Span_new(12, 1, 1.0f));
heat_map = HeatMap_new(spans, 133);
DECREF(spans);
- sentences = VA_new(1);
- VA_Push(sentences, (Obj*)Span_new(0, 14, 0.0f));
- top = 8;
- raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, fragment,
- &top, heat_map, sentences);
+ raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+ heat_map);
TEST_TRUE(runner,
Str_Equals_Str(raw_excerpt, ELLIPSIS " i.", 6),
"Ellipsis at top");
TEST_TRUE(runner,
top == 10,
"top correct when leading ellipsis inserted");
- DECREF(sentences);
DECREF(heat_map);
DECREF(raw_excerpt);
- field_val = (String *)SSTR_WRAP_STR("Urk. Iz no good.", 17);
- fragment = (String *)SSTR_WRAP_STR(" Iz no go", 10);
- spans = VA_new(1);
- VA_Push(spans, (Obj*)Span_new(0, 17, 1.0f));
+ field_val = (String *)SSTR_WRAP_STR("Urk. Iz no good.", 17);
+ spans = VA_new(1);
+ VA_Push(spans, (Obj*)Span_new(6, 2, 1.0f));
heat_map = HeatMap_new(spans, 133);
DECREF(spans);
- sentences = VA_new(1);
- VA_Push(sentences, (Obj*)Span_new(6, 11, 0.0f));
- top = 4;
- raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, fragment,
- &top, heat_map, sentences);
+ raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+ heat_map);
TEST_TRUE(runner,
Str_Equals_Str(raw_excerpt, "Iz no" ELLIPSIS, 8),
"Ellipsis at end");
TEST_TRUE(runner,
top == 6,
"top trimmed");
- DECREF(sentences);
DECREF(heat_map);
DECREF(raw_excerpt);
// Words longer than excerpt len
- field_val = (String *)SSTR_WRAP_STR("abc/def/ghi/jkl/mno", 19);
- sentences = VA_new(1);
- VA_Push(sentences, (Obj*)Span_new(0, 19, 0.0f));
+ field_val = (String *)SSTR_WRAP_STR("abc/def/ghi/jkl/mno", 19);
- spans = VA_new(1);
+ spans = VA_new(1);
VA_Push(spans, (Obj*)Span_new(0, 3, 1.0f));
heat_map = HeatMap_new(spans, 133);
DECREF(spans);
- top = 0;
- raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, field_val,
- &top, heat_map, sentences);
+ raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+ heat_map);
TEST_TRUE(runner,
Str_Equals_Str(raw_excerpt, "abc/d" ELLIPSIS, 8),
- "Long word");
+ "Long word at top %s");
DECREF(heat_map);
DECREF(raw_excerpt);
- spans = VA_new(1);
+ spans = VA_new(1);
VA_Push(spans, (Obj*)Span_new(8, 3, 1.0f));
heat_map = HeatMap_new(spans, 133);
DECREF(spans);
- top = 0;
- raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, field_val,
- &top, heat_map, sentences);
+ raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+ heat_map);
TEST_TRUE(runner,
- Str_Equals_Str(raw_excerpt, ELLIPSIS " c/d" ELLIPSIS, 10),
- "Long word");
+ Str_Equals_Str(raw_excerpt, ELLIPSIS " f/g" ELLIPSIS, 10),
+ "Long word in middle");
DECREF(heat_map);
DECREF(raw_excerpt);
- DECREF(sentences);
-
DECREF(highlighter);
}
@@ -407,58 +321,6 @@ test_Create_Excerpt(TestBatchRunner *runner, Searcher *searcher, Obj *query,
}
static void
-test_Find_Sentences(TestBatchRunner *runner, Searcher *searcher, Obj *query) {
- String *content = (String*)SSTR_WRAP_STR("content", 7);
- Highlighter *highlighter = Highlighter_new(searcher, query, content, 200);
- String *text = (String*)SSTR_WRAP_STR(
- "This is a sentence. This is a sentence. This is a sentence. "
- "This is a sentence. This is a sentence. This is a sentence. "
- "This is a sentence. This is a sentence. This is a sentence. "
- "This is a sentence. This is a sentence. This is a sentence. "
- "This is a sentence. This is a sentence. This is a sentence. ",
- 300);
-
- VArray *got = Highlighter_Find_Sentences(highlighter, text, 101, 50);
- VArray *wanted = VA_new(2);
- VA_Push(wanted, (Obj*)Span_new(120, 19, 0.0f));
- VA_Push(wanted, (Obj*)Span_new(140, 19, 0.0f));
- TEST_TRUE(runner,
- VA_Equals(got, (Obj*)wanted),
- "find_sentences with explicit args");
- DECREF(wanted);
- DECREF(got);
-
- got = Highlighter_Find_Sentences(highlighter, text, 101, 4);
- TEST_TRUE(runner,
- VA_Get_Size(got) == 0,
- "find_sentences with explicit args, finding nothing");
- DECREF(got);
-
- got = Highlighter_Find_Sentences(highlighter, text, 0, 0);
- wanted = VA_new(15);
- for (int i = 0; i < 15; ++i) {
- VA_Push(wanted, (Obj*)Span_new(i * 20, 19, 0.0f));
- }
- TEST_TRUE(runner,
- VA_Equals(got, (Obj*)wanted),
- "find_sentences with default offset and length");
- DECREF(wanted);
- DECREF(got);
-
- text = (String*)SSTR_WRAP_STR(" Foo", 4);
- got = Highlighter_Find_Sentences(highlighter, text, 0, 0);
- wanted = VA_new(1);
- VA_Push(wanted, (Obj*)Span_new(1, 3, 0.0f));
- TEST_TRUE(runner,
- VA_Equals(got, (Obj*)wanted),
- "Skip leading whitespace but get first sentence");
- DECREF(wanted);
- DECREF(got);
-
- DECREF(highlighter);
-}
-
-static void
test_highlighting(TestBatchRunner *runner) {
Schema *schema = Schema_new();
StandardTokenizer *tokenizer = StandardTokenizer_new();
@@ -507,11 +369,9 @@ test_highlighting(TestBatchRunner *runner) {
Obj *query = (Obj*)SSTR_WRAP_STR("\"x y z\" AND " PHI, 14);
Hits *hits = Searcher_Hits(searcher, query, 0, 10, NULL);
- test_Find_Best_Fragment(runner, searcher, query);
test_Raw_Excerpt(runner, searcher, query);
test_Highlight_Excerpt(runner, searcher, query);
test_Create_Excerpt(runner, searcher, query, hits);
- test_Find_Sentences(runner, searcher, query);
DECREF(hits);
DECREF(searcher);
@@ -578,7 +438,7 @@ test_hl_selection(TestBatchRunner *runner) {
void
TestHighlighter_Run_IMP(TestHighlighter *self, TestBatchRunner *runner) {
- TestBatchRunner_Plan(runner, (TestBatch*)self, 35);
+ TestBatchRunner_Plan(runner, (TestBatch*)self, 23);
test_highlighting(runner);
test_hl_selection(runner);
}