You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ko...@apache.org on 2012/06/12 15:59:38 UTC
svn commit: r1349361 - in /lucene/dev/trunk/lucene: ./
highlighter/src/java/org/apache/lucene/search/vectorhighlight/
highlighter/src/test/org/apache/lucene/search/vectorhighlight/
Author: koji
Date: Tue Jun 12 13:59:37 2012
New Revision: 1349361
URL: http://svn.apache.org/viewvc?rev=1349361&view=rev
Log:
LUCENE-4133: FVH: A weighted approach for ordered fragments, part of LUCENE-3440
Added:
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1349361&r1=1349360&r2=1349361&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Jun 12 13:59:37 2012
@@ -904,6 +904,9 @@ New features
cause a ParseException (depending on whether strict parsing is enabled).
(Luca Cavanna via Chris Male)
+* LUCENE-3440: Add ordered fragments feature with IDF-weighted terms for FVH.
+ (Sebastian Lutze via Koji Sekiguchi)
+
Optimizations
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java?rev=1349361&r1=1349360&r2=1349361&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java Tue Jun 12 13:59:37 2012
@@ -150,7 +150,7 @@ public class FieldPhraseList {
}
/**
- * @return the termInfos
+ * @return the termInfos
*/
public List<TermInfo> getTermsInfos() {
return termsInfos;
@@ -164,7 +164,7 @@ public class FieldPhraseList {
this.boost = boost;
this.seqnum = seqnum;
- // now we keep TermInfos for further operations
+ // We keep TermInfos for further operations
termsInfos = new ArrayList<TermInfo>( terms );
termsOffsets = new ArrayList<Toffs>( terms.size() );
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java?rev=1349361&r1=1349360&r2=1349361&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java Tue Jun 12 13:59:37 2012
@@ -42,12 +42,13 @@ public class SimpleFieldFragList extends
*/
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
- float score = 0;
+ float totalBoost = 0;
List<SubInfo> subInfos = new ArrayList<SubInfo>();
for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
- score += phraseInfo.getBoost();
+ totalBoost += phraseInfo.getBoost();
}
- getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, score ) );
+ getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
}
+
}
Added: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java?rev=1349361&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java Tue Jun 12 13:59:37 2012
@@ -0,0 +1,76 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+
+/**
+ * A weighted implementation of {@link FieldFragList}: a fragment's total boost is the sum of term weight times phrase boost over its distinct terms, scaled by a term-count normalization.
+ */
+public class WeightedFieldFragList extends FieldFragList {
+
+ /**
+ * Creates a weighted fragment list; {@code fragCharSize} is passed on to the superclass constructor.
+ *
+ * @param fragCharSize the length (number of chars) of a fragment
+ */
+ public WeightedFieldFragList( int fragCharSize ) {
+ super( fragCharSize );
+ }
+
+ /* (non-Javadoc) Adds one fragment whose totalBoost is derived from the weights of its distinct terms.
+ * @see org.apache.lucene.search.vectorhighlight.FieldFragList#add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList )
+ */
+ @Override
+ public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
+
+ float totalBoost = 0;
+
+ List<SubInfo> subInfos = new ArrayList<SubInfo>();
+
+ HashSet<String> distinctTerms = new HashSet<String>();
+
+ int length = 0;
+
+ for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
+
+ subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
+ // Only the first occurrence of a term text adds to totalBoost; length++ is outside the brace-less if and counts every term.
+ for ( TermInfo ti : phraseInfo.getTermsInfos()) {
+ if ( distinctTerms.add( ti.getText() ) )
+ totalBoost += ti.getWeight() * phraseInfo.getBoost();
+ length++;
+ }
+ }
+ // NOTE(review): if no terms were counted, length is 0 and the factor below is 0 * Infinity = NaN - confirm callers never pass such input.
+ // The number of terms per fragment (length) is factored into the weight. Otherwise a one-word query
+ // would give every fragment the same weight, regardless of how many terms it contains.
+ // To keep fragments with a high number of terms from "outranking" more relevant fragments,
+ // the length is dampened: length * ( 1 / sqrt( length ) ), which equals sqrt( length ).
+ totalBoost *= length * ( 1 / Math.sqrt( length ) );
+
+ getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
+ }
+
+}
\ No newline at end of file
Added: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java?rev=1349361&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java Tue Jun 12 13:59:37 2012
@@ -0,0 +1,41 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A weighted implementation of {@link FragListBuilder}; fragments are collected into a {@link WeightedFieldFragList}.
+ */
+public class WeightedFragListBuilder extends BaseFragListBuilder {
+ /** Creates a builder by delegating to the no-argument {@link BaseFragListBuilder} constructor. */
+ public WeightedFragListBuilder() {
+ super();
+ }
+ /** Creates a builder, passing {@code margin} through to the {@link BaseFragListBuilder} constructor. */
+ public WeightedFragListBuilder(int margin) {
+ super(margin);
+ }
+
+ /* (non-Javadoc) Delegates to the three-argument overload, supplying a new WeightedFieldFragList as the list to fill.
+ * @see org.apache.lucene.search.vectorhighlight.FragListBuilder#createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize)
+ */
+ @Override
+ public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
+ return createFieldFragList( fieldPhraseList, new WeightedFieldFragList( fragCharSize ), fragCharSize );
+ }
+
+}
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html?rev=1349361&r1=1349360&r2=1349361&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html Tue Jun 12 13:59:37 2012
@@ -27,9 +27,9 @@ This is an another highlighter implement
<li>support multi-term (includes wildcard, range, regexp, etc) queries</li>
<li>need Java 1.5</li>
<li>highlight fields need to be stored with Positions and Offsets</li>
-<li>take into account query boost to score fragments</li>
+<li>take into account query boost and/or IDF-weight to score fragments</li>
<li>support colored highlight tags</li>
-<li>pluggable FragListBuilder</li>
+<li>pluggable FragListBuilder / FieldFragList</li>
<li>pluggable FragmentsBuilder</li>
</ul>
@@ -122,9 +122,8 @@ by reference to <code>QueryPhraseMap</co
+----------------+-----------------+---+
</pre>
<p>The type of each entry is <code>WeightedPhraseInfo</code> that consists of
-an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to
-calculate the weight) will be taken into account when Fast Vector Highlighter creates
-{@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.</p>
+an array of terms offsets and weight.
+</p>
<h3>Step 4.</h3>
<p>In Step 4, Fast Vector Highlighter creates <code>FieldFragList</code> by reference to
<code>FieldPhraseList</code>. In this sample case, the following
@@ -137,6 +136,59 @@ calculate the weight) will be taken into
|totalBoost=3 |
+---------------------------------+
</pre>
+
+<p>
+The calculation for each <code>FieldFragList.WeightedFragInfo.totalBoost</code> (weight)
+depends on the implementation of <code>FieldFragList.add( ... )</code>:
+<pre class="prettyprint">
+ public void add( int startOffset, int endOffset, List&lt;WeightedPhraseInfo&gt; phraseInfoList ) {
+ float totalBoost = 0;
+ List&lt;SubInfo&gt; subInfos = new ArrayList&lt;SubInfo&gt;();
+ for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
+ subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
+ totalBoost += phraseInfo.getBoost();
+ }
+ getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
+ }
+
+</pre>
+The <code>FieldFragList</code> implementation that is used is specified in <code>BaseFragListBuilder.createFieldFragList( ... )</code>:
+<pre class="prettyprint">
+ public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
+ return createFieldFragList( fieldPhraseList, new SimpleFieldFragList( fragCharSize ), fragCharSize );
+ }
+</pre>
+<p>
+Currently there are basically two approaches available:
+</p>
+<ul>
+<li><code>SimpleFragListBuilder using SimpleFieldFragList</code>: <i>sum-of-boosts</i>-approach. The totalBoost is calculated by summing the query-boosts per term. By default a term is boosted by 1.0</li>
+<li><code>WeightedFragListBuilder using WeightedFieldFragList</code>: <i>sum-of-distinct-weights</i>-approach. The totalBoost is calculated by summing the IDF-weights of distinct terms.</li>
+</ul>
+<p>Comparison of the two approaches:</p>
+<table border="1">
+<caption>
+ query = das alte testament (The Old Testament)
+</caption>
+<tr><th>Terms in fragment</th><th>sum-of-distinct-weights</th><th>sum-of-boosts</th></tr>
+<tr><td>das alte testament</td><td>5.339621</td><td>3.0</td></tr>
+<tr><td>das alte testament</td><td>5.339621</td><td>3.0</td></tr>
+<tr><td>das testament alte</td><td>5.339621</td><td>3.0</td></tr>
+<tr><td>das alte testament</td><td>5.339621</td><td>3.0</td></tr>
+<tr><td>das testament</td><td>2.9455688</td><td>2.0</td></tr>
+<tr><td>das alte</td><td>2.4759595</td><td>2.0</td></tr>
+<tr><td>das das das das</td><td>1.5015357</td><td>4.0</td></tr>
+<tr><td>das das das</td><td>1.3003681</td><td>3.0</td></tr>
+<tr><td>das das</td><td>1.061746</td><td>2.0</td></tr>
+<tr><td>alte</td><td>1.0</td><td>1.0</td></tr>
+<tr><td>alte</td><td>1.0</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+</table>
+
<h3>Step 5.</h3>
<p>In Step 5, by using <code>FieldFragList</code> and the field stored data,
Fast Vector Highlighter creates highlighted snippets!</p>
Added: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java?rev=1349361&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java Tue Jun 12 13:59:37 2012
@@ -0,0 +1,35 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class WeightedFragListBuilderTest extends AbstractTestCase {
+ /** Builds a frag list with {@link WeightedFragListBuilder} (fragCharSize 100) and checks the single expected fragment. */
+ public void test2WeightedFragList() throws Exception {
+ // makeIndexLongMV(), pqF(), reader and F are defined outside this class - presumably in AbstractTestCase; verify there.
+ makeIndexLongMV();
+ // Pipeline under test: FieldQuery -> FieldTermStack -> FieldPhraseList -> FieldFragList.
+ FieldQuery fq = new FieldQuery( pqF( "the", "both" ), true, true );
+ FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
+ FieldPhraseList fpl = new FieldPhraseList( stack, fq );
+ WeightedFragListBuilder wflb = new WeightedFragListBuilder();
+ FieldFragList ffl = wflb.createFieldFragList( fpl, 100 );
+ assertEquals( 1, ffl.getFragInfos().size() );
+ assertEquals( "subInfos=(theboth((195,203)))/0.86791086(189,289)", ffl.getFragInfos().get( 0 ).toString() );
+ }
+
+}