You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2009/07/09 15:06:52 UTC
svn commit: r792542 [1/3] - in /lucene/java/trunk: ./ contrib/
contrib/fast-vector-highlighter/ contrib/fast-vector-highlighter/src/
contrib/fast-vector-highlighter/src/java/
contrib/fast-vector-highlighter/src/java/org/
contrib/fast-vector-highlighter...
Author: mikemccand
Date: Thu Jul 9 13:06:51 2009
New Revision: 792542
URL: http://svn.apache.org/viewvc?rev=792542&view=rev
Log:
LUCENE-1522: adding new Fast Vector Highlighter contrib
Added:
lucene/java/trunk/contrib/fast-vector-highlighter/
lucene/java/trunk/contrib/fast-vector-highlighter/build.xml (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java (with props)
lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java (with props)
Modified:
lucene/java/trunk/build.xml
lucene/java/trunk/contrib/CHANGES.txt
lucene/java/trunk/docs/benchmarks.html
lucene/java/trunk/docs/broken-links.xml
lucene/java/trunk/docs/contributions.html
lucene/java/trunk/docs/demo.html
lucene/java/trunk/docs/demo2.html
lucene/java/trunk/docs/demo3.html
lucene/java/trunk/docs/demo4.html
lucene/java/trunk/docs/fileformats.html
lucene/java/trunk/docs/gettingstarted.html
lucene/java/trunk/docs/index.html
lucene/java/trunk/docs/linkmap.html
lucene/java/trunk/docs/linkmap.pdf
lucene/java/trunk/docs/lucene-sandbox/index.html
lucene/java/trunk/docs/lucene-sandbox/index.pdf
lucene/java/trunk/docs/queryparsersyntax.html
lucene/java/trunk/docs/scoring.html
lucene/java/trunk/src/site/src/documentation/content/xdocs/lucene-sandbox/index.xml
lucene/java/trunk/src/site/src/documentation/content/xdocs/site.xml
Modified: lucene/java/trunk/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/build.xml?rev=792542&r1=792541&r2=792542&view=diff
==============================================================================
--- lucene/java/trunk/build.xml (original)
+++ lucene/java/trunk/build.xml Thu Jul 9 13:06:51 2009
@@ -312,6 +312,7 @@
<packageset dir="contrib/collation/src/java"/>
<packageset dir="contrib/db/bdb-je/src/java"/>
<packageset dir="contrib/db/bdb/src/java"/>
+ <packageset dir="contrib/fast-vector-highlighter/src/java"/>
<packageset dir="contrib/highlighter/src/java"/>
<packageset dir="contrib/instantiated/src/java"/>
<packageset dir="contrib/lucli/src/java"/>
@@ -343,6 +344,7 @@
<group title="contrib: Benchmark" packages="org.apache.lucene.benchmark*"/>
<group title="contrib: Collation" packages="org.apache.lucene.collation*"/>
<group title="contrib: DB" packages="org.apache.lucene.store.db*:org.apache.lucene.store.je*:com.sleepycat*"/>
+ <group title="contrib: Fast Vector Highlighter" packages="org.apache.lucene.search.vectorhighlight*"/>
<group title="contrib: Highlighter" packages="org.apache.lucene.search.highlight*"/>
<group title="contrib: Instantiated" packages="org.apache.lucene.store.instantiated*"/>
<group title="contrib: Lucli" packages="lucli*"/>
Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=792542&r1=792541&r2=792542&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Thu Jul 9 13:06:51 2009
@@ -65,6 +65,9 @@
7. LUCENE-1704: Allow specifying the Tidy configuration file when
parsing HTML docs with contrib/ant. (Keith Sprochi via Mike
McCandless)
+
+ 8. LUCENE-1522: Added contrib/fast-vector-highlighter, a new alternative
+ highlighter. (Koji Sekiguchi via Mike McCandless)
Optimizations
Added: lucene/java/trunk/contrib/fast-vector-highlighter/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/build.xml?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/build.xml (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/build.xml Thu Jul 9 13:06:51 2009
@@ -0,0 +1,47 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<project name="fast-vector-highlighter" default="default">
+
+ <description>
+ Hits highlighter using TermVectors
+ </description>
+
+ <property name="javac.source" value="1.5" />
+ <property name="javac.target" value="1.5" />
+
+ <import file="../contrib-build.xml"/>
+
+ <property name="analyzers.jar" location="${common.dir}/build/contrib/analyzers/lucene-analyzers-${version}.jar"/>
+ <available property="analyzers.jar.present" type="file" file="${analyzers.jar}"/>
+
+ <path id="classpath">
+ <pathelement path="${lucene.jar}"/>
+ <pathelement path="${analyzers.jar}"/>
+ <pathelement path="${project.classpath}"/>
+ </path>
+
+ <target name="compile-core" depends="build-analyzers, common.compile-core" />
+
+ <target name="build-analyzers" unless="analyzers.jar.present">
+ <echo>Fast Vector Highlighter building dependency ${analyzers.jar}</echo>
+ <ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
+ </target>
+
+</project>
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,124 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.MapFieldSelector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
+
+public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
+
+ protected String[] preTags, postTags;
+ public static final String[] COLORED_PRE_TAGS = {
+ "<b style=\"background:yellow\">", "<b style=\"background:lawngreen\">", "<b style=\"background:aquamarine\">",
+ "<b style=\"background:magenta\">", "<b style=\"background:palegreen\">", "<b style=\"background:coral\">",
+ "<b style=\"background:wheat\">", "<b style=\"background:khaki\">", "<b style=\"background:lime\">",
+ "<b style=\"background:deepskyblue\">"
+ };
+ public static final String[] COLORED_POST_TAGS = { "</b>" };
+
+ protected BaseFragmentsBuilder(){
+ this( new String[]{ "<b>" }, new String[]{ "</b>" } );
+ }
+
+ protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){
+ this.preTags = preTags;
+ this.postTags = postTags;
+ }
+
+ static Object checkTagsArgument( Object tags ){
+ if( tags instanceof String ) return tags;
+ else if( tags instanceof String[] ) return tags;
+ throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" );
+ }
+
+ public abstract List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src );
+
+ public String createFragment( IndexReader reader, int docId,
+ String fieldName, FieldFragList fieldFragList ) throws IOException {
+ String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1 );
+ if( fragments == null || fragments.length == 0 ) return null;
+ return fragments[0];
+ }
+
+ public String[] createFragments( IndexReader reader, int docId,
+ String fieldName, FieldFragList fieldFragList, int maxNumFragments )
+ throws IOException {
+ if( maxNumFragments < 0 )
+ throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
+
+ List<WeightedFragInfo> fragInfos = getWeightedFragInfoList( fieldFragList.fragInfos );
+
+ List<String> fragments = new ArrayList<String>( maxNumFragments );
+ String[] values = getFieldValues( reader, docId, fieldName );
+ StringBuilder buffer = new StringBuilder();
+ int[] nextValueIndex = { 0 };
+ for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){
+ WeightedFragInfo fragInfo = fragInfos.get( n );
+ fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo ) );
+ }
+ return fragments.toArray( new String[fragments.size()] );
+ }
+
+ protected String[] getFieldValues( IndexReader reader, int docId, String fieldName) throws IOException {
+ Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) );
+ return doc.getValues( fieldName );
+ }
+
+ protected String makeFragment( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){
+ StringBuilder fragment = new StringBuilder();
+ final int s = fragInfo.startOffset;
+ String src = getFragmentSource( buffer, index, values, s, fragInfo.endOffset );
+ int srcIndex = 0;
+ for( SubInfo subInfo : fragInfo.subInfos ){
+ for( Toffs to : subInfo.termsOffsets ){
+ fragment.append( src.substring( srcIndex, to.startOffset - s ) ).append( getPreTag( subInfo.seqnum ) )
+ .append( src.substring( to.startOffset - s, to.endOffset - s ) ).append( getPostTag( subInfo.seqnum ) );
+ srcIndex = to.endOffset - s;
+ }
+ }
+ fragment.append( src.substring( srcIndex ) );
+ return fragment.toString();
+ }
+
+ protected String getFragmentSource( StringBuilder buffer, int[] index, String[] values,
+ int startOffset, int endOffset ){
+ while( buffer.length() < endOffset && index[0] < values.length ){
+ if( index[0] > 0 && values[index[0]].length() > 0 )
+ buffer.append( ' ' );
+ buffer.append( values[index[0]++] );
+ }
+ int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
+ return buffer.substring( startOffset, eo );
+ }
+
+ protected String getPreTag( int num ){
+ return preTags.length > num ? preTags[num] : preTags[0];
+ }
+
+ protected String getPostTag( int num ){
+ return postTags.length > num ? postTags[num] : postTags[0];
+ }
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,137 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Query;
+
+/**
+ * Another highlighter implementation.
+ *
+ */
+public class FastVectorHighlighter {
+
+ public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true;
+ public static final boolean DEFAULT_FIELD_MATCH = true;
+ private final boolean phraseHighlight;
+ private final boolean fieldMatch;
+ private final FragListBuilder fragListBuilder;
+ private final FragmentsBuilder fragmentsBuilder;
+
+ /**
+ * the default constructor.
+ */
+ public FastVectorHighlighter(){
+ this( DEFAULT_PHRASE_HIGHLIGHT, DEFAULT_FIELD_MATCH );
+ }
+
+ /**
+ * a constructor. Using SimpleFragListBuilder and ScoreOrderFragmentsBuilder.
+ *
+ * @param phraseHighlight true or false for phrase highlighting
+ * @param fieldMatch true or false for field matching
+ */
+ public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch ){
+ this( phraseHighlight, fieldMatch, new SimpleFragListBuilder(), new ScoreOrderFragmentsBuilder() );
+ }
+
+ /**
+ * a constructor. A FragListBuilder and a FragmentsBuilder can be specified (plugins).
+ *
+ * @param phraseHighlight true or false for phrase highlighting
+ * @param fieldMatch true or false for field matching
+ * @param fragListBuilder an instance of FragListBuilder
+ * @param fragmentsBuilder an instance of FragmentsBuilder
+ */
+ public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch,
+ FragListBuilder fragListBuilder, FragmentsBuilder fragmentsBuilder ){
+ this.phraseHighlight = phraseHighlight;
+ this.fieldMatch = fieldMatch;
+ this.fragListBuilder = fragListBuilder;
+ this.fragmentsBuilder = fragmentsBuilder;
+ }
+
+ /**
+ * create a FieldQuery object.
+ *
+ * @param query a query
+ * @return the created FieldQuery object
+ */
+ public FieldQuery getFieldQuery( Query query ){
+ return new FieldQuery( query, phraseHighlight, fieldMatch );
+ }
+
+ /**
+ * return the best fragment.
+ *
+ * @param fieldQuery FieldQuery object
+ * @param reader IndexReader of the index
+ * @param docId document id to be highlighted
+ * @param fieldName field of the document to be highlighted
+ * @param fragCharSize the length (number of chars) of a fragment
+ * @return the best fragment (snippet) string
+ * @throws IOException
+ */
+ public final String getBestFragment( final FieldQuery fieldQuery, IndexReader reader, int docId,
+ String fieldName, int fragCharSize ) throws IOException {
+ FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize );
+ return fragmentsBuilder.createFragment( reader, docId, fieldName, fieldFragList );
+ }
+
+ /**
+ * return the best fragments.
+ *
+ * @param fieldQuery FieldQuery object
+ * @param reader IndexReader of the index
+ * @param docId document id to be highlighted
+ * @param fieldName field of the document to be highlighted
+ * @param fragCharSize the length (number of chars) of a fragment
+ * @param maxNumFragments maximum number of fragments
+ * @return created fragments or null when no fragments created.
+ * size of the array can be less than maxNumFragments
+ * @throws IOException
+ */
+ public final String[] getBestFragments( final FieldQuery fieldQuery, IndexReader reader, int docId,
+ String fieldName, int fragCharSize, int maxNumFragments ) throws IOException {
+ FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize );
+ return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments );
+ }
+
+ private FieldFragList getFieldFragList( final FieldQuery fieldQuery, IndexReader reader, int docId,
+ String fieldName, int fragCharSize ) throws IOException {
+ FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery );
+ FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery );
+ return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize );
+ }
+
+ /**
+ * Returns whether phrase highlighting is enabled.
+ *
+ * @return the phraseHighlight flag passed to the constructor
+ */
+ public boolean isPhraseHighlight(){ return phraseHighlight; }
+
+ /**
+ * Returns whether field matching is enabled.
+ *
+ * @return the fieldMatch flag passed to the constructor
+ */
+ public boolean isFieldMatch(){ return fieldMatch; }
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,103 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
+
+/**
+ * FieldFragList has a list of "frag info" that is used by FragmentsBuilder class
+ * to create fragments (snippets).
+ */
+public class FieldFragList {
+
+ private final int fragCharSize;
+ List<WeightedFragInfo> fragInfos = new ArrayList<WeightedFragInfo>();
+
+ /**
+ * a constructor.
+ *
+ * @param fragCharSize the length (number of chars) of a fragment
+ */
+ public FieldFragList( int fragCharSize ){
+ this.fragCharSize = fragCharSize;
+ }
+
+ /**
+ * convert the list of WeightedPhraseInfo to WeightedFragInfo, then add it to the fragInfos
+ *
+ * @param startOffset start offset of the fragment
+ * @param endOffset end offset of the fragment
+ * @param phraseInfoList list of WeightedPhraseInfo objects
+ */
+ public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ){
+ fragInfos.add( new WeightedFragInfo( startOffset, endOffset, phraseInfoList ) );
+ }
+
+ public static class WeightedFragInfo {
+
+ List<SubInfo> subInfos;
+ float totalBoost;
+ int startOffset;
+ int endOffset;
+
+ public WeightedFragInfo( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ){
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ subInfos = new ArrayList<SubInfo>();
+ for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
+ SubInfo subInfo = new SubInfo( phraseInfo.text, phraseInfo.termsOffsets, phraseInfo.seqnum );
+ subInfos.add( subInfo );
+ totalBoost += phraseInfo.boost;
+ }
+ }
+
+ public String toString(){
+ StringBuilder sb = new StringBuilder();
+ sb.append( "subInfos=(" );
+ for( SubInfo si : subInfos )
+ sb.append( si.toString() );
+ sb.append( ")/" ).append( totalBoost ).append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' );
+ return sb.toString();
+ }
+
+ static class SubInfo {
+ final String text; // unnecessary member, just exists for debugging purpose
+ final List<Toffs> termsOffsets; // usually termsOffsets.size() == 1,
+ // but if position-gap > 1 and slop > 0 then size() could be greater than 1
+ int seqnum;
+ SubInfo( String text, List<Toffs> termsOffsets, int seqnum ){
+ this.text = text;
+ this.termsOffsets = termsOffsets;
+ this.seqnum = seqnum;
+ }
+
+ public String toString(){
+ StringBuilder sb = new StringBuilder();
+ sb.append( text ).append( '(' );
+ for( Toffs to : termsOffsets )
+ sb.append( to.toString() );
+ sb.append( ')' );
+ return sb.toString();
+ }
+ }
+ }
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,183 @@
+package org.apache.lucene.search.vectorhighlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+
+/**
+ * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder
+ * to create a FieldFragList object.
+ */
+public class FieldPhraseList {
+
+ LinkedList<WeightedPhraseInfo> phraseList = new LinkedList<WeightedPhraseInfo>();
+
+ /**
+ * a constructor.
+ *
+ * @param fieldTermStack FieldTermStack object
+ * @param fieldQuery FieldQuery object
+ */
+ public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery ){
+ final String field = fieldTermStack.getFieldName();
+
+ LinkedList<TermInfo> phraseCandidate = new LinkedList<TermInfo>();
+ QueryPhraseMap currMap = null;
+ QueryPhraseMap nextMap = null;
+ while( !fieldTermStack.isEmpty() ){
+
+ phraseCandidate.clear();
+
+ TermInfo ti = fieldTermStack.pop();
+ currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
+
+ // if not found, discard top TermInfo from stack, then try next element
+ if( currMap == null ) continue;
+
+ // if found, search the longest phrase
+ phraseCandidate.add( ti );
+ while( true ){
+ ti = fieldTermStack.pop();
+ nextMap = null;
+ if( ti != null )
+ nextMap = currMap.getTermMap( ti.getText() );
+ if( ti == null || nextMap == null ){
+ if( ti != null )
+ fieldTermStack.push( ti );
+ if( currMap.isValidTermOrPhrase( phraseCandidate ) ){
+ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
+ }
+ else{
+ while( phraseCandidate.size() > 1 ){
+ fieldTermStack.push( phraseCandidate.removeLast() );
+ currMap = fieldQuery.searchPhrase( field, phraseCandidate );
+ if( currMap != null ){
+ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
+ break;
+ }
+ }
+ }
+ break;
+ }
+ else{
+ phraseCandidate.add( ti );
+ currMap = nextMap;
+ }
+ }
+ }
+ }
+
+ void addIfNoOverlap( WeightedPhraseInfo wpi ){
+ for( WeightedPhraseInfo existWpi : phraseList ){
+ if( existWpi.isOffsetOverlap( wpi ) ) return;
+ }
+ phraseList.add( wpi );
+ }
+
+ public static class WeightedPhraseInfo {
+
+ String text; // unnecessary member, just exists for debugging purpose
+ List<Toffs> termsOffsets; // usually termsOffsets.size() == 1,
+ // but if position-gap > 1 and slop > 0 then size() could be greater than 1
+ float boost; // query boost
+ int seqnum;
+
+ public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost ){
+ this( terms, boost, 0 );
+ }
+
+ public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int number ){
+ this.boost = boost;
+ this.seqnum = number;
+ termsOffsets = new ArrayList<Toffs>( terms.size() );
+ TermInfo ti = terms.get( 0 );
+ termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
+ if( terms.size() == 1 ){
+ text = ti.getText();
+ return;
+ }
+ StringBuilder sb = new StringBuilder();
+ sb.append( ti.getText() );
+ int pos = ti.getPosition();
+ for( int i = 1; i < terms.size(); i++ ){
+ ti = terms.get( i );
+ sb.append( ti.getText() );
+ if( ti.getPosition() - pos == 1 ){
+ Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
+ to.setEndOffset( ti.getEndOffset() );
+ }
+ else{
+ termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
+ }
+ pos = ti.getPosition();
+ }
+ text = sb.toString();
+ }
+
+ public int getStartOffset(){
+ return termsOffsets.get( 0 ).startOffset;
+ }
+
+ public int getEndOffset(){
+ return termsOffsets.get( termsOffsets.size() - 1 ).endOffset;
+ }
+
+ public boolean isOffsetOverlap( WeightedPhraseInfo other ){
+ int so = getStartOffset();
+ int eo = getEndOffset();
+ int oso = other.getStartOffset();
+ int oeo = other.getEndOffset();
+ if( so <= oso && oso <= eo ) return true;
+ if( so <= oeo && oeo <= eo ) return true;
+ if( oso <= so && so <= oeo ) return true;
+ if( oso <= eo && eo <= oeo ) return true;
+ return false;
+ }
+
+ public String toString(){
+ StringBuilder sb = new StringBuilder();
+ sb.append( text ).append( '(' ).append( boost ).append( ")(" );
+ for( Toffs to : termsOffsets ){
+ sb.append( to );
+ }
+ sb.append( ')' );
+ return sb.toString();
+ }
+
+ public static class Toffs {
+ int startOffset;
+ int endOffset;
+ public Toffs( int startOffset, int endOffset ){
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ }
+ void setEndOffset( int endOffset ){
+ this.endOffset = endOffset;
+ }
+ public String toString(){
+ StringBuilder sb = new StringBuilder();
+ sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' );
+ return sb.toString();
+ }
+ }
+ }
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,391 @@
+package org.apache.lucene.search.vectorhighlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+
+/**
+ * FieldQuery breaks down a query object into terms/phrases and keeps
+ * them in a QueryPhraseMap structure.
+ */
+public class FieldQuery {
+
+ final boolean fieldMatch;
+
+ // fieldMatch==true, Map<fieldName,QueryPhraseMap>
+ // fieldMatch==false, Map<null,QueryPhraseMap>
+ Map<String, QueryPhraseMap> rootMaps = new HashMap<String, QueryPhraseMap>();
+
+ // fieldMatch==true, Map<fieldName,setOfTermsInQueries>
+ // fieldMatch==false, Map<null,setOfTermsInQueries>
+ Map<String, Set<String>> termSetMap = new HashMap<String, Set<String>>();
+
+ int termOrPhraseNumber; // used for colored tag support
+
+ /**
+  * Flattens <code>query</code>, records its terms, expands overlapping
+  * phrases and registers every resulting query in the matching root map.
+  *
+  * @param query the query to break down
+  * @param phraseHighlight when false, each term of a multi-term phrase is
+  *        additionally registered as a stand-alone term
+  * @param fieldMatch when true, terms and phrases are kept per field name;
+  *        when false, everything is kept under the <code>null</code> key
+  */
+ FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){
+   this.fieldMatch = fieldMatch;
+
+   Set<Query> flattened = new HashSet<Query>();
+   flatten( query, flattened );
+   saveTerms( flattened );
+
+   for( Query q : expand( flattened ) ){
+     QueryPhraseMap root = getRootMap( q );
+     root.add( q );
+
+     // only multi-term phrases get their terms registered individually,
+     // and only when phrase highlighting is switched off
+     if( phraseHighlight || !( q instanceof PhraseQuery ) ) continue;
+     Term[] phraseTerms = ((PhraseQuery)q).getTerms();
+     if( phraseTerms.length <= 1 ) continue;
+     for( Term t : phraseTerms )
+       root.addTerm( t, q.getBoost() );
+   }
+ }
+
+ /**
+  * Collects the TermQuery and PhraseQuery leaves of <code>sourceQuery</code>
+  * into <code>flatQueries</code>.
+  * <ul>
+  * <li>BooleanQuery: recurses into every clause that is not prohibited.</li>
+  * <li>TermQuery: added as is.</li>
+  * <li>PhraseQuery with several terms: added as is.</li>
+  * <li>PhraseQuery with exactly one term: replaced by an equivalent TermQuery
+  *     that carries the phrase's boost.</li>
+  * <li>Anything else (including an empty PhraseQuery) is discarded.</li>
+  * </ul>
+  * A query already contained in <code>flatQueries</code> is not added again.
+  *
+  * @param sourceQuery the query to flatten
+  * @param flatQueries receives the flattened queries
+  */
+ void flatten( Query sourceQuery, Collection<Query> flatQueries ){
+   if( sourceQuery instanceof BooleanQuery ){
+     BooleanQuery bq = (BooleanQuery)sourceQuery;
+     for( BooleanClause clause : bq.getClauses() ){
+       if( !clause.isProhibited() )
+         flatten( clause.getQuery(), flatQueries );
+     }
+   }
+   else if( sourceQuery instanceof TermQuery ){
+     if( !flatQueries.contains( sourceQuery ) )
+       flatQueries.add( sourceQuery );
+   }
+   else if( sourceQuery instanceof PhraseQuery ){
+     PhraseQuery pq = (PhraseQuery)sourceQuery;
+     Term[] terms = pq.getTerms();
+     if( terms.length > 1 ){
+       if( !flatQueries.contains( pq ) )
+         flatQueries.add( pq );
+     }
+     else if( terms.length == 1 ){
+       // keep the boost of the phrase; a bare new TermQuery would silently
+       // fall back to the default boost
+       Query flat = new TermQuery( terms[0] );
+       flat.setBoost( pq.getBoost() );
+       if( !flatQueries.contains( flat ) )
+         flatQueries.add( flat );
+     }
+   }
+   // else discard queries
+ }
+
+ /*
+ * Create expandQueries from flatQueries.
+ *
+ * expandQueries := flatQueries + overlapped phrase queries
+ *
+ * ex1) flatQueries={a,b,c}
+ * => expandQueries={a,b,c}
+ * ex2) flatQueries={a,"b c","c d"}
+ * => expandQueries={a,"b c","c d","b c d"}
+ *
+ * NOTE: flatQueries is emptied as a side effect; every element is removed
+ * from it as soon as it has been visited.
+ */
+ Collection<Query> expand( Collection<Query> flatQueries ){
+ Set<Query> expandQueries = new HashSet<Query>();
+ for( Iterator<Query> i = flatQueries.iterator(); i.hasNext(); ){
+ Query query = i.next();
+ // removing the current element first means the inner loop below only sees
+ // the queries that have not been visited yet, so each pair is checked once
+ i.remove();
+ expandQueries.add( query );
+ if( !( query instanceof PhraseQuery ) ) continue;
+ for( Iterator<Query> j = flatQueries.iterator(); j.hasNext(); ){
+ Query qj = j.next();
+ if( !( qj instanceof PhraseQuery ) ) continue;
+ // checkOverlap() tests both orders (query followed by qj, and qj followed by query)
+ checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj );
+ }
+ }
+ return expandQueries;
+ }
+
+ /*
+ * Check if PhraseQuery A and B have overlapped part.
+ *
+ * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"}
+ * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"}
+ * ex3) A="a b", B="c d" => no overlap; expandQueries={}
+ */
+ private void checkOverlap( Collection<Query> expandQueries, PhraseQuery a, PhraseQuery b ){
+ if( a.getSlop() != b.getSlop() ) return;
+ Term[] ats = a.getTerms();
+ Term[] bts = b.getTerms();
+ if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return;
+ checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() );
+ checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() );
+ }
+
+ /*
+ * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries.
+ *
+ * ex1) src="a b", dest="c d" => no overlap
+ * ex2) src="a b", dest="a b c" => no overlap
+ * ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"}
+ * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"}
+ * ex5) src="a b c", dest="b c" => no overlap
+ * ex6) src="a b c", dest="b" => no overlap
+ * ex7) src="a a a a", dest="a a a" => overlap;
+ * expandQueries={"a a a a a","a a a a a a"}
+ */
+ private void checkOverlap( Collection<Query> expandQueries, Term[] src, Term[] dest, int slop, float boost ){
+ // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms
+ // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten()
+ // converts PhraseQuery to TermQuery)
+ for( int i = 1; i < src.length; i++ ){
+ boolean overlap = true;
+ for( int j = i; j < src.length; j++ ){
+ if( !src[j].text().equals( dest[j-i].text() ) ){
+ overlap = false;
+ break;
+ }
+ }
+ if( overlap && src.length - i < dest.length ){
+ PhraseQuery pq = new PhraseQuery();
+ for( Term srcTerm : src )
+ pq.add( srcTerm );
+ for( int k = src.length - i; k < dest.length; k++ ){
+ pq.add( new Term( src[0].field(), dest[k].text() ) );
+ }
+ pq.setSlop( slop );
+ pq.setBoost( boost );
+ if(!expandQueries.contains( pq ) )
+ expandQueries.add( pq );
+ }
+ }
+ }
+
+ QueryPhraseMap getRootMap( Query query ){
+ String key = getKey( query );
+ QueryPhraseMap map = rootMaps.get( key );
+ if( map == null ){
+ map = new QueryPhraseMap( this );
+ rootMaps.put( key, map );
+ }
+ return map;
+ }
+
+ /*
+  * Return the 'key' string of the query: the field name of a TermQuery's
+  * term, or of the first term of a PhraseQuery.
+  * If not fieldMatch, 'key' is always null.
+  * Any other query type is rejected with a RuntimeException.
+  */
+ private String getKey( Query query ){
+   if( !fieldMatch ) return null;
+
+   if( query instanceof TermQuery ){
+     TermQuery tq = (TermQuery)query;
+     return tq.getTerm().field();
+   }
+   if( query instanceof PhraseQuery ){
+     Term[] terms = ((PhraseQuery)query).getTerms();
+     return terms[0].field();
+   }
+   throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
+ }
+
+ /*
+  * Save the text of every term of the flattened queries to termSetMap,
+  * under the key returned by getKey() for the query.
+  *
+  * ex1) q=name:john
+  *      - fieldMatch==true
+  *        termSetMap=Map<"name",Set<"john">>
+  *      - fieldMatch==false
+  *        termSetMap=Map<null,Set<"john">>
+  *
+  * ex2) q=name:john title:manager
+  *      - fieldMatch==true
+  *        termSetMap=Map<"name",Set<"john">,
+  *                       "title",Set<"manager">>
+  *      - fieldMatch==false
+  *        termSetMap=Map<null,Set<"john","manager">>
+  *
+  * ex3) q=name:"john lennon"
+  *      - fieldMatch==true
+  *        termSetMap=Map<"name",Set<"john","lennon">>
+  *      - fieldMatch==false
+  *        termSetMap=Map<null,Set<"john","lennon">>
+  *
+  * A query that is neither a TermQuery nor a PhraseQuery is rejected with
+  * a RuntimeException.
+  */
+ void saveTerms( Collection<Query> flatQueries ){
+   for( Query query : flatQueries ){
+     Set<String> terms = getTermSet( query );
+     if( query instanceof TermQuery ){
+       terms.add( ((TermQuery)query).getTerm().text() );
+       continue;
+     }
+     if( !( query instanceof PhraseQuery ) )
+       throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
+     for( Term phraseTerm : ((PhraseQuery)query).getTerms() )
+       terms.add( phraseTerm.text() );
+   }
+ }
+
+ private Set<String> getTermSet( Query query ){
+ String key = getKey( query );
+ Set<String> set = termSetMap.get( key );
+ if( set == null ){
+ set = new HashSet<String>();
+ termSetMap.put( key, set );
+ }
+ return set;
+ }
+
+ Set<String> getTermSet( String field ){
+ return termSetMap.get( fieldMatch ? field : null );
+ }
+
+ /**
+  * Looks up the first-level map entry of a term.
+  *
+  * @param fieldName field name; ignored unless fieldMatch is on
+  * @param term text of the term to look up
+  * @return the QueryPhraseMap stored for <code>term</code> directly under the
+  *         root map of the field, or null if there is no root map or no entry
+  */
+ public QueryPhraseMap getFieldTermMap( String fieldName, String term ){
+   QueryPhraseMap rootMap = getRootMap( fieldName );
+   if( rootMap == null ) return null;
+   return rootMap.subMap.get( term );
+ }
+
+ /**
+  * Searches the root map of the field for the given phrase candidate.
+  *
+  * @param fieldName field name; ignored unless fieldMatch is on
+  * @param phraseCandidate the terms of the candidate, in phrase order
+  * @return the result of QueryPhraseMap.searchPhrase() on the root map,
+  *         or null if there is no root map for the field
+  */
+ public QueryPhraseMap searchPhrase( String fieldName, final List<TermInfo> phraseCandidate ){
+   final QueryPhraseMap rootMap = getRootMap( fieldName );
+   return rootMap == null ? null : rootMap.searchPhrase( phraseCandidate );
+ }
+
+ private QueryPhraseMap getRootMap( String fieldName ){
+ return rootMaps.get( fieldMatch ? fieldName : null );
+ }
+
+ int nextTermOrPhraseNumber(){
+ return termOrPhraseNumber++;
+ }
+
+ /**
+  * A node of a tree keyed by term text. A path from the root spells out a
+  * term or a phrase; the node where a registered term/phrase ends is marked
+  * terminal and carries its slop, boost and number.
+  */
+ public static class QueryPhraseMap {
+
+   boolean terminal;
+   int slop;   // valid if terminal == true and phraseHighlight == true
+   float boost;  // valid if terminal == true
+   int termOrPhraseNumber;   // valid if terminal == true
+   FieldQuery fieldQuery;
+   Map<String, QueryPhraseMap> subMap = new HashMap<String, QueryPhraseMap>();
+
+   public QueryPhraseMap( FieldQuery fieldQuery ){
+     this.fieldQuery = fieldQuery;
+   }
+
+   /** Registers a single term directly below this node (slop 0). */
+   void addTerm( Term term, float boost ){
+     getOrNewMap( subMap, term.text() ).markTerminal( boost );
+   }
+
+   /** Returns the child stored for <code>term</code>, creating it if absent. */
+   private QueryPhraseMap getOrNewMap( Map<String, QueryPhraseMap> subMap, String term ){
+     QueryPhraseMap child = subMap.get( term );
+     if( child != null ) return child;
+     child = new QueryPhraseMap( fieldQuery );
+     subMap.put( term, child );
+     return child;
+   }
+
+   /**
+    * Registers a TermQuery or a PhraseQuery below this node; any other
+    * query type is rejected with a RuntimeException.
+    */
+   void add( Query query ){
+     if( query instanceof TermQuery ){
+       addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
+       return;
+     }
+     if( !( query instanceof PhraseQuery ) )
+       throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
+
+     PhraseQuery phrase = (PhraseQuery)query;
+     // walk down one level per term; the node of the last term ends the phrase
+     QueryPhraseMap node = null;
+     Map<String, QueryPhraseMap> level = subMap;
+     for( Term term : phrase.getTerms() ){
+       node = getOrNewMap( level, term.text() );
+       level = node.subMap;
+     }
+     node.markTerminal( phrase.getSlop(), phrase.getBoost() );
+   }
+
+   public QueryPhraseMap getTermMap( String term ){
+     return subMap.get( term );
+   }
+
+   private void markTerminal( float boost ){
+     markTerminal( 0, boost );
+   }
+
+   /** Marks this node terminal and draws a fresh number from the FieldQuery. */
+   private void markTerminal( int slop, float boost ){
+     this.terminal = true;
+     this.slop = slop;
+     this.boost = boost;
+     this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
+   }
+
+   public boolean isTerminal(){
+     return terminal;
+   }
+
+   public int getSlop(){
+     return slop;
+   }
+
+   public float getBoost(){
+     return boost;
+   }
+
+   public int getTermOrPhraseNumber(){
+     return termOrPhraseNumber;
+   }
+
+   /**
+    * Follows the texts of <code>phraseCandidate</code> down from this node.
+    *
+    * @return the node reached, if it accepts the candidate according to
+    *         isValidTermOrPhrase(); null otherwise
+    */
+   public QueryPhraseMap searchPhrase( final List<TermInfo> phraseCandidate ){
+     QueryPhraseMap node = this;
+     for( TermInfo ti : phraseCandidate ){
+       node = node.subMap.get( ti.getText() );
+       if( node == null ) return null;
+     }
+     if( !node.isValidTermOrPhrase( phraseCandidate ) ) return null;
+     return node;
+   }
+
+   /**
+    * @return false if this node is not terminal; true for a one-term
+    *         candidate; otherwise true only if every gap between consecutive
+    *         positions (next - previous - 1, as an absolute value) is within slop
+    */
+   public boolean isValidTermOrPhrase( final List<TermInfo> phraseCandidate ){
+     if( !terminal ) return false;
+     if( phraseCandidate.size() == 1 ) return true;
+
+     int previous = phraseCandidate.get( 0 ).getPosition();
+     for( int i = 1; i < phraseCandidate.size(); i++ ){
+       final int current = phraseCandidate.get( i ).getPosition();
+       final int gap = current - previous - 1;
+       if( Math.abs( gap ) > slop ) return false;
+       previous = current;
+     }
+     return true;
+   }
+ }
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,171 @@
+package org.apache.lucene.search.vectorhighlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+/**
+ * <code>FieldTermStack</code> is a stack that keeps query terms in the specified field
+ * of the document to be highlighted.
+ */
+public class FieldTermStack {
+
+ private final String fieldName;
+ LinkedList<TermInfo> termList = new LinkedList<TermInfo>();
+
+ public static void main( String[] args ) throws Exception {
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ QueryParser parser = new QueryParser( "f", analyzer );
+ Query query = parser.parse( "a x:b" );
+ FieldQuery fieldQuery = new FieldQuery( query, true, false );
+
+ Directory dir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter( dir, analyzer, MaxFieldLength.LIMITED );
+ Document doc = new Document();
+ doc.add( new Field( "f", "a a a b b c a b b c d e f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+ doc.add( new Field( "f", "b a b a f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+ writer.addDocument( doc );
+ writer.close();
+
+ IndexReader reader = IndexReader.open( dir );
+ FieldTermStack ftl = new FieldTermStack( reader, 0, "f", fieldQuery );
+ reader.close();
+ }
+
+ /**
+  * Collects, for one field of one document, every occurrence of the terms
+  * saved in <code>fieldQuery</code> and orders them by position.
+  * The stack is left empty (so that no snippet is produced) when the field
+  * has no term vector, when the vector is not a TermPositionVector, or when
+  * no term set is saved for the field. Collection also stops, without
+  * sorting, as soon as a matching term lacks offsets or positions.
+  *
+  * @param reader IndexReader of the index
+  * @param docId document id to be highlighted
+  * @param fieldName field of the document to be highlighted
+  * @param fieldQuery FieldQuery object
+  * @throws IOException if reading the term vector fails
+  */
+ public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException {
+   this.fieldName = fieldName;
+
+   TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName );
+   // instanceof is false for null as well, so this covers both the
+   // "no term vector" and the "wrong vector type" case without relying on
+   // a ClassCastException
+   if( !( tfv instanceof TermPositionVector ) ) return; // just return to make null snippets
+   TermPositionVector tpv = (TermPositionVector)tfv;
+
+   Set<String> termSet = fieldQuery.getTermSet( fieldName );
+   // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
+   if( termSet == null ) return;
+
+   for( String term : tpv.getTerms() ){
+     if( !termSet.contains( term ) ) continue;
+     int index = tpv.indexOf( term );
+     TermVectorOffsetInfo[] tvois = tpv.getOffsets( index );
+     if( tvois == null ) return; // just return to make null snippets
+     int[] poss = tpv.getTermPositions( index );
+     if( poss == null ) return; // just return to make null snippets
+     for( int i = 0; i < tvois.length; i++ )
+       termList.add( new TermInfo( term, tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) );
+   }
+
+   // sort by position
+   Collections.sort( termList );
+ }
+
+ /**
+  * @return the field name this stack was built for
+  */
+ public String getFieldName(){
+   return this.fieldName;
+ }
+
+ /**
+  * Removes and returns the top of the stack.
+  *
+  * @return the top TermInfo object, or null if the stack is empty
+  */
+ public TermInfo pop(){
+   final TermInfo top = termList.poll();
+   return top;
+ }
+
+ /**
+  * Puts a TermInfo back on the top of the stack.
+  *
+  * @param termInfo the TermInfo object to become the new top
+  */
+ public void push( TermInfo termInfo ){
+   // LinkedList.push() would do the same but is a Java 1.6 feature
+   this.termList.addFirst( termInfo );
+ }
+
+ /**
+  * Tells whether any TermInfo is left on the stack.
+  *
+  * @return true if the stack is empty, false if not
+  */
+ public boolean isEmpty(){
+   if( termList == null ) return true;
+   return termList.isEmpty();
+ }
+
+ /**
+  * One occurrence of a term: its text, start/end offsets and position.
+  * The natural order is ascending position.
+  */
+ public static class TermInfo implements Comparable<TermInfo>{
+
+   final String text;
+   final int startOffset;
+   final int endOffset;
+   final int position;
+
+   TermInfo( String text, int startOffset, int endOffset, int position ){
+     this.text = text;
+     this.startOffset = startOffset;
+     this.endOffset = endOffset;
+     this.position = position;
+   }
+
+   public String getText(){ return text; }
+   public int getStartOffset(){ return startOffset; }
+   public int getEndOffset(){ return endOffset; }
+   public int getPosition(){ return position; }
+
+   /** @return <code>text(startOffset,endOffset,position)</code> */
+   public String toString(){
+     StringBuilder sb = new StringBuilder();
+     sb.append( text ).append( '(' ).append(startOffset).append( ',' ).append( endOffset ).append( ',' ).append( position ).append( ')' );
+     return sb.toString();
+   }
+
+   /**
+    * Orders by position.
+    *
+    * @return a negative value, zero or a positive value as this position is
+    *         less than, equal to or greater than the position of <code>o</code>
+    */
+   public int compareTo( TermInfo o ) {
+     // explicit comparison instead of subtraction: a difference of two ints
+     // can overflow and flip the sign
+     if( this.position < o.position ) return -1;
+     if( this.position > o.position ) return 1;
+     return 0;
+   }
+ }
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,34 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * FragListBuilder is an interface for FieldFragList builder classes.
+ * A FragListBuilder class can be plugged in to Highlighter.
+ */
+public interface FragListBuilder {
+
+ /**
+ * create a FieldFragList.
+ *
+ * @param fieldPhraseList FieldPhraseList object
+ * @param fragCharSize the length (number of chars) of a fragment
+ * @return the created FieldFragList object
+ */
+ public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize );
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,57 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+
+/**
+ * FragmentsBuilder is an interface for fragments (snippets) builder classes.
+ * A FragmentsBuilder class can be plugged in to Highlighter.
+ */
+public interface FragmentsBuilder {
+
+ /**
+ * create a fragment.
+ *
+ * @param reader IndexReader of the index
+ * @param docId document id to be highlighted
+ * @param fieldName field of the document to be highlighted
+ * @param fieldFragList FieldFragList object
+ * @return a created fragment or null when no fragment created
+ * @throws IOException
+ */
+ public String createFragment( IndexReader reader, int docId, String fieldName,
+ FieldFragList fieldFragList ) throws IOException;
+
+ /**
+ * create multiple fragments.
+ *
+ * @param reader IndexReader of the index
+ * @param docId document id to be highlighter
+ * @param fieldName field of the document to be highlighted
+ * @param fieldFragList FieldFragList object
+ * @param maxNumFragments maximum number of fragments
+ * @return created fragments or null when no fragments created.
+ * size of the array can be less than maxNumFragments
+ * @throws IOException
+ */
+ public String[] createFragments( IndexReader reader, int docId, String fieldName,
+ FieldFragList fieldFragList, int maxNumFragments ) throws IOException;
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,69 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
+
+/**
+ * An implementation of FragmentsBuilder that outputs score-order fragments.
+ */
+public class ScoreOrderFragmentsBuilder extends BaseFragmentsBuilder {
+
+ /**
+ * a constructor.
+ */
+ public ScoreOrderFragmentsBuilder(){
+ super();
+ }
+
+ /**
+ * a constructor.
+ *
+ * @param preTags aray of pre-tags for markup terms.
+ * @param postTags array of post-tags for markup terms.
+ */
+ public ScoreOrderFragmentsBuilder( String[] preTags, String[] postTags ){
+ super( preTags, postTags );
+ }
+
+ /**
+ * Sort by score the list of WeightedFragInfo
+ */
+ public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) {
+ Collections.sort( src, new ScoreComparator() );
+ return src;
+ }
+
+ public static class ScoreComparator implements Comparator<WeightedFragInfo> {
+
+ public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) {
+ if( o1.totalBoost > o2.totalBoost ) return -1;
+ else if( o1.totalBoost < o2.totalBoost ) return 1;
+ // if same score then check startOffset
+ else{
+ if( o1.startOffset < o2.startOffset ) return -1;
+ else if( o1.startOffset > o2.startOffset ) return 1;
+ }
+ return 0;
+ }
+ }
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,82 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
+
+/**
+ * A simple implementation of FragListBuilder.
+ */
+public class SimpleFragListBuilder implements FragListBuilder {
+
+ public static final int MARGIN = 6;
+ public static final int MIN_FRAG_CHAR_SIZE = MARGIN * 3;
+
+ public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) {
+ if( fragCharSize < MIN_FRAG_CHAR_SIZE )
+ throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " +
+ MIN_FRAG_CHAR_SIZE + " or higher." );
+
+ FieldFragList ffl = new FieldFragList( fragCharSize );
+
+ List<WeightedPhraseInfo> wpil = new ArrayList<WeightedPhraseInfo>();
+ Iterator<WeightedPhraseInfo> ite = fieldPhraseList.phraseList.iterator();
+ WeightedPhraseInfo phraseInfo = null;
+ int startOffset = 0;
+ boolean taken = false;
+ while( true ){
+ if( !taken ){
+ if( !ite.hasNext() ) break;
+ phraseInfo = ite.next();
+ }
+ taken = false;
+ if( phraseInfo == null ) break;
+
+ // if the phrase violates the border of previous fragment, discard it and try next phrase
+ if( phraseInfo.getStartOffset() < startOffset ) continue;
+
+ wpil.clear();
+ wpil.add( phraseInfo );
+ int st = phraseInfo.getStartOffset() - MARGIN < startOffset ?
+ startOffset : phraseInfo.getStartOffset() - MARGIN;
+ int en = st + fragCharSize;
+ startOffset = en;
+
+ while( true ){
+ if( ite.hasNext() ){
+ phraseInfo = ite.next();
+ taken = true;
+ if( phraseInfo == null ) break;
+ }
+ else
+ break;
+ if( phraseInfo.getEndOffset() <= en )
+ wpil.add( phraseInfo );
+ else
+ break;
+ }
+ ffl.add( st, en, wpil );
+ }
+ return ffl;
+ }
+
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,53 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.List;
+
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
+
+/**
+ * A simple implementation of FragmentsBuilder.
+ * It hands back the fragment info list it is given without changing it.
+ */
+public class SimpleFragmentsBuilder extends BaseFragmentsBuilder {
+
+ /**
+ * Creates a builder by calling the no-argument superclass constructor.
+ */
+ public SimpleFragmentsBuilder() {
+ super();
+ }
+
+ /**
+ * Creates a builder with the given markup tags, which are passed to the superclass.
+ *
+ * @param preTags array of pre-tags for markup terms.
+ * @param postTags array of post-tags for markup terms.
+ */
+ public SimpleFragmentsBuilder( String[] preTags, String[] postTags ) {
+ super( preTags, postTags );
+ }
+
+ /**
+ * Does nothing to the list: returns the very same source list instance.
+ */
+ public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) {
+ return src;
+ }
+}
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html Thu Jul 9 13:06:51 2009
@@ -0,0 +1,126 @@
+<html>
+<body>
+This is another highlighter implementation.
+
+<h2>Features</h2>
+<ul>
+<li>fast for large docs</li>
+<li>support N-gram fields</li>
+<li>support phrase-unit highlighting with slops</li>
+<li>need Java 1.5</li>
+<li>highlight fields need to be TermVector.WITH_POSITIONS_OFFSETS</li>
+<li>take into account query boost to score fragments</li>
+<li>support colored highlight tags</li>
+<li>pluggable FragListBuilder</li>
+<li>pluggable FragmentsBuilder</li>
+</ul>
+
+<h2>Algorithm</h2>
+<p>To explain the algorithm, let's use the following sample text
+ (to be highlighted) and user query:</p>
+
+<table border=1>
+<tr>
+<td><b>Sample Text</b></td>
+<td>Lucene is a search engine library.</td>
+</tr>
+<tr>
+<td><b>User Query</b></td>
+<td>Lucene^2 OR "search library"~1</td>
+</tr>
+</table>
+
+<p>The user query is a BooleanQuery that consists of TermQuery("Lucene")
+with boost of 2 and PhraseQuery("search library") with slop of 1.</p>
+<p>For your convenience, here are the offsets and positions of the
+sample text.</p>
+
+<pre>
++--------+-----------------------------------+
+|        |          1111111111222222222233333|
+|  offset|01234567890123456789012345678901234|
++--------+-----------------------------------+
+|document|Lucene is a search engine library. |
++--------*-----------------------------------+
+|position|0      1  2 3      4      5        |
++--------*-----------------------------------+
+</pre>
+
+<h3>Step 1.</h3>
+<p>In Step 1, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap} from the user query.
+<code>QueryPhraseMap</code> consists of the following members:</p>
+<pre>
+public class QueryPhraseMap {
+ boolean terminal;
+ int slop; // valid if terminal == true and phraseHighlight == true
+ float boost; // valid if terminal == true
+ Map&lt;String, QueryPhraseMap&gt; subMap;
+}
+</pre>
+<p><code>QueryPhraseMap</code> has subMap. The key of the subMap is a term
+text in the user query and the value is a subsequent <code>QueryPhraseMap</code>.
+If the query is a term (not phrase), then the subsequent <code>QueryPhraseMap</code>
+is marked as terminal. If the query is a phrase, then the subsequent <code>QueryPhraseMap</code>
+is not a terminal and it has the next term text in the phrase.</p>
+
+<p>From the sample user query, the following <code>QueryPhraseMap</code>
+will be generated:</p>
+<pre>
+ QueryPhraseMap
++--------+-+  +-------+-+
+|"Lucene"|o+->|boost=2|*|  * : terminal
++--------+-+  +-------+-+
+
++--------+-+  +---------+-+  +-------+------+-+
+|"search"|o+->|"library"|o+->|boost=1|slop=1|*|
++--------+-+  +---------+-+  +-------+------+-+
+</pre>
+
+<h3>Step 2.</h3>
+<p>In Step 2, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldTermStack}. Fast Vector Highlighter uses {@link org.apache.lucene.index.TermFreqVector} data
+(which must be stored with {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS_OFFSETS})
+to generate it. <code>FieldTermStack</code> keeps only the terms of the text that appear in the user query.
+Therefore, in this sample case, Fast Vector Highlighter generates the following <code>FieldTermStack</code>:</p>
+<pre>
+ FieldTermStack
++------------------+
+|"Lucene"(0,6,0)   |
++------------------+
+|"search"(12,18,3) |
++------------------+
+|"library"(26,33,5)|
++------------------+
+where : "termText"(startOffset,endOffset,position)
+</pre>
+<h3>Step 3.</h3>
+<p>In Step 3, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldPhraseList}
+by reference to <code>QueryPhraseMap</code> and <code>FieldTermStack</code>.</p>
+<pre>
+ FieldPhraseList
++----------------+-----------------+---+
+|"Lucene"        |[(0,6)]          |w=2|
++----------------+-----------------+---+
+|"search library"|[(12,18),(26,33)]|w=1|
++----------------+-----------------+---+
+</pre>
+<p>Each entry is a <code>WeightedPhraseInfo</code>, which consists of
+an array of term offsets and a weight. The weight (Fast Vector Highlighter uses query boost to
+calculate the weight) will be taken into account when Fast Vector Highlighter creates
+{@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.</p>
+<h3>Step 4.</h3>
+<p>In Step 4, Fast Vector Highlighter creates <code>FieldFragList</code> by reference to
+<code>FieldPhraseList</code>. In this sample case, the following
+<code>FieldFragList</code> will be generated:</p>
+<pre>
+ FieldFragList
++---------------------------------+
+|"Lucene"[(0,6)]                  |
+|"search library"[(12,18),(26,33)]|
+|totalBoost=3                     |
++---------------------------------+
+</pre>
+<h3>Step 5.</h3>
+<p>In Step 5, by using <code>FieldFragList</code> and the field stored data,
+Fast Vector Highlighter creates highlighted snippets!</p>
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java?rev=792542&view=auto
==============================================================================
--- lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (added)
+++ lucene/java/trunk/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java Thu Jul 9 13:06:51 2009
@@ -0,0 +1,345 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collection;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+public abstract class AbstractTestCase extends TestCase {
+ // Shared base for the vectorhighlight tests: query builders, a bigram test tokenizer and index builders.
+ protected final String F = "f"; // default field name used by the helpers below
+ protected final String F1 = "f1";
+ protected final String F2 = "f2";
+ protected Directory dir; // RAMDirectory created in setUp()
+ protected Analyzer analyzerW; // WhitespaceAnalyzer, created in setUp()
+ protected Analyzer analyzerB; // BigramAnalyzer (declared below), created in setUp()
+ protected IndexReader reader; // opened by make1dmfIndex(Analyzer, String...), closed in tearDown()
+ protected QueryParser paW; // query parser on field F using analyzerW
+ protected QueryParser paB; // query parser on field F using analyzerB
+
+ protected static final String[] shortMVValues = {
+ "a b c",
+ "", // empty data in multi valued field
+ "d e"
+ };
+
+ protected static final String[] longMVValues = {
+ "Followings are the examples of customizable parameters and actual examples of customization:",
+ "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
+ };
+
+ // test data for LUCENE-1448 bug (indexed with the bigram analyzer by makeIndexLongMVB)
+ protected static final String[] biMVValues = {
+ "\nLucene/Solr does not require such additional hardware.",
+ "\nWhen you talk about processing speed, the"
+ };
+ // Creates both analyzers, one QueryParser per analyzer on field F, and a RAMDirectory.
+ protected void setUp() throws Exception {
+ analyzerW = new WhitespaceAnalyzer();
+ analyzerB = new BigramAnalyzer();
+ paW = new QueryParser( F, analyzerW );
+ paB = new QueryParser( F, analyzerB );
+ dir = new RAMDirectory();
+ }
+ // Closes the reader if one of the index builders opened it.
+ protected void tearDown() throws Exception {
+ if( reader != null ){
+ reader.close();
+ reader = null;
+ }
+ }
+ // tq(...): builds a TermQuery; boost defaults to 1 and the field defaults to F.
+ protected Query tq( String text ){
+ return tq( 1F, text );
+ }
+
+ protected Query tq( float boost, String text ){
+ return tq( boost, F, text );
+ }
+
+ protected Query tq( String field, String text ){
+ return tq( 1F, field, text );
+ }
+
+ protected Query tq( float boost, String field, String text ){
+ Query query = new TermQuery( new Term( field, text ) );
+ query.setBoost( boost );
+ return query;
+ }
+ // pqF(...): builds a PhraseQuery on field F; boost defaults to 1 and slop to 0.
+ protected Query pqF( String... texts ){
+ return pqF( 1F, texts );
+ }
+
+ protected Query pqF( float boost, String... texts ){
+ return pqF( boost, 0, texts );
+ }
+
+ protected Query pqF( float boost, int slop, String... texts ){
+ return pq( boost, slop, F, texts );
+ }
+ // pq(...): builds a PhraseQuery on the given field; boost defaults to 1 and slop to 0.
+ protected Query pq( String field, String... texts ){
+ return pq( 1F, 0, field, texts );
+ }
+
+ protected Query pq( float boost, String field, String... texts ){
+ return pq( boost, 0, field, texts );
+ }
+
+ protected Query pq( float boost, int slop, String field, String... texts ){
+ PhraseQuery query = new PhraseQuery();
+ for( String text : texts ){
+ query.add( new Term( field, text ) ); // terms are added in the order given
+ }
+ query.setBoost( boost );
+ query.setSlop( slop );
+ return query;
+ }
+ // Asserts that actual has as many elements as expected and contains every expected query.
+ protected void assertCollectionQueries( Collection<Query> actual, Query... expected ){
+ assertEquals( expected.length, actual.size() );
+ for( Query query : expected ){
+ assertTrue( actual.contains( query ) );
+ }
+ }
+ // Analyzer whose token stream is a BasicNGramTokenizer with the default n and delimiters.
+ static class BigramAnalyzer extends Analyzer {
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new BasicNGramTokenizer( reader );
+ }
+ }
+ // Tokenizer that cuts the input at delimiter characters and emits sliding n-character grams of each piece.
+ static class BasicNGramTokenizer extends Tokenizer {
+
+ public static final int DEFAULT_N_SIZE = 2;
+ public static final String DEFAULT_DELIMITERS = " \t\n.,";
+ private final int n; // gram size
+ private final String delimiters; // every char of this string is a delimiter
+ private int startTerm; // index of the current gram inside snippet
+ private int lenTerm; // length of the current gram
+ private int startOffset; // offset of the current gram in the input
+ private int nextStartOffset; // number of chars read from the input so far
+ private int ch; // last char read, -1 once the end of input was seen
+ private String snippet; // current delimiter-free run of chars
+ private StringBuilder snippetBuffer;
+ private static final int BUFFER_SIZE = 4096;
+ private char[] charBuffer;
+ private int charBufferIndex; // next unread index in charBuffer
+ private int charBufferLen; // number of valid chars in charBuffer
+
+ public BasicNGramTokenizer( Reader in ){
+ this( in, DEFAULT_N_SIZE );
+ }
+
+ public BasicNGramTokenizer( Reader in, int n ){
+ this( in, n, DEFAULT_DELIMITERS );
+ }
+
+ public BasicNGramTokenizer( Reader in, String delimiters ){
+ this( in, DEFAULT_N_SIZE, delimiters );
+ }
+
+ public BasicNGramTokenizer( Reader in, int n, String delimiters ){
+ super(in);
+ this.n = n;
+ this.delimiters = delimiters;
+ startTerm = 0;
+ nextStartOffset = 0;
+ snippet = null;
+ snippetBuffer = new StringBuilder();
+ charBuffer = new char[BUFFER_SIZE];
+ charBufferIndex = BUFFER_SIZE; // index >= length forces a read on the first call
+ charBufferLen = 0;
+ ch = 0;
+ }
+ // Re-initializes reusableToken with the next gram; returns null when no gram is left.
+ public Token next( Token reusableToken ) throws IOException {
+ if( !getNextPartialSnippet() )
+ return null;
+ reusableToken.reinit( snippet, startTerm, lenTerm, startOffset, startOffset + lenTerm );
+ return reusableToken;
+ }
+ // Returns the number of chars read from the input so far.
+ public int getFinalOffset() {
+ return nextStartOffset;
+ }
+ // Moves one char forward inside the current snippet if a full n-gram still fits, else reads the next snippet.
+ protected boolean getNextPartialSnippet() throws IOException {
+ if( snippet != null && snippet.length() >= startTerm + 1 + n ){
+ startTerm++;
+ startOffset++;
+ lenTerm = n;
+ return true;
+ }
+ return getNextSnippet();
+ }
+ // Reads chars up to the next delimiter into snippet, skipping leading delimiters; false if nothing was read.
+ protected boolean getNextSnippet() throws IOException {
+ startTerm = 0;
+ startOffset = nextStartOffset;
+ snippetBuffer.delete( 0, snippetBuffer.length() );
+ while( true ){
+ if( ch != -1 ) // do not read again once the end of input was seen
+ ch = readCharFromBuffer();
+ if( ch == -1 ) break;
+ else if( !isDelimiter( ch ) )
+ snippetBuffer.append( (char)ch );
+ else if( snippetBuffer.length() > 0 )
+ break;
+ else
+ startOffset++; // leading delimiter: the snippet starts one char later
+ }
+ if( snippetBuffer.length() == 0 )
+ return false;
+ snippet = snippetBuffer.toString();
+ lenTerm = snippet.length() >= n ? n : snippet.length(); // a snippet shorter than n is emitted whole
+ return true;
+ }
+ // Returns the next char from charBuffer, refilling it from input when it is used up; -1 when input.read reports -1.
+ protected int readCharFromBuffer() throws IOException {
+ if( charBufferIndex >= charBufferLen ){
+ charBufferLen = input.read( charBuffer );
+ if( charBufferLen == -1 ){
+ return -1;
+ }
+ charBufferIndex = 0;
+ }
+ int c = (int)charBuffer[charBufferIndex++];
+ nextStartOffset++;
+ return c;
+ }
+ // True if c occurs in the delimiters string.
+ protected boolean isDelimiter( int c ){
+ return delimiters.indexOf( c ) >= 0;
+ }
+ }
+ // Index builders: all variants end in make1dmfIndex(Analyzer, String...); the B-suffixed ones pass analyzerB, the others analyzerW.
+ protected void make1d1fIndex( String value ) throws Exception {
+ make1dmfIndex( value );
+ }
+
+ protected void make1d1fIndexB( String value ) throws Exception {
+ make1dmfIndexB( value );
+ }
+
+ protected void make1dmfIndex( String... values ) throws Exception {
+ make1dmfIndex( analyzerW, values );
+ }
+
+ protected void make1dmfIndexB( String... values ) throws Exception {
+ make1dmfIndex( analyzerB, values );
+ }
+ // Writes one document to dir with one F field instance per value (stored, analyzed, term vectors with positions and offsets), then opens reader on dir.
+ protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception {
+ IndexWriter writer = new IndexWriter( dir, analyzer, true, MaxFieldLength.LIMITED );
+ Document doc = new Document();
+ for( String value: values )
+ doc.add( new Field( F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+ writer.addDocument( doc );
+ writer.close();
+
+ reader = IndexReader.open( dir );
+ }
+
+ protected void makeIndexShortMV() throws Exception {
+
+ //  012345
+ // "a b c"
+ //  0 1 2
+
+ // ""
+
+ //  6789
+ // "d e"
+ //  3 4
+ make1dmfIndex( shortMVValues );
+ }
+
+ protected void makeIndexLongMV() throws Exception {
+ // 11111111112222222222333333333344444444445555555555666666666677777777778888888888999
+ // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
+ // Followings are the examples of customizable parameters and actual examples of customization:
+ // 0 1 2 3 4 5 6 7 8 9 10 11
+
+ // 1 2
+ // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
+ // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
+ // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
+ // 12 13 (14) (15) 16 17 18 19 20 21 22 23 (24) (25) 26 27 28 29 30 31 32 33 34
+
+ make1dmfIndex( longMVValues );
+ }
+
+ protected void makeIndexLongMVB() throws Exception {
+ // "*" ... LF
+
+ // 1111111111222222222233333333334444444444555555
+ // 01234567890123456789012345678901234567890123456789012345
+ // *Lucene/Solr does not require such additional hardware.
+ // Lu 0 do 10 re 15 su 21 na 31
+ // uc 1 oe 11 eq 16 uc 22 al 32
+ // ce 2 es 12 qu 17 ch 23 ha 33
+ // en 3 no 13 ui 18 ad 24 ar 34
+ // ne 4 ot 14 ir 19 dd 25 rd 35
+ // e/ 5 re 20 di 26 dw 36
+ // /S 6 it 27 wa 37
+ // So 7 ti 28 ar 38
+ // ol 8 io 29 re 39
+ // lr 9 on 30
+
+ // 5555666666666677777777778888888888999999999
+ // 6789012345678901234567890123456789012345678
+ // *When you talk about processing speed, the
+ // Wh 40 ab 48 es 56 th 65
+ // he 41 bo 49 ss 57 he 66
+ // en 42 ou 50 si 58
+ // yo 43 ut 51 in 59
+ // ou 44 pr 52 ng 60
+ // ta 45 ro 53 sp 61
+ // al 46 oc 54 pe 62
+ // lk 47 ce 55 ee 63
+ // ed 64
+
+ make1dmfIndexB( biMVValues );
+ }
+}