You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/11/12 22:08:53 UTC
svn commit: r1201328 - in /lucene/dev/branches/lucene2621:
lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/
lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/
lucene/contrib/highlighter/src/test/org/apa...
Author: mikemccand
Date: Sat Nov 12 21:08:53 2011
New Revision: 1201328
URL: http://svn.apache.org/viewvc?rev=1201328&view=rev
Log:
LUCENE-2621: modules tests pass
Modified:
lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java
lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java
lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java
lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Sat Nov 12 21:08:53 2011
@@ -32,7 +32,11 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -66,12 +70,14 @@ public class TokenSources {
String field, Document doc, Analyzer analyzer) throws IOException {
TokenStream ts = null;
- TermFreqVector tfv = reader.getTermFreqVector(docId, field);
- if (tfv != null) {
- if (tfv instanceof TermPositionVector) {
- ts = getTokenStream((TermPositionVector) tfv);
+ Fields vectors = reader.getTermVectors(docId);
+ if (vectors != null) {
+ Terms vector = vectors.terms(field);
+ if (vector != null) {
+ ts = getTokenStream(vector);
}
}
+
// No token info stored so fall back to analyzing raw content
if (ts == null) {
ts = getTokenStream(doc, field, analyzer);
@@ -96,12 +102,14 @@ public class TokenSources {
String field, Analyzer analyzer) throws IOException {
TokenStream ts = null;
- TermFreqVector tfv = reader.getTermFreqVector(docId, field);
- if (tfv != null) {
- if (tfv instanceof TermPositionVector) {
- ts = getTokenStream((TermPositionVector) tfv);
+ Fields vectors = reader.getTermVectors(docId);
+ if (vectors != null) {
+ Terms vector = vectors.terms(field);
+ if (vector != null) {
+ ts = getTokenStream(vector);
}
}
+
// No token info stored so fall back to analyzing raw content
if (ts == null) {
ts = getTokenStream(reader, docId, field, analyzer);
@@ -109,10 +117,25 @@ public class TokenSources {
return ts;
}
- public static TokenStream getTokenStream(TermPositionVector tpv) {
+ public static TokenStream getTokenStream(Terms vector) throws IOException {
// assumes the worst and makes no assumptions about token position
// sequences.
- return getTokenStream(tpv, false);
+ return getTokenStream(vector, false);
+ }
+
+ private static boolean hasPositions(Terms vector) throws IOException {
+ final TermsEnum termsEnum = vector.iterator();
+ if (termsEnum.next() != null) {
+ DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
+ if (dpEnum != null) {
+ int pos = dpEnum.nextPosition();
+ if (pos >= 0) {
+ return true;
+ }
+ }
+ }
+
+ return false;
}
/**
@@ -141,9 +164,10 @@ public class TokenSources {
* numbers have no overlaps or gaps. If looking to eek out the last
* drops of performance, set to true. If in doubt, set to false.
*/
- public static TokenStream getTokenStream(TermPositionVector tpv,
- boolean tokenPositionsGuaranteedContiguous) {
- if (!tokenPositionsGuaranteedContiguous && tpv.getTermPositions(0) != null) {
+ public static TokenStream getTokenStream(Terms tpv,
+ boolean tokenPositionsGuaranteedContiguous)
+ throws IOException {
+ if (!tokenPositionsGuaranteedContiguous && hasPositions(tpv)) {
return new TokenStreamFromTermPositionVector(tpv);
}
@@ -183,56 +207,57 @@ public class TokenSources {
}
}
// code to reconstruct the original sequence of Tokens
- BytesRef[] terms = tpv.getTerms();
- int[] freq = tpv.getTermFrequencies();
+ TermsEnum termsEnum = tpv.iterator();
int totalTokens = 0;
- for (int t = 0; t < freq.length; t++) {
- totalTokens += freq[t];
+ while(termsEnum.next() != null) {
+ totalTokens += (int) termsEnum.totalTermFreq();
}
Token tokensInOriginalOrder[] = new Token[totalTokens];
ArrayList<Token> unsortedTokens = null;
- for (int t = 0; t < freq.length; t++) {
- TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
- if (offsets == null) {
+ termsEnum = tpv.iterator();
+ BytesRef text;
+ DocsAndPositionsEnum dpEnum = null;
+ while ((text = termsEnum.next()) != null) {
+
+ dpEnum = termsEnum.docsAndPositions(null, dpEnum);
+ if (dpEnum == null || (!dpEnum.attributes().hasAttribute(OffsetAttribute.class))) {
throw new IllegalArgumentException(
"Required TermVector Offset information was not found");
}
- int[] pos = null;
- if (tokenPositionsGuaranteedContiguous) {
- // try get the token position info to speed up assembly of tokens into
- // sorted sequence
- pos = tpv.getTermPositions(t);
- }
- if (pos == null) {
- // tokens NOT stored with positions or not guaranteed contiguous - must
- // add to list and sort later
- if (unsortedTokens == null) {
- unsortedTokens = new ArrayList<Token>();
- }
- for (int tp = 0; tp < offsets.length; tp++) {
- Token token = new Token(terms[t].utf8ToString(),
- offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
+ final String term = text.utf8ToString();
+
+ final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
+ dpEnum.nextDoc();
+ final int freq = dpEnum.freq();
+ for(int posUpto=0;posUpto<freq;posUpto++) {
+ final int pos = dpEnum.nextPosition();
+ final Token token = new Token(term,
+ offsetAtt.startOffset(),
+ offsetAtt.endOffset());
+ if (tokenPositionsGuaranteedContiguous && pos != -1) {
+ // We have positions stored and a guarantee that the token position
+ // information is contiguous
+
+ // This may be fast BUT wont work if Tokenizers used which create >1
+ // token in same position or
+ // creates jumps in position numbers - this code would fail under those
+ // circumstances
+
+ // tokens stored with positions - can use this to index straight into
+ // sorted array
+ tokensInOriginalOrder[pos] = token;
+ } else {
+ // tokens NOT stored with positions or not guaranteed contiguous - must
+ // add to list and sort later
+ if (unsortedTokens == null) {
+ unsortedTokens = new ArrayList<Token>();
+ }
unsortedTokens.add(token);
}
- } else {
- // We have positions stored and a guarantee that the token position
- // information is contiguous
-
- // This may be fast BUT wont work if Tokenizers used which create >1
- // token in same position or
- // creates jumps in position numbers - this code would fail under those
- // circumstances
-
- // tokens stored with positions - can use this to index straight into
- // sorted array
- for (int tp = 0; tp < pos.length; tp++) {
- Token token = new Token(terms[t].utf8ToString(),
- offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
- tokensInOriginalOrder[pos[tp]] = token;
- }
}
}
+
// If the field has been stored without position data we must perform a sort
if (unsortedTokens != null) {
tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
@@ -250,18 +275,25 @@ public class TokenSources {
public static TokenStream getTokenStream(IndexReader reader, int docId,
String field) throws IOException {
- TermFreqVector tfv = reader.getTermFreqVector(docId, field);
- if (tfv == null) {
+
+ Fields vectors = reader.getTermVectors(docId);
+ if (vectors == null) {
throw new IllegalArgumentException(field + " in doc #" + docId
+ "does not have any term position data stored");
}
- if (tfv instanceof TermPositionVector) {
- TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(
- docId, field);
- return getTokenStream(tpv);
+
+ Terms vector = vectors.terms(field);
+ if (vector == null) {
+ throw new IllegalArgumentException(field + " in doc #" + docId
+ + "does not have any term position data stored");
+ }
+
+ if (!hasPositions(vector)) {
+ throw new IllegalArgumentException(field + " in doc #" + docId
+ + "does not have any term position data stored");
}
- throw new IllegalArgumentException(field + " in doc #" + docId
- + "does not have any term position data stored");
+
+ return getTokenStream(vector);
}
// convenience method
Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java Sat Nov 12 21:08:53 2011
@@ -27,8 +27,9 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.index.TermPositionVector;
-import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
@@ -51,27 +52,41 @@ public final class TokenStreamFromTermPo
* creating the TokenStream. Must have positions and offsets.
*/
public TokenStreamFromTermPositionVector(
- final TermPositionVector termPositionVector) {
+ final Terms vector) throws IOException {
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
offsetAttribute = addAttribute(OffsetAttribute.class);
- final BytesRef[] terms = termPositionVector.getTerms();
- for (int i = 0; i < terms.length; i++) {
- final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(i);
- final int[] termPositions = termPositionVector.getTermPositions(i);
- for (int j = 0; j < termPositions.length; j++) {
+ final TermsEnum termsEnum = vector.iterator();
+ BytesRef text;
+ // nocommit find all places where I "blindly" added
+ // calls to .getAttribute(OffsetAttr): these are wrong.
+ // instead i must check .hasAttr first
+ DocsAndPositionsEnum dpEnum = null;
+ while((text = termsEnum.next()) != null) {
+ dpEnum = termsEnum.docsAndPositions(null, dpEnum);
+ dpEnum.nextDoc();
+ final int freq = dpEnum.freq();
+ final OffsetAttribute offsetAtt;
+ if (dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
+ offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
+ } else {
+ offsetAtt = null;
+ }
+ for (int j = 0; j < freq; j++) {
+ int pos = dpEnum.nextPosition();
Token token;
- if (offsets != null) {
- token = new Token(terms[i].utf8ToString(),
- offsets[j].getStartOffset(), offsets[j].getEndOffset());
+ if (offsetAtt != null) {
+ token = new Token(text.utf8ToString(),
+ offsetAtt.startOffset(),
+ offsetAtt.endOffset());
} else {
token = new Token();
- token.setEmpty().append(terms[i].utf8ToString());
+ token.setEmpty().append(text.utf8ToString());
}
// Yes - this is the position, not the increment! This is for
// sorting. This value
// will be corrected before use.
- token.setPositionIncrement(termPositions[j]);
+ token.setPositionIncrement(pos);
this.positionedTokens.add(token);
}
}
Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java Sat Nov 12 21:08:53 2011
@@ -21,10 +21,12 @@ import java.util.Collections;
import java.util.LinkedList;
import java.util.Set;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.index.TermPositionVector;
-import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
@@ -76,30 +78,55 @@ public class FieldTermStack {
// just return to make null snippet if un-matched fieldName specified when fieldMatch == true
if( termSet == null ) return;
- TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName );
- if( tfv == null ) return; // just return to make null snippets
- TermPositionVector tpv = null;
- try{
- tpv = (TermPositionVector)tfv;
+ final Fields vectors = reader.getTermVectors(docId);
+ if (vectors == null) {
+ // null snippet
+ return;
}
- catch( ClassCastException e ){
- return; // just return to make null snippets
+
+ final Terms vector = vectors.terms(fieldName);
+ if (vector == null) {
+ // null snippet
+ return;
}
-
+
final CharsRef spare = new CharsRef();
- for( BytesRef term : tpv.getTerms() ){
- if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue;
- int index = tpv.indexOf( term );
- TermVectorOffsetInfo[] tvois = tpv.getOffsets( index );
- if( tvois == null ) return; // just return to make null snippets
- int[] poss = tpv.getTermPositions( index );
- if( poss == null ) return; // just return to make null snippets
- for( int i = 0; i < tvois.length; i++ )
- termList.add( new TermInfo( term.utf8ToChars(spare).toString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) );
+ final TermsEnum termsEnum = vector.iterator();
+ DocsAndPositionsEnum dpEnum = null;
+ BytesRef text;
+ while ((text = termsEnum.next()) != null) {
+ final String term = text.utf8ToChars(spare).toString();
+ if (!termSet.contains(term)) {
+ continue;
+ }
+ dpEnum = termsEnum.docsAndPositions(null, dpEnum);
+ if (dpEnum == null) {
+ // null snippet
+ return;
+ }
+
+ if (!dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
+ // null snippet
+ return;
+ }
+ dpEnum.nextDoc();
+
+ final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
+
+ final int freq = dpEnum.freq();
+
+ for(int i = 0;i < freq;i++) {
+ final int pos = dpEnum.nextPosition();
+ if (pos == -1) {
+ // null snippet
+ return;
+ }
+ termList.add(new TermInfo(term, offsetAtt.startOffset(), offsetAtt.endOffset(), pos));
+ }
}
// sort by position
- Collections.sort( termList );
+ Collections.sort(termList);
}
/**
Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java (original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java Sat Nov 12 21:08:53 2011
@@ -34,7 +34,6 @@ import org.apache.lucene.index.CorruptIn
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
@@ -85,7 +84,7 @@ public class HighlighterPhraseTest exten
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
- .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
+ .getTokenStream(indexReader.getTermVector(
0, FIELD), false);
assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
TEXT), highlighter.getBestFragment(tokenStream, TEXT));
@@ -160,7 +159,7 @@ public class HighlighterPhraseTest exten
.nextSetBit(position + 1)) {
assertEquals(0, position);
final TokenStream tokenStream = TokenSources.getTokenStream(
- (TermPositionVector) indexReader.getTermFreqVector(position,
+ indexReader.getTermVector(position,
FIELD), false);
assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
TEXT), highlighter.getBestFragment(tokenStream, TEXT));
@@ -207,7 +206,7 @@ public class HighlighterPhraseTest exten
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
- .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
+ .getTokenStream(indexReader.getTermVector(
0, FIELD), false);
assertEquals(
highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
@@ -253,7 +252,7 @@ public class HighlighterPhraseTest exten
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources.getTokenStream(
- (TermPositionVector) indexReader.getTermFreqVector(0, FIELD), true);
+ indexReader.getTermVector(0, FIELD), true);
assertEquals("the fox <B>did</B> not <B>jump</B>", highlighter
.getBestFragment(tokenStream, TEXT));
} finally {
@@ -297,7 +296,7 @@ public class HighlighterPhraseTest exten
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
- .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
+ .getTokenStream(indexReader.getTermVector(
0, FIELD), false);
assertEquals(
highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java Sat Nov 12 21:08:53 2011
@@ -32,7 +32,6 @@ import org.apache.lucene.index.CorruptIn
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@@ -133,7 +132,7 @@ public class TokenSourcesTest extends Lu
new QueryScorer(query));
final TokenStream tokenStream = TokenSources
.getTokenStream(
- (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+ indexReader.getTermVector(0, FIELD),
false);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
@@ -182,7 +181,7 @@ public class TokenSourcesTest extends Lu
new QueryScorer(query));
final TokenStream tokenStream = TokenSources
.getTokenStream(
- (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+ indexReader.getTermVector(0, FIELD),
false);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
@@ -230,7 +229,7 @@ public class TokenSourcesTest extends Lu
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
.getTokenStream(
- (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+ indexReader.getTermVector(0, FIELD),
false);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
@@ -279,7 +278,7 @@ public class TokenSourcesTest extends Lu
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
.getTokenStream(
- (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+ indexReader.getTermVector(0, FIELD),
false);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
Modified: lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java (original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java Sat Nov 12 21:08:53 2011
@@ -1,7 +1,5 @@
package org.apache.lucene.store.instantiated;
-import org.apache.lucene.index.TermVectorOffsetInfo;
-
import java.util.Comparator;
/**
Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java Sat Nov 12 21:08:53 2011
@@ -804,9 +804,19 @@ public abstract class IndexReader implem
}
// nocommit javadoc
- abstract public Fields getTermVectors(int docNumber)
+ abstract public Fields getTermVectors(int docID)
throws IOException;
+ // nocommit javadoc
+ public Terms getTermVector(int docID, String field)
+ throws IOException {
+ Fields vectors = getTermVectors(docID);
+ if (vectors == null) {
+ return null;
+ }
+ return vectors.terms(field);
+ }
+
/**
* Returns <code>true</code> if an index exists at the specified directory.
* @param directory the directory to check for an index
Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java Sat Nov 12 21:08:53 2011
@@ -331,15 +331,19 @@ public class DefaultTermVectorsReader ex
private class TVTerms extends Terms {
private final int numTerms;
private final int docID;
+ private final long tvfFPStart;
public TVTerms(int docID, long tvfFP) throws IOException {
this.docID = docID;
tvf.seek(tvfFP);
numTerms = tvf.readVInt();
+ tvfFPStart = tvf.getFilePointer();
}
@Override
public TermsEnum iterator() throws IOException {
+ // nocommit -- to be "safe" we should clone tvf here...?
+ tvf.seek(tvfFPStart);
return new TVTermsEnum(docID, numTerms);
}
Modified: lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java (original)
+++ lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java Sat Nov 12 21:08:53 2011
@@ -15,14 +15,19 @@
*/
package org.apache.lucene.queries.mlt;
+import java.io.*;
+import java.util.*;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.*;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
@@ -30,9 +35,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.PriorityQueue;
-import java.io.*;
-import java.util.*;
-
/**
* Generate "more like this" similarity queries.
@@ -701,7 +703,13 @@ public final class MoreLikeThis {
public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
Map<String, Int> termFreqMap = new HashMap<String, Int>();
for (String fieldName : fieldNames) {
- TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);
+ final Fields vectors = ir.getTermVectors(docNum);
+ final Terms vector;
+ if (vectors != null) {
+ vector = vectors.terms(fieldName);
+ } else {
+ vector = null;
+ }
// field does not store term vector info
if (vector == null) {
@@ -716,7 +724,6 @@ public final class MoreLikeThis {
} else {
addTermFrequencies(termFreqMap, vector);
}
-
}
return createQueue(termFreqMap);
@@ -728,24 +735,25 @@ public final class MoreLikeThis {
* @param termFreqMap a Map of terms and their frequencies
* @param vector List of terms and their frequencies for a doc/field
*/
- private void addTermFrequencies(Map<String, Int> termFreqMap, TermFreqVector vector) {
- BytesRef[] terms = vector.getTerms();
- int freqs[] = vector.getTermFrequencies();
+ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
+ final TermsEnum termsEnum = vector.iterator();
final CharsRef spare = new CharsRef();
- for (int j = 0; j < terms.length; j++) {
- final String term = terms[j].utf8ToChars(spare).toString();
-
+ BytesRef text;
+ while((text = termsEnum.next()) != null) {
+ final String term = text.utf8ToChars(spare).toString();
if (isNoiseWord(term)) {
continue;
}
+ final int freq = (int) termsEnum.totalTermFreq();
+
// increment frequency
Int cnt = termFreqMap.get(term);
if (cnt == null) {
cnt = new Int();
termFreqMap.put(term, cnt);
- cnt.x = freqs[j];
+ cnt.x = freq;
} else {
- cnt.x += freqs[j];
+ cnt.x += freq;
}
}
}