You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by bu...@apache.org on 2010/07/22 21:34:52 UTC
svn commit: r966819 [7/20] - in /lucene/dev/branches/realtime_search: ./
lucene/ lucene/backwards/ lucene/contrib/ lucene/contrib/benchmark/conf/
lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/
lucene/contrib/benchmark/src/j...
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java Thu Jul 22 19:34:35 2010
@@ -17,7 +17,7 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
@@ -33,7 +33,9 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
/**
* This class is generated by JavaCC. The most important method is
@@ -107,6 +109,8 @@ import org.apache.lucene.util.Version;
* <ul>
* <li> As of 2.9, {@link #setEnablePositionIncrements} is true by
* default.
+ * <li> As of 3.1, {@link #setAutoGeneratePhraseQueries} is false by
+ * default.
* </ul>
*/
public class QueryParser implements QueryParserConstants {
@@ -150,6 +154,19 @@ public class QueryParser implements Quer
// for use when constructing RangeQuerys.
Collator rangeCollator = null;
+ /** @deprecated remove when getFieldQuery is removed */
+ private static final VirtualMethod<QueryParser> getFieldQueryMethod =
+ new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class);
+ /** @deprecated remove when getFieldQuery is removed */
+ private static final VirtualMethod<QueryParser> getFieldQueryWithQuotedMethod =
+ new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class, boolean.class);
+ /** @deprecated remove when getFieldQuery is removed */
+ private final boolean hasNewAPI =
+ VirtualMethod.compareImplementationDistance(getClass(),
+ getFieldQueryWithQuotedMethod, getFieldQueryMethod) >= 0; // its ok for both to be overridden
+
+ private boolean autoGeneratePhraseQueries;
+
/** The default operator for parsing queries.
* Use {@link QueryParser#setDefaultOperator} to change it.
*/
@@ -169,6 +186,11 @@ public class QueryParser implements Quer
} else {
enablePositionIncrements = false;
}
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ setAutoGeneratePhraseQueries(false);
+ } else {
+ setAutoGeneratePhraseQueries(true);
+ }
}
/** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
@@ -214,6 +236,29 @@ public class QueryParser implements Quer
return field;
}
+ /**
+ * @see #setAutoGeneratePhraseQueries(boolean)
+ */
+ public final boolean getAutoGeneratePhraseQueries() {
+ return autoGeneratePhraseQueries;
+ }
+
+ /**
+ * Set to true if phrase queries will be automatically generated
+ * when the analyzer returns more than one term from whitespace
+ * delimited text.
+ * NOTE: this behavior may not be suitable for all languages.
+ * <p>
+ * Set to false if phrase queries should only be generated when
+ * surrounded by double quotes.
+ */
+ public final void setAutoGeneratePhraseQueries(boolean value) {
+ if (value == false && !hasNewAPI)
+ throw new IllegalArgumentException("You must implement the new API: getFieldQuery(String,String,boolean)"
+ + " to use setAutoGeneratePhraseQueries(false)");
+ this.autoGeneratePhraseQueries = value;
+ }
+
/**
* Get the minimal similarity for fuzzy queries.
*/
@@ -506,11 +551,19 @@ public class QueryParser implements Quer
throw new RuntimeException("Clause cannot be both required and prohibited");
}
+ /**
+ * @deprecated Use {@link #getFieldQuery(String,String,boolean)} instead.
+ */
+ @Deprecated
+ protected Query getFieldQuery(String field, String queryText) throws ParseException {
+ // treat the text as if it was quoted, to drive phrase logic with old versions.
+ return getFieldQuery(field, queryText, true);
+ }
/**
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFieldQuery(String field, String queryText) throws ParseException {
+ protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException {
// Use the analyzer to get all the tokens, and then build a TermQuery,
// PhraseQuery, or nothing based on the term count
@@ -522,7 +575,7 @@ public class QueryParser implements Quer
source = analyzer.tokenStream(field, new StringReader(queryText));
}
CachingTokenFilter buffer = new CachingTokenFilter(source);
- CharTermAttribute termAtt = null;
+ TermToBytesRefAttribute termAtt = null;
PositionIncrementAttribute posIncrAtt = null;
int numTokens = 0;
@@ -534,8 +587,8 @@ public class QueryParser implements Quer
// success==false if we hit an exception
}
if (success) {
- if (buffer.hasAttribute(CharTermAttribute.class)) {
- termAtt = buffer.getAttribute(CharTermAttribute.class);
+ if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
+ termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
}
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
@@ -577,33 +630,37 @@ public class QueryParser implements Quer
if (numTokens == 0)
return null;
else if (numTokens == 1) {
- String term = null;
+ BytesRef term = new BytesRef();
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
- term = termAtt.toString();
+ termAtt.toBytesRef(term);
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
return newTermQuery(new Term(field, term));
} else {
- if (severalTokensAtSamePosition) {
- if (positionCount == 1) {
+ if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
+ if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
// no phrase query:
- BooleanQuery q = newBooleanQuery(true);
+ BooleanQuery q = newBooleanQuery(positionCount == 1);
+
+ BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR ?
+ BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
+
for (int i = 0; i < numTokens; i++) {
- String term = null;
+ BytesRef term = new BytesRef();
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
- term = termAtt.toString();
+ termAtt.toBytesRef(term);
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
Query currentQuery = newTermQuery(
new Term(field, term));
- q.add(currentQuery, BooleanClause.Occur.SHOULD);
+ q.add(currentQuery, occur);
}
return q;
}
@@ -614,12 +671,12 @@ public class QueryParser implements Quer
List<Term> multiTerms = new ArrayList<Term>();
int position = -1;
for (int i = 0; i < numTokens; i++) {
- String term = null;
+ BytesRef term = new BytesRef();
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
- term = termAtt.toString();
+ termAtt.toBytesRef(term);
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
@@ -653,13 +710,13 @@ public class QueryParser implements Quer
for (int i = 0; i < numTokens; i++) {
- String term = null;
+ BytesRef term = new BytesRef();
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
- term = termAtt.toString();
+ termAtt.toBytesRef(term);
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
@@ -682,7 +739,7 @@ public class QueryParser implements Quer
/**
- * Base implementation delegates to {@link #getFieldQuery(String,String)}.
+ * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}.
* This method may be overridden, for example, to return
* a SpanNearQuery instead of a PhraseQuery.
*
@@ -690,7 +747,7 @@ public class QueryParser implements Quer
*/
protected Query getFieldQuery(String field, String queryText, int slop)
throws ParseException {
- Query query = getFieldQuery(field, queryText);
+ Query query = hasNewAPI ? getFieldQuery(field, queryText, true) : getFieldQuery(field, queryText);
if (query instanceof PhraseQuery) {
((PhraseQuery) query).setSlop(slop);
@@ -1343,7 +1400,7 @@ public class QueryParser implements Quer
}
q = getFuzzyQuery(field, termImage,fms);
} else {
- q = getFieldQuery(field, termImage);
+ q = hasNewAPI ? getFieldQuery(field, termImage, false) : getFieldQuery(field, termImage);
}
break;
case RANGEIN_START:
@@ -1512,12 +1569,6 @@ public class QueryParser implements Quer
finally { jj_save(0, xla); }
}
- private boolean jj_3R_2() {
- if (jj_scan_token(TERM)) return true;
- if (jj_scan_token(COLON)) return true;
- return false;
- }
-
private boolean jj_3_1() {
Token xsp;
xsp = jj_scanpos;
@@ -1534,6 +1585,12 @@ public class QueryParser implements Quer
return false;
}
+ private boolean jj_3R_2() {
+ if (jj_scan_token(TERM)) return true;
+ if (jj_scan_token(COLON)) return true;
+ return false;
+ }
+
/** Generated Token Manager. */
public QueryParserTokenManager token_source;
/** Current token. */
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj Thu Jul 22 19:34:35 2010
@@ -41,7 +41,7 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
@@ -57,7 +57,9 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
/**
* This class is generated by JavaCC. The most important method is
@@ -131,6 +133,8 @@ import org.apache.lucene.util.Version;
* <ul>
* <li> As of 2.9, {@link #setEnablePositionIncrements} is true by
* default.
+ * <li> As of 3.1, {@link #setAutoGeneratePhraseQueries} is false by
+ * default.
* </ul>
*/
public class QueryParser {
@@ -174,6 +178,19 @@ public class QueryParser {
// for use when constructing RangeQuerys.
Collator rangeCollator = null;
+ /** @deprecated remove when getFieldQuery is removed */
+ private static final VirtualMethod<QueryParser> getFieldQueryMethod =
+ new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class);
+ /** @deprecated remove when getFieldQuery is removed */
+ private static final VirtualMethod<QueryParser> getFieldQueryWithQuotedMethod =
+ new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class, boolean.class);
+ /** @deprecated remove when getFieldQuery is removed */
+ private final boolean hasNewAPI =
+ VirtualMethod.compareImplementationDistance(getClass(),
+ getFieldQueryWithQuotedMethod, getFieldQueryMethod) >= 0; // it's OK for both to be overridden
+
+ private boolean autoGeneratePhraseQueries;
+
/** The default operator for parsing queries.
* Use {@link QueryParser#setDefaultOperator} to change it.
*/
@@ -193,6 +210,11 @@ public class QueryParser {
} else {
enablePositionIncrements = false;
}
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ setAutoGeneratePhraseQueries(false);
+ } else {
+ setAutoGeneratePhraseQueries(true);
+ }
}
/** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
@@ -238,6 +260,29 @@ public class QueryParser {
return field;
}
+ /**
+ * @see #setAutoGeneratePhraseQueries(boolean)
+ */
+ public final boolean getAutoGeneratePhraseQueries() {
+ return autoGeneratePhraseQueries;
+ }
+
+ /**
+ * Set to true if phrase queries will be automatically generated
+ * when the analyzer returns more than one term from whitespace
+ * delimited text.
+ * NOTE: this behavior may not be suitable for all languages.
+ * <p>
+ * Set to false if phrase queries should only be generated when
+ * surrounded by double quotes.
+ */
+ public final void setAutoGeneratePhraseQueries(boolean value) {
+ if (value == false && !hasNewAPI)
+ throw new IllegalArgumentException("You must implement the new API: getFieldQuery(String,String,boolean)"
+ + " to use setAutoGeneratePhraseQueries(false)");
+ this.autoGeneratePhraseQueries = value;
+ }
+
/**
* Get the minimal similarity for fuzzy queries.
*/
@@ -530,11 +575,19 @@ public class QueryParser {
throw new RuntimeException("Clause cannot be both required and prohibited");
}
+ /**
+ * @deprecated Use {@link #getFieldQuery(String,String,boolean)} instead.
+ */
+ @Deprecated
+ protected Query getFieldQuery(String field, String queryText) throws ParseException {
+ // treat the text as if it was quoted, to drive phrase logic with old versions.
+ return getFieldQuery(field, queryText, true);
+ }
/**
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFieldQuery(String field, String queryText) throws ParseException {
+ protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException {
// Use the analyzer to get all the tokens, and then build a TermQuery,
// PhraseQuery, or nothing based on the term count
@@ -546,7 +599,7 @@ public class QueryParser {
source = analyzer.tokenStream(field, new StringReader(queryText));
}
CachingTokenFilter buffer = new CachingTokenFilter(source);
- CharTermAttribute termAtt = null;
+ TermToBytesRefAttribute termAtt = null;
PositionIncrementAttribute posIncrAtt = null;
int numTokens = 0;
@@ -558,8 +611,8 @@ public class QueryParser {
// success==false if we hit an exception
}
if (success) {
- if (buffer.hasAttribute(CharTermAttribute.class)) {
- termAtt = buffer.getAttribute(CharTermAttribute.class);
+ if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
+ termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
}
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
@@ -601,33 +654,37 @@ public class QueryParser {
if (numTokens == 0)
return null;
else if (numTokens == 1) {
- String term = null;
+ BytesRef term = new BytesRef();
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
- term = termAtt.toString();
+ termAtt.toBytesRef(term);
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
return newTermQuery(new Term(field, term));
} else {
- if (severalTokensAtSamePosition) {
- if (positionCount == 1) {
+ if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
+ if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
// no phrase query:
- BooleanQuery q = newBooleanQuery(true);
+ BooleanQuery q = newBooleanQuery(positionCount == 1);
+
+ BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR ?
+ BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
+
for (int i = 0; i < numTokens; i++) {
- String term = null;
+ BytesRef term = new BytesRef();
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
- term = termAtt.toString();
+ termAtt.toBytesRef(term);
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
Query currentQuery = newTermQuery(
new Term(field, term));
- q.add(currentQuery, BooleanClause.Occur.SHOULD);
+ q.add(currentQuery, occur);
}
return q;
}
@@ -638,12 +695,12 @@ public class QueryParser {
List<Term> multiTerms = new ArrayList<Term>();
int position = -1;
for (int i = 0; i < numTokens; i++) {
- String term = null;
+ BytesRef term = new BytesRef();
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
- term = termAtt.toString();
+ termAtt.toBytesRef(term);
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
@@ -677,13 +734,13 @@ public class QueryParser {
for (int i = 0; i < numTokens; i++) {
- String term = null;
+ BytesRef term = new BytesRef();
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
- term = termAtt.toString();
+ termAtt.toBytesRef(term);
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
@@ -706,7 +763,7 @@ public class QueryParser {
/**
- * Base implementation delegates to {@link #getFieldQuery(String,String)}.
+ * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}.
* This method may be overridden, for example, to return
* a SpanNearQuery instead of a PhraseQuery.
*
@@ -714,7 +771,7 @@ public class QueryParser {
*/
protected Query getFieldQuery(String field, String queryText, int slop)
throws ParseException {
- Query query = getFieldQuery(field, queryText);
+ Query query = hasNewAPI ? getFieldQuery(field, queryText, true) : getFieldQuery(field, queryText);
if (query instanceof PhraseQuery) {
((PhraseQuery) query).setSlop(slop);
@@ -1314,7 +1371,7 @@ Query Term(String field) : {
}
q = getFuzzyQuery(field, termImage,fms);
} else {
- q = getFieldQuery(field, termImage);
+ q = hasNewAPI ? getFieldQuery(field, termImage, false) : getFieldQuery(field, termImage);
}
}
| ( <RANGEIN_START> ( goop1=<RANGEIN_GOOP>|goop1=<RANGEIN_QUOTED> )
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java Thu Jul 22 19:34:35 2010
@@ -15,7 +15,7 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
@@ -31,7 +31,9 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
/** Token Manager. */
public class QueryParserTokenManager implements QueryParserConstants
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java Thu Jul 22 19:34:35 2010
@@ -103,7 +103,7 @@ public class AutomatonTermsEnum extends
// build a cache of sorted transitions for every state
allTransitions = new Transition[runAutomaton.getSize()][];
for (State state : this.automaton.getNumberedStates()) {
- state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order);
+ state.sortTransitions(Transition.CompareByMinMaxThenDest);
state.trimTransitionsArray();
allTransitions[state.getNumber()] = state.transitionsArray;
}
@@ -158,11 +158,7 @@ public class AutomatonTermsEnum extends
// seek to the next possible string;
if (nextString()) {
// reposition
-
- // FIXME: this is really bad to turn off
- // but it cannot work correctly until terms are in utf8 order.
- linear = false;
-
+
if (linear)
setLinear(infinitePosition);
return seekBytesRef;
@@ -188,15 +184,15 @@ public class AutomatonTermsEnum extends
}
for (int i = 0; i < allTransitions[state].length; i++) {
Transition t = allTransitions[state][i];
- if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 &&
- compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) {
+ if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) &&
+ (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
maxInterval = t.getMax();
break;
}
}
- // 0xef terms don't get the optimization... not worth the trouble.
- if (maxInterval != 0xef)
- maxInterval = incrementUTF16(maxInterval);
+ // 0xff terms don't get the optimization... not worth the trouble.
+ if (maxInterval != 0xff)
+ maxInterval = incrementUTF8(maxInterval);
int length = position + 1; /* position + maxTransition */
if (linearUpperBound.bytes.length < length)
linearUpperBound.bytes = new byte[length];
@@ -281,7 +277,7 @@ public class AutomatonTermsEnum extends
// if the next character is U+FFFF and is not part of the useful portion,
// then by definition it puts us in a reject state, and therefore this
// path is dead. there cannot be any higher transitions. backtrack.
- c = incrementUTF16(c);
+ c = incrementUTF8(c);
if (c == -1)
return false;
}
@@ -295,8 +291,8 @@ public class AutomatonTermsEnum extends
for (int i = 0; i < transitions.length; i++) {
Transition transition = transitions[i];
- if (compareToUTF16(transition.getMax(), c) >= 0) {
- int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin();
+ if (transition.getMax() >= c) {
+ int nextChar = Math.max(c, transition.getMin());
// append either the next sequential char, or the minimum transition
seekBytesRef.grow(seekBytesRef.length + 1);
seekBytesRef.length++;
@@ -342,9 +338,9 @@ public class AutomatonTermsEnum extends
private boolean backtrack(int position) {
while (position > 0) {
int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
- // if a character is 0xef its a dead-end too,
- // because there is no higher character in UTF-16 sort order.
- nextChar = incrementUTF16(nextChar);
+ // if a character is 0xff it's a dead-end too,
+ // because there is no higher character in UTF-8 sort order.
+ nextChar = incrementUTF8(nextChar);
if (nextChar != -1) {
seekBytesRef.bytes[position - 1] = (byte) nextChar;
seekBytesRef.length = position;
@@ -355,34 +351,11 @@ public class AutomatonTermsEnum extends
return false; /* all solutions exhausted */
}
- /* return the next utf8 byte in utf16 order, or -1 if exhausted */
- private final int incrementUTF16(int utf8) {
+ /* return the next utf8 byte in utf8 order, or -1 if exhausted */
+ private final int incrementUTF8(int utf8) {
switch(utf8) {
- case 0xed: return 0xf0;
- case 0xfd: return 0xee;
- case 0xee: return 0xef;
- case 0xef: return -1;
+ case 0xff: return -1;
default: return utf8 + 1;
}
}
-
- int compareToUTF16(int aByte, int bByte) {
- if (aByte != bByte) {
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
- }
- return 0;
- }
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java Thu Jul 22 19:34:35 2010
@@ -18,39 +18,317 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
+import java.util.Arrays;
+
import org.apache.lucene.index.*;
-final class ExactPhraseScorer extends PhraseScorer {
+final class ExactPhraseScorer extends Scorer {
+ private final Weight weight;
+ private final byte[] norms;
+ private final float value;
+
+ private static final int SCORE_CACHE_SIZE = 32;
+ private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
+
+ private final int endMinus1;
+
+ private final static int CHUNK = 4096;
+
+ private int gen;
+ private final int[] counts = new int[CHUNK];
+ private final int[] gens = new int[CHUNK];
+
+ boolean noDocs;
- ExactPhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets,
- Similarity similarity, byte[] norms) {
- super(weight, postings, offsets, similarity, norms);
+ private final static class ChunkState {
+ final DocsAndPositionsEnum posEnum;
+ final int offset;
+ final boolean useAdvance;
+ int posUpto;
+ int posLimit;
+ int pos;
+ int lastPos;
+
+ public ChunkState(DocsAndPositionsEnum posEnum, int offset, boolean useAdvance) {
+ this.posEnum = posEnum;
+ this.offset = offset;
+ this.useAdvance = useAdvance;
+ }
+ }
+
+ private final ChunkState[] chunkStates;
+
+ private int docID = -1;
+ private int freq;
+
+ ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
+ Similarity similarity, byte[] norms) throws IOException {
+ super(similarity);
+ this.weight = weight;
+ this.norms = norms;
+ this.value = weight.getValue();
+
+ chunkStates = new ChunkState[postings.length];
+
+ endMinus1 = postings.length-1;
+
+ for(int i=0;i<postings.length;i++) {
+
+ // Coarse optimization: advance(target) is fairly
+ // costly, so, if the relative freq of the 2nd
+ // rarest term is not that much (> 1/5th) rarer than
+ // the first term, then we just use .nextDoc() when
+ // ANDing. This buys ~15% gain for phrases where
+ // freq of rarest 2 terms is close:
+ final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
+ chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
+ if (i > 0 && postings[i].postings.nextDoc() == DocsEnum.NO_MORE_DOCS) {
+ noDocs = true;
+ return;
+ }
+ }
+
+ for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
+ scoreCache[i] = getSimilarity().tf((float) i) * value;
+ }
}
@Override
- protected final float phraseFreq() throws IOException {
- // sort list with pq
- pq.clear();
- for (PhrasePositions pp = first; pp != null; pp = pp.next) {
- pp.firstPosition();
- pq.add(pp); // build pq from list
- }
- pqToList(); // rebuild list from pq
-
- // for counting how many times the exact phrase is found in current document,
- // just count how many times all PhrasePosition's have exactly the same position.
- int freq = 0;
- do { // find position w/ all terms
- while (first.position < last.position) { // scan forward in first
- do {
- if (!first.nextPosition())
- return freq;
- } while (first.position < last.position);
- firstToLast();
- }
- freq++; // all equal: a match
- } while (last.nextPosition());
-
+ public int nextDoc() throws IOException {
+ while(true) {
+
+ // first (rarest) term
+ final int doc = chunkStates[0].posEnum.nextDoc();
+ if (doc == DocsEnum.NO_MORE_DOCS) {
+ docID = doc;
+ return doc;
+ }
+
+ // not-first terms
+ int i = 1;
+ while(i < chunkStates.length) {
+ final ChunkState cs = chunkStates[i];
+ int doc2 = cs.posEnum.docID();
+ if (cs.useAdvance) {
+ if (doc2 < doc) {
+ doc2 = cs.posEnum.advance(doc);
+ }
+ } else {
+ int iter = 0;
+ while(doc2 < doc) {
+ // safety net -- fallback to .advance if we've
+ // done too many .nextDocs
+ if (++iter == 50) {
+ doc2 = cs.posEnum.advance(doc);
+ break;
+ } else {
+ doc2 = cs.posEnum.nextDoc();
+ }
+ }
+ }
+ if (doc2 > doc) {
+ break;
+ }
+ i++;
+ }
+
+ if (i == chunkStates.length) {
+ // this doc has all the terms -- now test whether
+ // phrase occurs
+ docID = doc;
+
+ freq = phraseFreq();
+ if (freq != 0) {
+ return docID;
+ }
+ }
+ }
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+
+ // first term
+ int doc = chunkStates[0].posEnum.advance(target);
+ if (doc == DocsEnum.NO_MORE_DOCS) {
+ docID = DocsEnum.NO_MORE_DOCS;
+ return doc;
+ }
+
+ while(true) {
+
+ // not-first terms
+ int i = 1;
+ while(i < chunkStates.length) {
+ int doc2 = chunkStates[i].posEnum.docID();
+ if (doc2 < doc) {
+ doc2 = chunkStates[i].posEnum.advance(doc);
+ }
+ if (doc2 > doc) {
+ break;
+ }
+ i++;
+ }
+
+ if (i == chunkStates.length) {
+ // this doc has all the terms -- now test whether
+ // phrase occurs
+ docID = doc;
+ freq = phraseFreq();
+ if (freq != 0) {
+ return docID;
+ }
+ }
+
+ doc = chunkStates[0].posEnum.nextDoc();
+ if (doc == DocsEnum.NO_MORE_DOCS) {
+ docID = doc;
+ return doc;
+ }
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "ExactPhraseScorer(" + weight + ")";
+ }
+
+ // used by MultiPhraseQuery
+ float currentFreq() {
+ return freq;
+ }
+
+ @Override
+ public int docID() {
+ return docID;
+ }
+
+ @Override
+ public float score() throws IOException {
+ final float raw; // raw score
+ if (freq < SCORE_CACHE_SIZE) {
+ raw = scoreCache[freq];
+ } else {
+ raw = getSimilarity().tf((float) freq) * value;
+ }
+ return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize
+ }
+
+ private int phraseFreq() throws IOException {
+
+ freq = 0;
+
+ // init chunks
+ for(int i=0;i<chunkStates.length;i++) {
+ final ChunkState cs = chunkStates[i];
+ cs.posLimit = cs.posEnum.freq();
+ cs.pos = cs.offset + cs.posEnum.nextPosition();
+ cs.posUpto = 1;
+ cs.lastPos = -1;
+ }
+
+ int chunkStart = 0;
+ int chunkEnd = CHUNK;
+
+ // process chunk by chunk
+ boolean end = false;
+
+ // TODO: we could fold in chunkStart into offset and
+ // save one subtract per pos incr
+
+ while(!end) {
+
+ gen++;
+
+ if (gen == 0) {
+ // wraparound
+ Arrays.fill(gens, 0);
+ gen++;
+ }
+
+ // first term
+ {
+ final ChunkState cs = chunkStates[0];
+ while(cs.pos < chunkEnd) {
+ if (cs.pos > cs.lastPos) {
+ cs.lastPos = cs.pos;
+ final int posIndex = cs.pos - chunkStart;
+ counts[posIndex] = 1;
+ assert gens[posIndex] != gen;
+ gens[posIndex] = gen;
+ }
+
+ if (cs.posUpto == cs.posLimit) {
+ end = true;
+ break;
+ }
+ cs.posUpto++;
+ cs.pos = cs.offset + cs.posEnum.nextPosition();
+ }
+ }
+
+ // middle terms
+ boolean any = true;
+ for(int t=1;t<endMinus1;t++) {
+ final ChunkState cs = chunkStates[t];
+ any = false;
+ while(cs.pos < chunkEnd) {
+ if (cs.pos > cs.lastPos) {
+ cs.lastPos = cs.pos;
+ final int posIndex = cs.pos - chunkStart;
+ if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
+ // viable
+ counts[posIndex]++;
+ any = true;
+ }
+ }
+
+ if (cs.posUpto == cs.posLimit) {
+ end = true;
+ break;
+ }
+ cs.posUpto++;
+ cs.pos = cs.offset + cs.posEnum.nextPosition();
+ }
+
+ if (!any) {
+ break;
+ }
+ }
+
+ if (!any) {
+ // petered out for this chunk
+ chunkStart += CHUNK;
+ chunkEnd += CHUNK;
+ continue;
+ }
+
+ // last term
+
+ {
+ final ChunkState cs = chunkStates[endMinus1];
+ while(cs.pos < chunkEnd) {
+ if (cs.pos > cs.lastPos) {
+ cs.lastPos = cs.pos;
+ final int posIndex = cs.pos - chunkStart;
+ if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
+ freq++;
+ }
+ }
+
+ if (cs.posUpto == cs.posLimit) {
+ end = true;
+ break;
+ }
+ cs.posUpto++;
+ cs.pos = cs.offset + cs.posEnum.nextPosition();
+ }
+ }
+
+ chunkStart += CHUNK;
+ chunkEnd += CHUNK;
+ }
+
return freq;
}
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCache.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCache.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCache.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCache.java Thu Jul 22 19:34:35 2010
@@ -18,11 +18,13 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.document.NumericField; // for javadocs
import org.apache.lucene.analysis.NumericTokenStream; // for javadocs
+import org.apache.lucene.util.packed.PackedInts;
import java.io.IOException;
import java.io.Serializable;
@@ -530,6 +532,12 @@ public interface FieldCache {
/** Number of documents */
public abstract int size();
+
+ /** Returns a TermsEnum that can iterate over the values in this index entry */
+ public abstract TermsEnum getTermsEnum();
+
+ /** @lucene.internal */
+ public abstract PackedInts.Reader getDocToOrd();
}
/** Checks the internal cache for an appropriate entry, and if none
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java Thu Jul 22 19:34:35 2010
@@ -19,17 +19,9 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.WeakHashMap;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.TermsEnum;
+import java.util.*;
+
+import org.apache.lucene.index.*;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.GrowableWriter;
@@ -642,7 +634,7 @@ class FieldCacheImpl implements FieldCac
}
}
- private static class DocTermsIndexImpl extends DocTermsIndex {
+ public static class DocTermsIndexImpl extends DocTermsIndex {
private final PagedBytes.Reader bytes;
private final PackedInts.Reader termOrdToBytesOffset;
private final PackedInts.Reader docToTermOrd;
@@ -656,6 +648,11 @@ class FieldCacheImpl implements FieldCac
}
@Override
+ public PackedInts.Reader getDocToOrd() {
+ return docToTermOrd;
+ }
+
+ @Override
public int numOrd() {
return numOrd;
}
@@ -674,6 +671,105 @@ class FieldCacheImpl implements FieldCac
public BytesRef lookup(int ord, BytesRef ret) {
return bytes.fillUsingLengthPrefix(ret, termOrdToBytesOffset.get(ord));
}
+
+ @Override
+ public TermsEnum getTermsEnum() {
+ return this.new DocTermsIndexEnum();
+ }
+
+ class DocTermsIndexEnum extends TermsEnum {
+ int currentOrd;
+ int currentBlockNumber;
+ int end; // end position in the current block
+ final byte[][] blocks;
+ final int[] blockEnds;
+
+ final BytesRef term = new BytesRef();
+
+ public DocTermsIndexEnum() {
+ currentOrd = 0;
+ currentBlockNumber = 0;
+ blocks = bytes.getBlocks();
+ blockEnds = bytes.getBlockEnds();
+ currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get(0));
+ end = blockEnds[currentBlockNumber];
+ }
+
+ @Override
+ public SeekStatus seek(BytesRef text, boolean useCache) throws IOException {
+ // TODO - we can support with binary search
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public SeekStatus seek(long ord) throws IOException {
+ assert(ord >= 0 && ord <= numOrd);
+ // TODO: if gap is small, could iterate from current position? Or let user decide that?
+ currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get((int)ord));
+ end = blockEnds[currentBlockNumber];
+ currentOrd = (int)ord;
+ return SeekStatus.FOUND;
+ }
+
+ @Override
+ public BytesRef next() throws IOException {
+ int start = term.offset + term.length;
+ if (start >= end) {
+ // switch byte blocks
+ if (currentBlockNumber +1 >= blocks.length) {
+ return null;
+ }
+ currentBlockNumber++;
+ term.bytes = blocks[currentBlockNumber];
+ end = blockEnds[currentBlockNumber];
+ start = 0;
+ if (end<=0) return null; // special case of empty last array
+ }
+
+ currentOrd++;
+
+ byte[] block = term.bytes;
+ if ((block[start] & 128) == 0) {
+ term.length = block[start];
+ term.offset = start+1;
+ } else {
+ term.length = (((int) (block[start] & 0x7f)) << 8) | (block[1+start] & 0xff);
+ term.offset = start+2;
+ }
+
+ return term;
+ }
+
+ @Override
+ public BytesRef term() throws IOException {
+ return term;
+ }
+
+ @Override
+ public long ord() throws IOException {
+ return currentOrd;
+ }
+
+ @Override
+ public int docFreq() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+ }
}
private static boolean DEFAULT_FASTER_BUT_MORE_RAM = true;
@@ -706,6 +802,14 @@ class FieldCacheImpl implements FieldCac
int startTermsBPV;
int startNumUniqueTerms;
+ int maxDoc = reader.maxDoc();
+ final int termCountHardLimit;
+ if (maxDoc == Integer.MAX_VALUE) {
+ termCountHardLimit = Integer.MAX_VALUE;
+ } else {
+ termCountHardLimit = maxDoc+1;
+ }
+
if (terms != null) {
// Try for coarse estimate for number of bits; this
// should be an underestimate most of the time, which
@@ -717,11 +821,17 @@ class FieldCacheImpl implements FieldCac
numUniqueTerms = -1;
}
if (numUniqueTerms != -1) {
+
+ if (numUniqueTerms > termCountHardLimit) {
+ // app is misusing the API (there is more than
+ // one term per doc); in this case we make best
+ // effort to load what we can (see LUCENE-2142)
+ numUniqueTerms = termCountHardLimit;
+ }
+
startBytesBPV = PackedInts.bitsRequired(numUniqueTerms*4);
startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
- if (numUniqueTerms > Integer.MAX_VALUE-1) {
- throw new IllegalStateException("this field has too many (" + numUniqueTerms + ") unique terms");
- }
+
startNumUniqueTerms = (int) numUniqueTerms;
} else {
startBytesBPV = 1;
@@ -751,6 +861,10 @@ class FieldCacheImpl implements FieldCac
if (term == null) {
break;
}
+ if (termOrd >= termCountHardLimit) {
+ break;
+ }
+
if (termOrd == termOrdToBytesOffset.size()) {
// NOTE: this code only runs if the incoming
// reader impl doesn't implement
@@ -775,7 +889,7 @@ class FieldCacheImpl implements FieldCac
}
// maybe an int-only impl?
- return new DocTermsIndexImpl(bytes.freeze(), termOrdToBytesOffset.getMutable(), docToTermOrd.getMutable(), termOrd);
+ return new DocTermsIndexImpl(bytes.freeze(true), termOrdToBytesOffset.getMutable(), docToTermOrd.getMutable(), termOrd);
}
}
@@ -829,6 +943,8 @@ class FieldCacheImpl implements FieldCac
final boolean fasterButMoreRAM = ((Boolean) entryKey.custom).booleanValue();
+ final int termCountHardLimit = reader.maxDoc();
+
// Holds the actual term data, expanded.
final PagedBytes bytes = new PagedBytes(15);
@@ -845,6 +961,9 @@ class FieldCacheImpl implements FieldCac
numUniqueTerms = -1;
}
if (numUniqueTerms != -1) {
+ if (numUniqueTerms > termCountHardLimit) {
+ numUniqueTerms = termCountHardLimit;
+ }
startBPV = PackedInts.bitsRequired(numUniqueTerms*4);
} else {
startBPV = 1;
@@ -859,10 +978,18 @@ class FieldCacheImpl implements FieldCac
bytes.copyUsingLengthPrefix(new BytesRef());
if (terms != null) {
+ int termCount = 0;
final TermsEnum termsEnum = terms.iterator();
final Bits delDocs = MultiFields.getDeletedDocs(reader);
DocsEnum docs = null;
while(true) {
+ if (termCount++ == termCountHardLimit) {
+ // app is misusing the API (there is more than
+ // one term per doc); in this case we make best
+ // effort to load what we can (see LUCENE-2142)
+ break;
+ }
+
final BytesRef term = termsEnum.next();
if (term == null) {
break;
@@ -880,7 +1007,7 @@ class FieldCacheImpl implements FieldCac
}
// maybe an int-only impl?
- return new DocTermsImpl(bytes.freeze(), docToOffset.getMutable());
+ return new DocTermsImpl(bytes.freeze(true), docToOffset.getMutable());
}
}
private volatile PrintStream infoStream;
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java Thu Jul 22 19:34:35 2010
@@ -22,7 +22,6 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.index.TermDocs; // for javadocs
/**
* A {@link Filter} that only accepts documents whose single
@@ -70,7 +69,7 @@ import org.apache.lucene.index.TermDocs;
*
* In contrast, TermsFilter builds up an {@link OpenBitSet},
* keyed by docID, every time it's created, by enumerating
- * through all matching docs using {@link TermDocs} to seek
+ * through all matching docs using {@link DocsEnum} to seek
* and scan through each term's docID list. While there is
* no linear scan of all docIDs, besides the allocation of
* the underlying array in the {@link OpenBitSet}, this
@@ -96,13 +95,20 @@ import org.apache.lucene.index.TermDocs;
public class FieldCacheTermsFilter extends Filter {
private String field;
- private String[] terms;
+ private BytesRef[] terms;
- public FieldCacheTermsFilter(String field, String... terms) {
+ public FieldCacheTermsFilter(String field, BytesRef... terms) {
this.field = field;
this.terms = terms;
}
+ public FieldCacheTermsFilter(String field, String... terms) {
+ this.field = field;
+ this.terms = new BytesRef[terms.length];
+ for (int i = 0; i < terms.length; i++)
+ this.terms[i] = new BytesRef(terms[i]);
+ }
+
public FieldCache getFieldCache() {
return FieldCache.DEFAULT;
}
@@ -122,7 +128,7 @@ public class FieldCacheTermsFilter exten
openBitSet = new OpenBitSet(this.fcsi.size());
final BytesRef spare = new BytesRef();
for (int i=0;i<terms.length;i++) {
- int termNumber = this.fcsi.binarySearchLookup(new BytesRef(terms[i]), spare);
+ int termNumber = this.fcsi.binarySearchLookup(terms[i], spare);
if (termNumber > 0) {
openBitSet.fastSet(termNumber);
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldComparator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldComparator.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldComparator.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldComparator.java Thu Jul 22 19:34:35 2010
@@ -31,6 +31,7 @@ import org.apache.lucene.search.FieldCac
import org.apache.lucene.search.FieldCache.DocTermsIndex;
import org.apache.lucene.search.FieldCache.DocTerms;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.packed.PackedInts;
/**
* Expert: a FieldComparator compares hits so as to determine their
@@ -709,23 +710,21 @@ public abstract class FieldComparator {
private final BytesRef[] values;
private final int[] readerGen;
+ private PackedInts.Reader currentDocToOrd;
private int currentReaderGen = -1;
private DocTermsIndex termsIndex;
private final String field;
private int bottomSlot = -1;
private int bottomOrd;
+ private boolean bottomSameReader;
private BytesRef bottomValue;
- private final boolean reversed;
- private final int sortPos;
private final BytesRef tempBR = new BytesRef();
public TermOrdValComparator(int numHits, String field, int sortPos, boolean reversed) {
ords = new int[numHits];
values = new BytesRef[numHits];
readerGen = new int[numHits];
- this.sortPos = sortPos;
- this.reversed = reversed;
this.field = field;
}
@@ -754,59 +753,38 @@ public abstract class FieldComparator {
@Override
public int compareBottom(int doc) {
assert bottomSlot != -1;
- int order = termsIndex.getOrd(doc);
- final int cmp = bottomOrd - order;
- if (cmp != 0) {
- return cmp;
- }
-
- if (bottomValue == null) {
- if (order == 0) {
- // unset
- return 0;
+ if (bottomSameReader) {
+ // ord is precisely comparable, even in the equal case
+ return bottomOrd - (int) currentDocToOrd.get(doc);
+ } else {
+ // ord is only approx comparable: if they are not
+ // equal, we can use that; if they are equal, we
+ // must fallback to compare by value
+ final int order = (int) currentDocToOrd.get(doc);
+ final int cmp = bottomOrd - order;
+ if (cmp != 0) {
+ return cmp;
}
- // bottom wins
- return -1;
- } else if (order == 0) {
- // doc wins
- return 1;
- }
- termsIndex.lookup(order, tempBR);
- return bottomValue.compareTo(tempBR);
- }
- private void convert(int slot) {
- readerGen[slot] = currentReaderGen;
- int index = 0;
- BytesRef value = values[slot];
- if (value == null) {
- // 0 ord is null for all segments
- assert ords[slot] == 0;
- return;
- }
-
- if (sortPos == 0 && bottomSlot != -1 && bottomSlot != slot) {
- // Since we are the primary sort, the entries in the
- // queue are bounded by bottomOrd:
- if (reversed) {
- index = binarySearch(tempBR, termsIndex, value, bottomOrd, termsIndex.numOrd()-1);
- } else {
- index = binarySearch(tempBR, termsIndex, value, 0, bottomOrd);
+ if (bottomValue == null) {
+ if (order == 0) {
+ // unset
+ return 0;
+ }
+ // bottom wins
+ return -1;
+ } else if (order == 0) {
+ // doc wins
+ return 1;
}
- } else {
- // Full binary search
- index = binarySearch(tempBR, termsIndex, value);
- }
-
- if (index < 0) {
- index = -index - 2;
+ termsIndex.lookup(order, tempBR);
+ return bottomValue.compareTo(tempBR);
}
- ords[slot] = index;
}
@Override
public void copy(int slot, int doc) {
- final int ord = termsIndex.getOrd(doc);
+ final int ord = (int) currentDocToOrd.get(doc);
if (ord == 0) {
values[slot] = null;
} else {
@@ -823,21 +801,34 @@ public abstract class FieldComparator {
@Override
public void setNextReader(IndexReader reader, int docBase) throws IOException {
termsIndex = FieldCache.DEFAULT.getTermsIndex(reader, field);
+ currentDocToOrd = termsIndex.getDocToOrd();
currentReaderGen++;
if (bottomSlot != -1) {
- convert(bottomSlot);
- bottomOrd = ords[bottomSlot];
+ setBottom(bottomSlot);
}
}
@Override
public void setBottom(final int bottom) {
bottomSlot = bottom;
- if (readerGen[bottom] != currentReaderGen) {
- convert(bottomSlot);
+
+ bottomValue = values[bottomSlot];
+ if (bottomValue == null) {
+ // 0 ord is null for all segments
+ assert ords[bottomSlot] == 0;
+ bottomOrd = 0;
+ bottomSameReader = true;
+ } else {
+ final int index = binarySearch(tempBR, termsIndex, bottomValue);
+ if (index < 0) {
+ bottomOrd = -index - 2;
+ bottomSameReader = false;
+ } else {
+ bottomOrd = index;
+ // exact value match
+ bottomSameReader = true;
+ }
}
- bottomOrd = ords[bottom];
- bottomValue = values[bottom];
}
@Override
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java Thu Jul 22 19:34:35 2010
@@ -130,14 +130,6 @@ public class FuzzyQuery extends MultiTer
return prefixLength;
}
- @Override @Deprecated
- protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
- if (!termLongEnough) { // can only match if it's exact
- return new SingleTermEnum(reader, term);
- }
- return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
- }
-
@Override
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
if (!termLongEnough) { // can only match if it's exact
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Thu Jul 22 19:34:35 2010
@@ -33,7 +33,7 @@ import org.apache.lucene.util.automaton.
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import java.io.IOException;
-import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
@@ -135,7 +135,7 @@ public final class FuzzyTermsEnum extend
LevenshteinAutomata builder =
new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength));
- runAutomata = new ArrayList<ByteRunAutomaton>(maxDistance);
+ final ByteRunAutomaton[] ra = new ByteRunAutomaton[maxDistance + 1];
for (int i = 0; i <= maxDistance; i++) {
Automaton a = builder.toAutomaton(i);
// constant prefix
@@ -144,8 +144,9 @@ public final class FuzzyTermsEnum extend
UnicodeUtil.newString(termText, 0, realPrefixLength));
a = BasicOperations.concatenate(prefix, a);
}
- runAutomata.add(new ByteRunAutomaton(a));
+ ra[i] = new ByteRunAutomaton(a);
}
+ runAutomata = Arrays.asList(ra);
}
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java Thu Jul 22 19:34:35 2010
@@ -171,32 +171,64 @@ public class MultiPhraseQuery extends Qu
if (termArrays.size() == 0) // optimize zero-term case
return null;
- DocsAndPositionsEnum[] postings = new DocsAndPositionsEnum[termArrays.size()];
- for (int i=0; i<postings.length; i++) {
+ final Bits delDocs = MultiFields.getDeletedDocs(reader);
+
+ PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[termArrays.size()];
+
+ for (int i=0; i<postingsFreqs.length; i++) {
Term[] terms = termArrays.get(i);
final DocsAndPositionsEnum postingsEnum;
+ int docFreq;
+
if (terms.length > 1) {
postingsEnum = new UnionDocsAndPositionsEnum(reader, terms);
+
+ // coarse -- this overcounts since a given doc can
+ // have more than one terms:
+ docFreq = 0;
+ for(int j=0;j<terms.length;j++) {
+ docFreq += reader.docFreq(terms[j]);
+ }
} else {
- postingsEnum = reader.termPositionsEnum(MultiFields.getDeletedDocs(reader),
+ final BytesRef text = new BytesRef(terms[0].text());
+ postingsEnum = reader.termPositionsEnum(delDocs,
terms[0].field(),
- new BytesRef(terms[0].text()));
- }
+ text);
- if (postingsEnum == null) {
- return null;
+ if (postingsEnum == null) {
+ if (MultiFields.getTermDocsEnum(reader, delDocs, terms[0].field(), text) != null) {
+ // term does exist, but has no positions
+ throw new IllegalStateException("field \"" + terms[0].field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + terms[0].text() + ")");
+ } else {
+ // term does not exist
+ return null;
+ }
+ }
+
+ docFreq = reader.docFreq(terms[0].field(), text);
}
- postings[i] = postingsEnum;
+ postingsFreqs[i] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(i).intValue());
}
- if (slop == 0)
- return new ExactPhraseScorer(this, postings, getPositions(), similarity,
- reader.norms(field));
- else
- return new SloppyPhraseScorer(this, postings, getPositions(), similarity,
+ // sort by increasing docFreq order
+ if (slop == 0) {
+ Arrays.sort(postingsFreqs);
+ }
+
+ if (slop == 0) {
+ ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity,
+ reader.norms(field));
+ if (s.noDocs) {
+ return null;
+ } else {
+ return s;
+ }
+ } else {
+ return new SloppyPhraseScorer(this, postingsFreqs, similarity,
slop, reader.norms(field));
+ }
}
@Override
@@ -231,13 +263,24 @@ public class MultiPhraseQuery extends Qu
fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+
"), product of:");
- PhraseScorer scorer = (PhraseScorer) scorer(reader, true, false);
+ Scorer scorer = (Scorer) scorer(reader, true, false);
if (scorer == null) {
return new Explanation(0.0f, "no matching docs");
}
+
Explanation tfExplanation = new Explanation();
int d = scorer.advance(doc);
- float phraseFreq = (d == doc) ? scorer.currentFreq() : 0.0f;
+ float phraseFreq;
+ if (d == doc) {
+ if (slop == 0) {
+ phraseFreq = ((ExactPhraseScorer) scorer).currentFreq();
+ } else {
+ phraseFreq = ((SloppyPhraseScorer) scorer).currentFreq();
+ }
+ } else {
+ phraseFreq = 0.0f;
+ }
+
tfExplanation.setValue(similarity.tf(phraseFreq));
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
fieldExpl.addDetail(tfExplanation);
@@ -293,7 +336,7 @@ public class MultiPhraseQuery extends Qu
@Override
public final String toString(String f) {
StringBuilder buffer = new StringBuilder();
- if (!field.equals(f)) {
+ if (field == null || !field.equals(f)) {
buffer.append(field);
buffer.append(":");
}
@@ -458,9 +501,14 @@ class UnionDocsAndPositionsEnum extends
for (int i = 0; i < terms.length; i++) {
DocsAndPositionsEnum postings = indexReader.termPositionsEnum(delDocs,
terms[i].field(),
- new BytesRef(terms[i].text()));
+ terms[i].bytes());
if (postings != null) {
docsEnums.add(postings);
+ } else {
+ if (MultiFields.getTermDocsEnum(indexReader, delDocs, terms[i].field(), terms[i].bytes()) != null) {
+ // term does exist, but has no positions
+ throw new IllegalStateException("field \"" + terms[i].field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + terms[i].text() + ")");
+ }
}
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java Thu Jul 22 19:34:35 2010
@@ -32,7 +32,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.VirtualMethod;
+import org.apache.lucene.util.PagedBytes;
/**
* An abstract {@link Query} that matches documents
@@ -72,17 +72,6 @@ public abstract class MultiTermQuery ext
protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
transient int numberOfTerms = 0;
- /** @deprecated remove when getEnum is removed */
- private static final VirtualMethod<MultiTermQuery> getEnumMethod =
- new VirtualMethod<MultiTermQuery>(MultiTermQuery.class, "getEnum", IndexReader.class);
- /** @deprecated remove when getEnum is removed */
- private static final VirtualMethod<MultiTermQuery> getTermsEnumMethod =
- new VirtualMethod<MultiTermQuery>(MultiTermQuery.class, "getTermsEnum", IndexReader.class);
- /** @deprecated remove when getEnum is removed */
- final boolean hasNewAPI =
- VirtualMethod.compareImplementationDistance(getClass(),
- getTermsEnumMethod, getEnumMethod) >= 0; // its ok for both to be overridden
-
/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum}
* and update the boost on each returned term. This enables to control the boost factor
* for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or
@@ -189,77 +178,49 @@ public abstract class MultiTermQuery ext
private abstract static class BooleanQueryRewrite extends RewriteMethod {
protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
+ final Fields fields = MultiFields.getFields(reader);
+ if (fields == null) {
+ // reader has no fields
+ return 0;
+ }
- if (query.hasNewAPI) {
-
- if (query.field == null) {
- throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery.");
- }
-
- final Fields fields = MultiFields.getFields(reader);
- if (fields == null) {
- // reader has no fields
- return 0;
- }
-
- final Terms terms = fields.terms(query.field);
- if (terms == null) {
- // field does not exist
- return 0;
- }
+ final Terms terms = fields.terms(query.field);
+ if (terms == null) {
+ // field does not exist
+ return 0;
+ }
- final TermsEnum termsEnum = query.getTermsEnum(reader);
- assert termsEnum != null;
+ final TermsEnum termsEnum = query.getTermsEnum(reader);
+ assert termsEnum != null;
- if (termsEnum == TermsEnum.EMPTY)
- return 0;
- final BoostAttribute boostAtt =
- termsEnum.attributes().addAttribute(BoostAttribute.class);
- collector.boostAtt = boostAtt;
- int count = 0;
- BytesRef term;
- final Term placeholderTerm = new Term(query.field);
- while ((term = termsEnum.next()) != null) {
- if (collector.collect(placeholderTerm.createTerm(term.utf8ToString()), boostAtt.getBoost())) {
- count++;
- } else {
- break;
- }
- }
- collector.boostAtt = null;
- return count;
- } else {
- // deprecated case
- final FilteredTermEnum enumerator = query.getEnum(reader);
- int count = 0;
- try {
- do {
- Term t = enumerator.term();
- if (t != null) {
- if (collector.collect(t, enumerator.difference())) {
- count++;
- } else {
- break;
- }
- }
- } while (enumerator.next());
- } finally {
- enumerator.close();
+ if (termsEnum == TermsEnum.EMPTY)
+ return 0;
+ final BoostAttribute boostAtt =
+ termsEnum.attributes().addAttribute(BoostAttribute.class);
+ collector.boostAtt = boostAtt;
+ int count = 0;
+ BytesRef bytes;
+ while ((bytes = termsEnum.next()) != null) {
+ if (collector.collect(bytes, boostAtt.getBoost())) {
+ count++;
+ } else {
+ break;
}
- return count;
}
+ collector.boostAtt = null;
+ return count;
}
protected static abstract class TermCollector {
- /** this field is only set if a boostAttribute is used (e.g. {@link FuzzyTermsEnum}) */
private BoostAttribute boostAtt = null;
/** return false to stop collecting */
- public abstract boolean collect(Term t, float boost) throws IOException;
+ public abstract boolean collect(BytesRef bytes, float boost) throws IOException;
/** set the minimum boost as a hint for the term producer */
protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) {
- if (boostAtt != null) boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
+ assert boostAtt != null;
+ boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
}
}
}
@@ -268,9 +229,11 @@ public abstract class MultiTermQuery ext
@Override
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
final BooleanQuery result = new BooleanQuery(true);
+ final Term placeholderTerm = new Term(query.field);
query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() {
- public boolean collect(Term t, float boost) {
- TermQuery tq = new TermQuery(t); // found a match
+ public boolean collect(BytesRef bytes, float boost) {
+ // add new TQ, we must clone the term, else it may get overwritten!
+ TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)));
tq.setBoost(query.getBoost() * boost); // set the boost
result.add(tq, BooleanClause.Occur.SHOULD); // add to query
return true;
@@ -331,16 +294,16 @@ public abstract class MultiTermQuery ext
protected abstract Query getQuery(Term term);
@Override
- public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
+ public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
collectTerms(reader, query, new TermCollector() {
- public boolean collect(Term t, float boost) {
+ public boolean collect(BytesRef bytes, float boost) {
// ignore uncompetitive hits
if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost)
return true;
- // add new entry in PQ
- st.term = t;
+ // add new entry in PQ, we must clone the term, else it may get overwritten!
+ st.bytes.copy(bytes);
st.boost = boost;
stQueue.offer(st);
// possibly drop entries from queue
@@ -353,9 +316,11 @@ public abstract class MultiTermQuery ext
private ScoreTerm st = new ScoreTerm();
});
+ final Term placeholderTerm = new Term(query.field);
final BooleanQuery bq = new BooleanQuery(true);
for (final ScoreTerm st : stQueue) {
- Query tq = getQuery(st.term); // found a match
+ // add new query, we must clone the term, else it may get overwritten!
+ Query tq = getQuery(placeholderTerm.createTerm(st.bytes));
tq.setBoost(query.getBoost() * st.boost); // set the boost
bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
}
@@ -382,12 +347,13 @@ public abstract class MultiTermQuery ext
}
private static class ScoreTerm implements Comparable<ScoreTerm> {
- public Term term;
+ public final BytesRef bytes = new BytesRef();
public float boost;
public int compareTo(ScoreTerm other) {
if (this.boost == other.boost)
- return other.term.compareTo(this.term);
+ // TODO: is it OK to use default compare here?
+ return other.bytes.compareTo(this.bytes);
else
return Float.compare(this.boost, other.boost);
}
@@ -564,58 +530,67 @@ public abstract class MultiTermQuery ext
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
- final CutOffTermCollector col = new CutOffTermCollector(reader, docCountCutoff, termCountLimit);
+ final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit);
collectTerms(reader, query, col);
if (col.hasCutOff) {
return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
+ } else if (col.termCount == 0) {
+ return new BooleanQuery(true);
} else {
- final Query result;
- if (col.pendingTerms.isEmpty()) {
- result = new BooleanQuery(true);
- } else {
- BooleanQuery bq = new BooleanQuery(true);
- for(Term term : col.pendingTerms) {
- TermQuery tq = new TermQuery(term);
- bq.add(tq, BooleanClause.Occur.SHOULD);
+ final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false);
+ try {
+ final BooleanQuery bq = new BooleanQuery(true);
+ final Term placeholderTerm = new Term(query.field);
+ long start = col.startOffset;
+ for(int i = 0; i < col.termCount; i++) {
+ final BytesRef bytes = new BytesRef();
+ start = bytesReader.fillUsingLengthPrefix3(bytes, start);
+ bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD);
}
// Strip scores
- result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
+ final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
result.setBoost(query.getBoost());
+ query.incTotalNumberOfTerms(col.termCount);
+ return result;
+ } finally {
+ bytesReader.close();
}
- query.incTotalNumberOfTerms(col.pendingTerms.size());
- return result;
}
}
private static final class CutOffTermCollector extends TermCollector {
- CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) {
+ CutOffTermCollector(IndexReader reader, String field, int docCountCutoff, int termCountLimit) {
this.reader = reader;
+ this.field = field;
this.docCountCutoff = docCountCutoff;
this.termCountLimit = termCountLimit;
}
- public boolean collect(Term t, float boost) throws IOException {
- pendingTerms.add(t);
- if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
+ public boolean collect(BytesRef bytes, float boost) throws IOException {
+ termCount++;
+ if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) {
hasCutOff = true;
return false;
}
+ pendingTerms.copyUsingLengthPrefix(bytes);
// Loading the TermInfo from the terms dict here
// should not be costly, because 1) the
// query/filter will load the TermInfo when it
// runs, and 2) the terms dict has a cache:
- // @deprecated: in 4.0 use BytesRef for collectTerms()
- docVisitCount += reader.docFreq(t);
+ docVisitCount += reader.docFreq(field, bytes);
return true;
}
int docVisitCount = 0;
boolean hasCutOff = false;
+ int termCount = 0;
final IndexReader reader;
+ final String field;
final int docCountCutoff, termCountLimit;
- final ArrayList<Term> pendingTerms = new ArrayList<Term>();
+ final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB
+ final long startOffset = pendingTerms.getPointer();
}
@Override
@@ -681,42 +656,20 @@ public abstract class MultiTermQuery ext
*/
public MultiTermQuery(final String field) {
this.field = field;
- }
-
- /**
- * Constructs a query matching terms that cannot be represented with a single
- * Term.
- * @deprecated Use {@link #MultiTermQuery(String)}, as the flex branch can
- * only work on one field per terms enum. If you override
- * {@link #getTermsEnum(IndexReader)}, you cannot use this ctor.
- */
- @Deprecated
- public MultiTermQuery() {
- this(null);
+ assert field != null;
}
/** Returns the field name for this query */
public final String getField() { return field; }
/** Construct the enumeration to be used, expanding the
- * pattern term.
- * @deprecated Please override {@link #getTermsEnum} instead */
- @Deprecated
- protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- /** Construct the enumeration to be used, expanding the
* pattern term. This method should only be called if
* the field exists (ie, implementations can assume the
* field does exist). This method should not return null
* (should instead return {@link TermsEnum#EMPTY} if no
* terms match). The TermsEnum must already be
* positioned to the first matching term. */
- // TODO 4.0: make this method abstract
- protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
- throw new UnsupportedOperationException();
- }
+ protected abstract TermsEnum getTermsEnum(IndexReader reader) throws IOException;
/**
* Expert: Return the number of unique terms visited during execution of the query.
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java Thu Jul 22 19:34:35 2010
@@ -20,11 +20,8 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.MultiFields;
@@ -109,97 +106,50 @@ public class MultiTermQueryWrapperFilter
*/
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
- if (query.hasNewAPI) {
- if (query.field == null) {
- throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery.");
- }
-
- final Fields fields = MultiFields.getFields(reader);
- if (fields == null) {
- // reader has no fields
- return DocIdSet.EMPTY_DOCIDSET;
- }
-
- final Terms terms = fields.terms(query.field);
- if (terms == null) {
- // field does not exist
- return DocIdSet.EMPTY_DOCIDSET;
- }
-
- final TermsEnum termsEnum = query.getTermsEnum(reader);
- assert termsEnum != null;
- if (termsEnum.next() != null) {
- // fill into a OpenBitSet
- final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
- int termCount = 0;
- final Bits delDocs = MultiFields.getDeletedDocs(reader);
- DocsEnum docsEnum = null;
- do {
- termCount++;
- // System.out.println(" iter termCount=" + termCount + " term=" +
- // enumerator.term().toBytesString());
- docsEnum = termsEnum.docs(delDocs, docsEnum);
- final DocsEnum.BulkReadResult result = docsEnum.getBulkResult();
- while (true) {
- final int count = docsEnum.read();
- if (count != 0) {
- final int[] docs = result.docs.ints;
- for (int i = 0; i < count; i++) {
- bitSet.set(docs[i]);
- }
- } else {
- break;
+ final Fields fields = MultiFields.getFields(reader);
+ if (fields == null) {
+ // reader has no fields
+ return DocIdSet.EMPTY_DOCIDSET;
+ }
+
+ final Terms terms = fields.terms(query.field);
+ if (terms == null) {
+ // field does not exist
+ return DocIdSet.EMPTY_DOCIDSET;
+ }
+
+ final TermsEnum termsEnum = query.getTermsEnum(reader);
+ assert termsEnum != null;
+ if (termsEnum.next() != null) {
+ // fill into a OpenBitSet
+ final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
+ int termCount = 0;
+ final Bits delDocs = MultiFields.getDeletedDocs(reader);
+ DocsEnum docsEnum = null;
+ do {
+ termCount++;
+ // System.out.println(" iter termCount=" + termCount + " term=" +
+ // enumerator.term().toBytesString());
+ docsEnum = termsEnum.docs(delDocs, docsEnum);
+ final DocsEnum.BulkReadResult result = docsEnum.getBulkResult();
+ while (true) {
+ final int count = docsEnum.read();
+ if (count != 0) {
+ final int[] docs = result.docs.ints;
+ for (int i = 0; i < count; i++) {
+ bitSet.set(docs[i]);
}
+ } else {
+ break;
}
- } while (termsEnum.next() != null);
- // System.out.println(" done termCount=" + termCount);
+ }
+ } while (termsEnum.next() != null);
+ // System.out.println(" done termCount=" + termCount);
- query.incTotalNumberOfTerms(termCount);
- return bitSet;
- } else {
- return DocIdSet.EMPTY_DOCIDSET;
- }
+ query.incTotalNumberOfTerms(termCount);
+ return bitSet;
} else {
- final TermEnum enumerator = query.getEnum(reader);
- try {
- // if current term in enum is null, the enum is empty -> shortcut
- if (enumerator.term() == null)
- return DocIdSet.EMPTY_DOCIDSET;
- // else fill into a OpenBitSet
- final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
- final int[] docs = new int[32];
- final int[] freqs = new int[32];
- TermDocs termDocs = reader.termDocs();
- try {
- int termCount = 0;
- do {
- Term term = enumerator.term();
- if (term == null)
- break;
- termCount++;
- termDocs.seek(term);
- while (true) {
- final int count = termDocs.read(docs, freqs);
- if (count != 0) {
- for (int i = 0; i < count; i++) {
- bitSet.set(docs[i]);
- }
- } else {
- break;
- }
- }
- } while (enumerator.next());
-
- query.incTotalNumberOfTerms(termCount);
-
- } finally {
- termDocs.close();
- }
- return bitSet;
- } finally {
- enumerator.close();
- }
+ return DocIdSet.EMPTY_DOCIDSET;
}
}
-
}
Propchange: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Thu Jul 22 19:34:35 2010
@@ -1,4 +1,5 @@
-/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:943137,949730
+/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:943137,949730,957490,960490,961612
+/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:953476-966816
/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:824912-931101
/lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:909334,948516
/lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:924483-924731,924781,925176-925462