You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2011/03/15 22:35:35 UTC
svn commit: r1081952 [8/17] - in /lucene/dev/branches/bulkpostings: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/
dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/
dev-tools/idea/lucene/contrib/demo/ dev-tools/idea/luce...
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/function/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/function/package.html?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/function/package.html (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/function/package.html Tue Mar 15 21:35:17 2011
@@ -130,14 +130,14 @@
Using field (byte) values to as scores:
<p>
Indexing:
- <pre>
+ <pre class="prettyprint">
f = new Field("score", "7", Field.Store.NO, Field.Index.UN_TOKENIZED);
f.setOmitNorms(true);
d1.add(f);
</pre>
<p>
Search:
- <pre>
+ <pre class="prettyprint">
Query q = new FieldScoreQuery("score", FieldScoreQuery.Type.BYTE);
</pre>
Document d1 above would get a score of 7.
@@ -148,7 +148,7 @@
<p>
Dividing the original score of each document by a square root of its docid
(just to demonstrate what it takes to manipulate scores this way)
- <pre>
+ <pre class="prettyprint">
Query q = queryParser.parse("my query text");
CustomScoreQuery customQ = new CustomScoreQuery(q) {
public float customScore(int doc, float subQueryScore, float valSrcScore) {
@@ -158,7 +158,7 @@
</pre>
<p>
For more informative debug info on the custom query, also override the name() method:
- <pre>
+ <pre class="prettyprint">
CustomScoreQuery customQ = new CustomScoreQuery(q) {
public float customScore(int doc, float subQueryScore, float valSrcScore) {
return subQueryScore / Math.sqrt(docid);
@@ -171,7 +171,7 @@
<p>
Taking the square root of the original score and multiplying it by a "short field driven score", ie, the
short value that was indexed for the scored doc in a certain field:
- <pre>
+ <pre class="prettyprint">
Query q = queryParser.parse("my query text");
FieldScoreQuery qf = new FieldScoreQuery("shortScore", FieldScoreQuery.Type.SHORT);
CustomScoreQuery customQ = new CustomScoreQuery(q,qf) {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java Tue Mar 15 21:35:17 2011
@@ -53,7 +53,7 @@ public class NearSpansUnordered extends
private class CellQueue extends PriorityQueue<SpansCell> {
public CellQueue(int size) {
- initialize(size);
+ super(size);
}
@Override
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/SpanOrQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/SpanOrQuery.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/SpanOrQuery.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/SpanOrQuery.java Tue Mar 15 21:35:17 2011
@@ -145,7 +145,7 @@ public class SpanOrQuery extends SpanQue
private class SpanQueue extends PriorityQueue<Spans> {
public SpanQueue(int size) {
- initialize(size);
+ super(size);
}
@Override
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/package.html?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/package.html (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/spans/package.html Tue Mar 15 21:35:17 2011
@@ -59,7 +59,7 @@ two starts and ends at the greater of th
<p>For example, a span query which matches "John Kerry" within ten
words of "George Bush" within the first 100 words of the document
could be constructed with:
-<pre>
+<pre class="prettyprint">
SpanQuery john = new SpanTermQuery(new Term("content", "john"));
SpanQuery kerry = new SpanTermQuery(new Term("content", "kerry"));
SpanQuery george = new SpanTermQuery(new Term("content", "george"));
@@ -82,7 +82,7 @@ SpanQuery johnKerryNearGeorgeBushAtStart
So, for example, the above query can be restricted to documents which
also use the word "iraq" with:
-<pre>
+<pre class="prettyprint">
Query query = new BooleanQuery();
query.add(johnKerryNearGeorgeBushAtStart, true, false);
query.add(new TermQuery("content", "iraq"), true, false);
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/store/DataOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/store/DataOutput.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/store/DataOutput.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/store/DataOutput.java Tue Mar 15 21:35:17 2011
@@ -82,7 +82,7 @@ public abstract class DataOutput {
writeInt((int) i);
}
- /** Writes an long in a variable-length format. Writes between one and five
+ /** Writes an long in a variable-length format. Writes between one and nine
* bytes. Smaller values take fewer bytes. Negative numbers are not
* supported.
* @see DataInput#readVLong()
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/AttributeSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/AttributeSource.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/AttributeSource.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/AttributeSource.java Tue Mar 15 21:35:17 2011
@@ -131,14 +131,14 @@ public class AttributeSource {
/**
* returns the used AttributeFactory.
*/
- public AttributeFactory getAttributeFactory() {
+ public final AttributeFactory getAttributeFactory() {
return this.factory;
}
/** Returns a new iterator that iterates the attribute classes
* in the same order they were added in.
*/
- public Iterator<Class<? extends Attribute>> getAttributeClassesIterator() {
+ public final Iterator<Class<? extends Attribute>> getAttributeClassesIterator() {
return Collections.unmodifiableSet(attributes.keySet()).iterator();
}
@@ -146,7 +146,7 @@ public class AttributeSource {
* This iterator may contain less entries that {@link #getAttributeClassesIterator},
* if one instance implements more than one Attribute interface.
*/
- public Iterator<AttributeImpl> getAttributeImplsIterator() {
+ public final Iterator<AttributeImpl> getAttributeImplsIterator() {
if (hasAttributes()) {
if (currentState == null) {
computeCurrentState();
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/BytesRefHash.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/BytesRefHash.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/BytesRefHash.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/BytesRefHash.java Tue Mar 15 21:35:17 2011
@@ -117,7 +117,7 @@ public final class BytesRefHash {
*/
public BytesRef get(int ord, BytesRef ref) {
assert bytesStart != null : "bytesStart is null - not initialized";
- assert ord < bytesStart.length: "ord exceeeds byteStart len: " + bytesStart.length;
+ assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length;
return pool.setBytesRef(ref, bytesStart[ord]);
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java Tue Mar 15 21:35:17 2011
@@ -19,6 +19,7 @@ package org.apache.lucene.util;
import java.nio.CharBuffer;
import java.nio.ByteBuffer;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadoc
/**
* Provides support for converting byte sequences to Strings and back again.
@@ -39,7 +40,10 @@ import java.nio.ByteBuffer;
* <p/>
*
* @lucene.experimental
+ * @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly
+ * instead. This class will be removed in Lucene 5.0
*/
+@Deprecated
public final class IndexableBinaryStringTools {
private static final CodingCase[] CODING_CASES = {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/PriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/PriorityQueue.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/PriorityQueue.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/PriorityQueue.java Tue Mar 15 21:35:17 2011
@@ -28,11 +28,57 @@ package org.apache.lucene.util;
*/
public abstract class PriorityQueue<T> {
private int size;
- private int maxSize;
- protected T[] heap;
+ private final int maxSize;
+ private final T[] heap;
+
+ public PriorityQueue(int maxSize) {
+ this(maxSize, true);
+ }
+
+ @SuppressWarnings("unchecked")
+ public PriorityQueue(int maxSize, boolean prepopulate) {
+ size = 0;
+ int heapSize;
+ if (0 == maxSize)
+ // We allocate 1 extra to avoid if statement in top()
+ heapSize = 2;
+ else {
+ if (maxSize == Integer.MAX_VALUE) {
+ // Don't wrap heapSize to -1, in this case, which
+ // causes a confusing NegativeArraySizeException.
+ // Note that very likely this will simply then hit
+ // an OOME, but at least that's more indicative to
+ // caller that this values is too big. We don't +1
+ // in this case, but it's very unlikely in practice
+ // one will actually insert this many objects into
+ // the PQ:
+ heapSize = Integer.MAX_VALUE;
+ } else {
+ // NOTE: we add +1 because all access to heap is
+ // 1-based not 0-based. heap[0] is unused.
+ heapSize = maxSize + 1;
+ }
+ }
+ heap = (T[]) new Object[heapSize]; // T is unbounded type, so this unchecked cast works always
+ this.maxSize = maxSize;
+
+ if (prepopulate) {
+ // If sentinel objects are supported, populate the queue with them
+ T sentinel = getSentinelObject();
+ if (sentinel != null) {
+ heap[1] = sentinel;
+ for (int i = 2; i < heap.length; i++) {
+ heap[i] = getSentinelObject();
+ }
+ size = maxSize;
+ }
+ }
+ }
/** Determines the ordering of objects in this priority queue. Subclasses
- must define this one method. */
+ * must define this one method.
+ * @return <code>true</code> iff parameter <tt>a</tt> is less than parameter <tt>b</tt>.
+ */
protected abstract boolean lessThan(T a, T b);
/**
@@ -78,45 +124,6 @@ public abstract class PriorityQueue<T> {
return null;
}
- /** Subclass constructors must call this. */
- @SuppressWarnings("unchecked")
- protected final void initialize(int maxSize) {
- size = 0;
- int heapSize;
- if (0 == maxSize)
- // We allocate 1 extra to avoid if statement in top()
- heapSize = 2;
- else {
- if (maxSize == Integer.MAX_VALUE) {
- // Don't wrap heapSize to -1, in this case, which
- // causes a confusing NegativeArraySizeException.
- // Note that very likely this will simply then hit
- // an OOME, but at least that's more indicative to
- // caller that this values is too big. We don't +1
- // in this case, but it's very unlikely in practice
- // one will actually insert this many objects into
- // the PQ:
- heapSize = Integer.MAX_VALUE;
- } else {
- // NOTE: we add +1 because all access to heap is
- // 1-based not 0-based. heap[0] is unused.
- heapSize = maxSize + 1;
- }
- }
- heap = (T[]) new Object[heapSize]; // T is unbounded type, so this unchecked cast works always
- this.maxSize = maxSize;
-
- // If sentinel objects are supported, populate the queue with them
- T sentinel = getSentinelObject();
- if (sentinel != null) {
- heap[1] = sentinel;
- for (int i = 2; i < heap.length; i++) {
- heap[i] = getSentinelObject();
- }
- size = maxSize;
- }
- }
-
/**
* Adds an Object to a PriorityQueue in log(size) time. If one tries to add
* more objects than maxSize from initialize an
@@ -245,4 +252,11 @@ public abstract class PriorityQueue<T> {
}
heap[i] = node; // install saved node
}
+
+ /** This method returns the internal heap array as Object[].
+ * @lucene.internal
+ */
+ protected final Object[] getHeapArray() {
+ return (Object[]) heap;
+ }
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/SmallFloat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/SmallFloat.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/SmallFloat.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/SmallFloat.java Tue Mar 15 21:35:17 2011
@@ -39,7 +39,7 @@ public class SmallFloat {
int fzero = (63-zeroExp)<<numMantissaBits;
int bits = Float.floatToRawIntBits(f);
int smallfloat = bits >> (24-numMantissaBits);
- if (smallfloat < fzero) {
+ if (smallfloat <= fzero) {
return (bits<=0) ?
(byte)0 // negative numbers and zero both map to 0 byte
:(byte)1; // underflow is mapped to smallest non-zero number.
@@ -75,7 +75,7 @@ public class SmallFloat {
public static byte floatToByte315(float f) {
int bits = Float.floatToRawIntBits(f);
int smallfloat = bits >> (24-3);
- if (smallfloat < (63-15)<<3) {
+ if (smallfloat <= ((63-15)<<3)) {
return (bits<=0) ? (byte)0 : (byte)1;
}
if (smallfloat >= ((63-15)<<3) + 0x100) {
@@ -103,7 +103,7 @@ public class SmallFloat {
public static byte floatToByte52(float f) {
int bits = Float.floatToRawIntBits(f);
int smallfloat = bits >> (24-5);
- if (smallfloat < (63-2)<<5) {
+ if (smallfloat <= (63-2)<<5) {
return (bits<=0) ? (byte)0 : (byte)1;
}
if (smallfloat >= ((63-2)<<5) + 0x100) {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/Version.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/Version.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/Version.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/Version.java Tue Mar 15 21:35:17 2011
@@ -41,6 +41,13 @@ public enum Version {
*/
@Deprecated
LUCENE_31,
+
+ /**
+ * Match settings and bugs in Lucene's 3.2 release.
+ * @deprecated (4.0) Use latest
+ */
+ @Deprecated
+ LUCENE_32,
/** Match settings and bugs in Lucene's 4.0 release.
* <p>
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java Tue Mar 15 21:35:17 2011
@@ -83,7 +83,7 @@ public class Builder<T> {
@SuppressWarnings("unchecked") final UnCompiledNode<T>[] f = (UnCompiledNode<T>[]) new UnCompiledNode[10];
frontier = f;
for(int idx=0;idx<frontier.length;idx++) {
- frontier[idx] = new UnCompiledNode<T>(this);
+ frontier[idx] = new UnCompiledNode<T>(this, idx);
}
}
@@ -91,7 +91,7 @@ public class Builder<T> {
return fst.nodeCount;
}
- public int getTermCount() {
+ public long getTermCount() {
return frontier[0].inputCount;
}
@@ -201,7 +201,7 @@ public class Builder<T> {
// undecided on whether to prune it. later, it
// will be either compiled or pruned, so we must
// allocate a new node:
- frontier[idx] = new UnCompiledNode<T>(this);
+ frontier[idx] = new UnCompiledNode<T>(this, idx);
}
}
}
@@ -292,7 +292,7 @@ public class Builder<T> {
new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(frontier, 0, next, 0, frontier.length);
for(int idx=frontier.length;idx<next.length;idx++) {
- next[idx] = new UnCompiledNode<T>(this);
+ next[idx] = new UnCompiledNode<T>(this, idx);
}
frontier = next;
}
@@ -422,14 +422,24 @@ public class Builder<T> {
Arc<T>[] arcs;
T output;
boolean isFinal;
- int inputCount;
+ long inputCount;
+ /** This node's depth, starting from the automaton root. */
+ final int depth;
+
+ /**
+ * @param depth
+ * The node's depth starting from the automaton root. Needed for
+ * LUCENE-2934 (node expansion based on conditions other than the
+ * fanout size).
+ */
@SuppressWarnings("unchecked")
- public UnCompiledNode(Builder<T> owner) {
+ public UnCompiledNode(Builder<T> owner, int depth) {
this.owner = owner;
arcs = (Arc<T>[]) new Arc[1];
arcs[0] = new Arc<T>();
output = owner.NO_OUTPUT;
+ this.depth = depth;
}
public boolean isCompiled() {
@@ -441,6 +451,9 @@ public class Builder<T> {
isFinal = false;
output = owner.NO_OUTPUT;
inputCount = 0;
+
+ // We don't clear the depth here because it never changes
+ // for nodes on the frontier (even when reused).
}
public T getLastOutput(int labelToMatch) {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java Tue Mar 15 21:35:17 2011
@@ -25,6 +25,7 @@ import org.apache.lucene.store.IndexInpu
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode;
/** Represents an FST using a compact byte[] format.
* <p> The format is similar to what's used by Morfologik
@@ -47,11 +48,21 @@ public class FST<T> {
// this when number of arcs is > NUM_ARCS_ARRAY:
private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6;
- // If the node has >= this number of arcs, the arcs are
- // stored as a fixed array. Fixed array consumes more RAM
- // but enables binary search on the arcs (instead of
- // linear scan) on lookup by arc label:
- private final static int NUM_ARCS_FIXED_ARRAY = 10;
+ /**
+ * @see #shouldExpand(UnCompiledNode)
+ */
+ final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node.
+
+ /**
+ * @see #shouldExpand(UnCompiledNode)
+ */
+ final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
+
+ /**
+ * @see #shouldExpand(UnCompiledNode)
+ */
+ final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
+
private int[] bytesPerArc = new int[0];
// Increment version to change it
@@ -88,7 +99,7 @@ public class FST<T> {
public int arcWithOutputCount;
// If arc has this label then that arc is final/accepted
- public static int END_LABEL = -1;
+ public static final int END_LABEL = -1;
public final static class Arc<T> {
public int label;
@@ -286,9 +297,7 @@ public class FST<T> {
final int v;
if (inputType == INPUT_TYPE.BYTE1) {
v = in.readByte()&0xFF;
- } else if (inputType == INPUT_TYPE.BYTE2) {
- v = in.readVInt();
- } else {
+ } else {
v = in.readVInt();
}
return v;
@@ -315,7 +324,7 @@ public class FST<T> {
int startAddress = writer.posWrite;
//System.out.println(" startAddr=" + startAddress);
- final boolean doFixedArray = node.numArcs >= NUM_ARCS_FIXED_ARRAY;
+ final boolean doFixedArray = shouldExpand(node);
final int fixedArrayStart;
if (doFixedArray) {
if (bytesPerArc.length < node.numArcs) {
@@ -467,9 +476,66 @@ public class FST<T> {
return arc;
}
- /** Follow the follow arc and read the first arc of its
- * target; this changes the provide arc (2nd arg) in-place
- * and returns it. */
+ /** Follows the <code>follow</code> arc and reads the last
+ * arc of its target; this changes the provided
+ * <code>arc</code> (2nd arg) in-place and returns it.
+ *
+ * @return Returns the second argument
+ * (<code>arc</code>). */
+ public Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc) throws IOException {
+ //System.out.println("readLast");
+ if (!targetHasArcs(follow)) {
+ //System.out.println(" end node");
+ assert follow.isFinal();
+ arc.label = -1;
+ arc.output = follow.nextFinalOutput;
+ arc.flags = BIT_LAST_ARC;
+ return arc;
+ } else {
+ final BytesReader in = getBytesReader(follow.target);
+ arc.flags = in.readByte();
+ if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
+ // array: jump straight to end
+ arc.numArcs = in.readVInt();
+ arc.bytesPerArc = in.readByte() & 0xFF;
+ //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
+ arc.posArcsStart = in.pos;
+ arc.arcIdx = arc.numArcs - 2;
+ } else {
+ // non-array: linear scan
+ arc.bytesPerArc = 0;
+ //System.out.println(" scan");
+ while(!arc.isLast()) {
+ // skip this arc:
+ readLabel(in);
+ if (arc.flag(BIT_ARC_HAS_OUTPUT)) {
+ outputs.read(in);
+ }
+ if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
+ outputs.read(in);
+ }
+ if (arc.flag(BIT_STOP_NODE)) {
+ } else if (arc.flag(BIT_TARGET_NEXT)) {
+ } else {
+ in.pos -= 4;
+ }
+ arc.flags = in.readByte();
+ }
+ arc.nextArc = in.pos+1;
+ }
+ readNextRealArc(arc);
+ assert arc.isLast();
+ return arc;
+ }
+ }
+
+ /**
+ * Follow the <code>follow</code> arc and read the first arc of its target;
+ * this changes the provided <code>arc</code> (2nd arg) in-place and returns
+ * it.
+ *
+ * @return Returns the second argument (<code>arc</code>).
+ */
public Arc<T> readFirstTargetArc(Arc<T> follow, Arc<T> arc) throws IOException {
//int pos = address;
//System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal());
@@ -503,15 +569,30 @@ public class FST<T> {
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readByte() & 0xFF;
arc.arcIdx = -1;
- arc.posArcsStart = in.pos;
+ arc.nextArc = arc.posArcsStart = in.pos;
//System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
} else {
- in.pos++;
+ arc.nextArc = address;
arc.bytesPerArc = 0;
}
- arc.nextArc = in.pos;
- arc.label = 0;
- return readNextArc(arc);
+ return readNextRealArc(arc);
+ }
+
+ /**
+ * Checks if <code>arc</code>'s target state is in expanded (or vector) format.
+ *
+ * @return Returns <code>true</code> if <code>arc</code> points to a state in an
+ * expanded array format.
+ */
+ boolean isExpandedTarget(Arc<T> follow) throws IOException {
+ if (follow.isFinal()) {
+ return false;
+ } else {
+ final BytesReader in = getBytesReader(follow.target);
+ final byte b = in.readByte();
+
+ return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0;
+ }
}
/** In-place read; returns the arc. */
@@ -566,6 +647,7 @@ public class FST<T> {
if (arc.bytesPerArc != 0) {
// arcs are at fixed entries
arc.arcIdx++;
+ assert arc.arcIdx < arc.numArcs;
in = getBytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc);
} else {
// arcs are packed
@@ -708,6 +790,26 @@ public class FST<T> {
public int getArcWithOutputCount() {
return arcWithOutputCount;
}
+
+ /**
+ * Nodes will be expanded if their depth (distance from the root node) is
+ * <= this value and their number of arcs is >=
+ * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}.
+ *
+ * <p>
+ * Fixed array consumes more RAM but enables binary search on the arcs
+ * (instead of a linear scan) on lookup by arc label.
+ *
+ * @return <code>true</code> if <code>node</code> should be stored in an
+ * expanded (array) form.
+ *
+ * @see #FIXED_ARRAY_NUM_ARCS_DEEP
+ * @see Builder.UnCompiledNode#depth
+ */
+ private boolean shouldExpand(UnCompiledNode<T> node) {
+ return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
+ node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP;
+ }
// Non-static: writes to FST's byte[]
class BytesWriter extends DataOutput {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java Tue Mar 15 21:35:17 2011
@@ -274,7 +274,7 @@ abstract class FSTEnum<T> {
while(true) {
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast());
- if (arc.bytesPerArc != 0 && arc.label != -1) {
+ if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
// Arcs are fixed array -- use binary search to find
// the target.
@@ -465,12 +465,7 @@ abstract class FSTEnum<T> {
}
incr();
- final FST.Arc<T> nextArc = getArc(upto);
- fst.readFirstTargetArc(arc, nextArc);
- arc = nextArc;
- while(!arc.isLast()) {
- fst.readNextArc(arc);
- }
+ arc = fst.readLastTargetArc(arc, getArc(upto));
}
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java Tue Mar 15 21:35:17 2011
@@ -17,12 +17,8 @@ package org.apache.lucene.util.automaton
* limitations under the License.
*/
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Set;
-import java.util.HashSet;
+import java.io.*;
+import java.util.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
@@ -160,90 +156,171 @@ public final class Util {
return output;
}
}
+
+ /**
+ * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
+ * for visualization. Example of use:
+ *
+ * <pre>
+ * PrintStream ps = new PrintStream("out.dot");
+ * fst.toDot(ps);
+ * ps.close();
+ * </pre>
+ *
+ * and then, from command line:
+ *
+ * <pre>
+ * dot -Tpng -o out.png out.dot
+ * </pre>
+ *
+ * <p>
+ * Note: larger FSTs (a few thousand nodes) won't even render, don't bother.
+ *
+ * @param sameRank
+ * If <code>true</code>, the resulting <code>dot</code> file will try
+ * to order states in layers of breadth-first traversal. This may
+ * mess up arcs, but makes the output FST's structure a bit clearer.
+ *
+ * @param labelStates
+ * If <code>true</code> states will have labels equal to their offsets in their
+ * binary format. Expands the graph considerably.
+ *
+ * @see "http://www.graphviz.org/"
+ */
+ public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
+ throws IOException {
+ final String expandedNodeColor = "blue";
+ // This is the start arc in the automaton (from the epsilon state to the first state
+ // with outgoing transitions.
+ final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
- // NOTE: this consumes alot of RAM!
- // arcs w/ NEXT opto are in blue
- /*
- eg:
- PrintStream ps = new PrintStream("out.dot");
- fst.toDot(ps);
- ps.close();
- System.out.println("SAVED out.dot");
-
- then dot -Tpng out.dot > /x/tmp/out.png
- */
+ // A queue of transitions to consider for the next level.
+ final List<FST.Arc<T>> thisLevelQueue = new ArrayList<FST.Arc<T>>();
- public static<T> void toDot(FST<T> fst, PrintStream out) throws IOException {
+ // A queue of transitions to consider when processing the next level.
+ final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
+ nextLevelQueue.add(startArc);
- final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
+ // A list of states on the same level (for ranking).
+ final List<Integer> sameLevelStates = new ArrayList<Integer>();
- final List<FST.Arc<T>> queue = new ArrayList<FST.Arc<T>>();
- queue.add(startArc);
+ // A bitset of already seen states (target offset).
+ final BitSet seen = new BitSet();
+ seen.set(startArc.target);
+
+ // Shape for states.
+ final String stateShape = "circle";
+
+ // Emit DOT prologue.
+ out.write("digraph FST {\n");
+ out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
- final Set<Integer> seen = new HashSet<Integer>();
- seen.add(startArc.target);
-
- out.println("digraph FST {");
- out.println(" rankdir = LR;");
- //out.println(" " + startNode + " [shape=circle label=" + startNode + "];");
- out.println(" " + startArc.target + " [label=\"\" shape=circle];");
- out.println(" initial [shape=point color=white label=\"\"];");
- out.println(" initial -> " + startArc.target);
+ if (!labelStates) {
+ out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
+ }
+
+ emitDotState(out, "initial", "point", "white", "");
+ emitDotState(out, Integer.toString(startArc.target), stateShape,
+ fst.isExpandedTarget(startArc) ? expandedNodeColor : null,
+ "");
+ out.write(" initial -> " + startArc.target + "\n");
final T NO_OUTPUT = fst.outputs.getNoOutput();
+ int level = 0;
+
+ while (!nextLevelQueue.isEmpty()) {
+ // we could double buffer here, but it doesn't matter probably.
+ thisLevelQueue.addAll(nextLevelQueue);
+ nextLevelQueue.clear();
+
+ level++;
+ out.write("\n // Transitions and states at level: " + level + "\n");
+ while (!thisLevelQueue.isEmpty()) {
+ final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
+
+ if (fst.targetHasArcs(arc)) {
+ // scan all arcs
+ final int node = arc.target;
+ fst.readFirstTargetArc(arc, arc);
+
+ while (true) {
+ // Emit the unseen state and add it to the queue for the next level.
+ if (arc.target >= 0 && !seen.get(arc.target)) {
+ final boolean isExpanded = fst.isExpandedTarget(arc);
+ emitDotState(out, Integer.toString(arc.target), stateShape,
+ isExpanded ? expandedNodeColor : null,
+ labelStates ? Integer.toString(arc.target) : "");
+ seen.set(arc.target);
+ nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
+ sameLevelStates.add(arc.target);
+ }
- while(queue.size() != 0) {
- FST.Arc<T> arc = queue.get(queue.size()-1);
- queue.remove(queue.size()-1);
- //System.out.println("dot cycle target=" + arc.target);
-
- if (fst.targetHasArcs(arc)) {
-
- // scan all arcs
- final int node = arc.target;
- fst.readFirstTargetArc(arc, arc);
- while(true) {
-
- //System.out.println(" cycle label=" + arc.label + " (" + (char) arc.label + ") target=" + arc.target);
- if (!seen.contains(arc.target)) {
- final String shape;
- if (arc.target == -1) {
- shape = "doublecircle";
+ String outs;
+ if (arc.output != NO_OUTPUT) {
+ outs = "/" + fst.outputs.outputToString(arc.output);
} else {
- shape = "circle";
+ outs = "";
+ }
+
+ final String cl;
+ if (arc.label == FST.END_LABEL) {
+ cl = "~";
+ } else {
+ cl = printableLabel(arc.label);
+ }
+
+ out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n");
+
+ // Break the loop if we're on the last arc of this state.
+ if (arc.isLast()) {
+ break;
}
- out.println(" " + arc.target + " [shape=" + shape + "];");
- seen.add(arc.target);
- queue.add(new FST.Arc<T>().copyFrom(arc));
- //System.out.println(" new!");
- }
- String outs;
- if (arc.output != NO_OUTPUT) {
- outs = "/" + fst.outputs.outputToString(arc.output);
- } else {
- outs = "";
- }
- final char cl;
- if (arc.label == FST.END_LABEL) {
- cl = '~';
- } else {
- cl = (char) arc.label;
- }
- out.println(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]");
- //if (arc.flag(FST.BIT_TARGET_NEXT)) {
- //out.print(" color=blue");
- //}
- //out.println("];");
-
- if (arc.isLast()) {
- break;
- } else {
fst.readNextArc(arc);
}
}
}
+
+ // Emit state ranking information.
+ if (sameRank && sameLevelStates.size() > 1) {
+ out.write(" {rank=same; ");
+ for (int state : sameLevelStates) {
+ out.write(state + "; ");
+ }
+ out.write(" }\n");
+ }
+ sameLevelStates.clear();
+ }
+
+ // Emit terminating state (always there anyway).
+ out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
+ out.write(" {rank=sink; -1 }\n");
+
+ out.write("}\n");
+ out.flush();
+ }
+
+ /**
+ * Emit a single state in the <code>dot</code> language.
+ */
+ private static void emitDotState(Writer out, String name, String shape,
+ String color, String label) throws IOException {
+ out.write(" " + name
+ + " ["
+ + (shape != null ? "shape=" + shape : "") + " "
+ + (color != null ? "color=" + color : "") + " "
+ + (label != null ? "label=\"" + label + "\"" : "label=\"\"") + " "
+ + "]\n");
+ }
+
+ /**
+ * Ensures an arc's label is indeed printable (dot uses US-ASCII).
+ */
+ private static String printableLabel(int label) {
+ if (label >= 0x20 && label <= 0x7d) {
+ return Character.toString((char) label);
+ } else {
+ return "0x" + Integer.toHexString(label);
}
- out.println("}");
}
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/overview.html?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/overview.html (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/overview.html Tue Mar 15 21:35:17 2011
@@ -159,27 +159,19 @@ method.</li>
Some simple examples of code which does this are:
<ul>
<li>
- <a href="http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/src/demo/org/apache/lucene/demo/FileDocument.java">FileDocument.java</a> contains
-code to create a Document for a file.</li>
-
-<li>
- <a href="http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/src/demo/org/apache/lucene/demo/IndexFiles.java">IndexFiles.java</a> creates an
+ <a href="http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexFiles.java">IndexFiles.java</a> creates an
index for all the files contained in a directory.</li>
<li>
- <a href="http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/src/demo/org/apache/lucene/demo/DeleteFiles.java">DeleteFiles.java</a> deletes some
-of these files from the index.</li>
-
-<li>
- <a href="http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/src/demo/org/apache/lucene/demo/SearchFiles.java">SearchFiles.java</a> prompts for
+ <a href="http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java">SearchFiles.java</a> prompts for
queries and searches an index.</li>
</ul>
To demonstrate these, try something like:
-<blockquote><tt>> <b>java -cp lucene.jar:lucene-demo.jar org.apache.lucene.demo.IndexFiles rec.food.recipes/soups</b></tt>
+<blockquote><tt>> <b>java -cp lucene.jar:lucene-demo.jar:lucene-analyzers-common.jar org.apache.lucene.demo.IndexFiles rec.food.recipes/soups</b></tt>
<br><tt>adding rec.food.recipes/soups/abalone-chowder</tt>
<br><tt> </tt>[ ... ]
-<p><tt>> <b>java -cp lucene.jar:lucene-demo.jar org.apache.lucene.demo.SearchFiles</b></tt>
+<p><tt>> <b>java -cp lucene.jar:lucene-demo.jar:lucene-analyzers-common.jar org.apache.lucene.demo.SearchFiles</b></tt>
<br><tt>Query: <b>chowder</b></tt>
<br><tt>Searching for: chowder</tt>
<br><tt>34 total matching documents</tt>
@@ -195,16 +187,5 @@ and the word "manhattan" ... ]
<br> [ Note: "+" and "-" are canonical, but "AND", "OR"
and "NOT" may be used. ]</blockquote>
-The <a href="http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/src/demo/org/apache/lucene/demo/IndexHTML.java">IndexHTML</a> demo is more sophisticated.
-It incrementally maintains an index of HTML files, adding new files as
-they appear, deleting old files as they disappear and re-indexing files
-as they change.
-<blockquote><tt>> <b>java -cp lucene.jar:lucene-demo.jar org.apache.lucene.demo.IndexHTML -create java/jdk1.1.6/docs/relnotes</b></tt>
-<br><tt>adding java/jdk1.1.6/docs/relnotes/SMICopyright.html</tt>
-<br><tt> </tt>[ ... create an index containing all the relnotes ]
-<p><tt>> <b>rm java/jdk1.1.6/docs/relnotes/smicopyright.html</b></tt>
-<p><tt>> <b>java -cp lucene.jar:lucene-demo.jar org.apache.lucene.demo.IndexHTML java/jdk1.1.6/docs/relnotes</b></tt>
-<br><tt>deleting java/jdk1.1.6/docs/relnotes/SMICopyright.html</tt></blockquote>
-
</body>
</html>
Modified: lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/demo.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/demo.xml?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/demo.xml (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/demo.xml Tue Mar 15 21:35:17 2011
@@ -18,26 +18,26 @@ It walks you through some basic installa
</section>
-<section id="About the Demos"><title>About the Demos</title>
+<section id="About the Demo"><title>About the Demo</title>
<p>
-The Lucene command-line demo code consists of two applications that demonstrate various
-functionalities of Lucene and how one should go about adding Lucene to their applications.
+The Lucene command-line demo code consists of an application that demonstrates various
+functionalities of Lucene and how you can add Lucene to your applications.
</p>
</section>
<section id="Setting your CLASSPATH"><title>Setting your CLASSPATH</title>
<p>
First, you should <a href="http://www.apache.org/dyn/closer.cgi/lucene/java/">download</a> the
-latest Lucene distribution and then extract it to a working directory. Alternatively, you can <a
-href="http://wiki.apache.org/lucene-java/SourceRepository">check out the sources from
-Subversion</a>, and then run <code>ant war-demo</code> to generate the JARs and WARs.
+latest Lucene distribution and then extract it to a working directory.
</p>
<p>
-You should see the Lucene JAR file in the directory you created when you extracted the archive. It
-should be named something like <code>lucene-core-{version}.jar</code>. You should also see a file
-called <code>lucene-demos-{version}.jar</code>. If you checked out the sources from Subversion then
-the JARs are located under the <code>build</code> subdirectory (after running <code>ant</code>
-successfully). Put both of these files in your Java CLASSPATH.
+You need three JARs: the Lucene JAR, the common analysis JAR, and the Lucene demo JAR. You should
+see the Lucene JAR file in the directory you created when you extracted the archive -- it
+should be named something like <code>lucene-core-{version}.jar</code>. You should also see files
+called <code>lucene-analyzers-common-{version}.jar</code> and <code>lucene-demo-{version}.jar</code>.
+</p>
+<p>
+Put all three of these files in your Java CLASSPATH.
</p>
</section>
@@ -47,7 +47,7 @@ Once you've gotten this far you're proba
you've set your CLASSPATH correctly, just type:
<pre>
- java org.apache.lucene.demo.IndexFiles {full-path-to-lucene}/src
+ java org.apache.lucene.demo.IndexFiles -docs {path-to-lucene}/src
</pre>
This will produce a subdirectory called <code>index</code> which will contain an index of all of the
Modified: lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/demo2.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/demo2.xml?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/demo2.xml (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/demo2.xml Tue Mar 15 21:35:17 2011
@@ -22,10 +22,15 @@ how to use Lucene in their applications.
<section id="Location of the source"><title>Location of the source</title>
<p>
-Relative to the directory created when you extracted Lucene or retrieved it from Subversion, you
-should see a directory called <code>src</code> which in turn contains a directory called
-<code>demo</code>. This is the root for all of the Lucene demos. Under this directory is
-<code>org/apache/lucene/demo</code>. This is where all the Java sources for the demos live.
+NOTE: to examine the sources, you need to download and extract a source checkout of
+Lucene: (lucene-{version}-src.zip).
+</p>
+
+<p>
+Relative to the directory created when you extracted Lucene, you
+should see a directory called <code>lucene/contrib/demo/</code>. This is the root for the Lucene
+demo. Under this directory is <code>src/java/org/apache/lucene/demo/</code>. This is where all
+the Java sources for the demo live.
</p>
<p>
@@ -39,62 +44,80 @@ Bring it up in <code>vi</code> or your e
<p>
As we discussed in the previous walk-through, the <a
-href="api/demo/org/apache/lucene/demo/IndexFiles.html">IndexFiles</a> class creates a Lucene
+href="api/contrib-demo/org/apache/lucene/demo/IndexFiles.html">IndexFiles</a> class creates a Lucene
Index. Let's take a look at how it does this.
</p>
<p>
-The first substantial thing the <code>main</code> function does is instantiate <a
-href="api/core/org/apache/lucene/index/IndexWriter.html">IndexWriter</a>. It passes the string
-"<code>index</code>" and a new instance of a class called <a
-href="api/core/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
-The "<code>index</code>" string is the name of the filesystem directory where all index information
-should be stored. Because we're not passing a full path, this will be created as a subdirectory of
-the current working directory (if it does not already exist). On some platforms, it may be created
-in other directories (such as the user's home directory).
+The <code>main()</code> method parses the command-line parameters, then in preparation for
+instantiating <a href="api/core/org/apache/lucene/index/IndexWriter.html">IndexWriter</a>, opens a
+<a href="api/core/org/apache/lucene/store/Directory.html">Directory</a> and instantiates
+<a href="api/module-analysis-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html"
+>StandardAnalyzer</a> and
+<a href="api/core/org/apache/lucene/index/IndexWriterConfig.html">IndexWriterConfig</a>.
</p>
<p>
-The <a href="api/core/org/apache/lucene/index/IndexWriter.html">IndexWriter</a> is the main
-class responsible for creating indices. To use it you must instantiate it with a path that it can
-write the index into. If this path does not exist it will first create it. Otherwise it will
-refresh the index at that path. You can also create an index using one of the subclasses of <a
-href="api/core/org/apache/lucene/store/Directory.html">Directory</a>. In any case, you must also pass an
-instance of <a
-href="api/core/org/apache/lucene/analysis/Analyzer.html">org.apache.lucene.analysis.Analyzer</a>.
+The value of the <code>-index</code> command-line parameter is the name of the filesystem directory
+where all index information should be stored. If <code>IndexFiles</code> is invoked with a
+relative path given in the <code>-index</code> command-line parameter, or if the <code>-index</code>
+command-line parameter is not given, causing the default relative index path "<code>index</code>"
+to be used, the index path will be created as a subdirectory of the current working directory
+(if it does not already exist). On some platforms, the index path may be created in a different
+directory (such as the user's home directory).
</p>
<p>
-The particular <a href="api/core/org/apache/lucene/analysis/Analyzer.html">Analyzer</a> we
-are using, <a
-href="api/core/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>, is
-little more than a standard Java Tokenizer, converting all strings to lowercase and filtering out
-stop words and characters from the index. By stop words and characters I mean common language
-words such as articles (a, an, the, etc.) and other strings that may have less value for searching
-(e.g. <b>'s</b>) . It should be noted that there are different rules for every language, and you
-should use the proper analyzer for each. Lucene currently provides Analyzers for a number of
-different languages (see the <code>*Analyzer.java</code> sources under <a
-href="http://svn.apache.org/repos/asf/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/">modules/analysis/common/src/java/org/apache/lucene/analysis</a>).
+The <code>-docs</code> command-line parameter value is the location of the directory containing
+files to be indexed.
+</p>
+
+<p>
+The <code>-update</code> command-line parameter tells <code>IndexFiles</code> not to delete the
+index if it already exists. When <code>-update</code> is not given, <code>IndexFiles</code> will
+first wipe the slate clean before indexing any documents.
</p>
<p>
-Looking further down in the file, you should see the <code>indexDocs()</code> code. This recursive
-function simply crawls the directories and uses <a
-href="api/demo/org/apache/lucene/demo/FileDocument.html">FileDocument</a> to create <a
-href="api/core/org/apache/lucene/document/Document.html">Document</a> objects. The <a
-href="api/core/org/apache/lucene/document/Document.html">Document</a> is simply a data object to
-represent the content in the file as well as its creation time and location. These instances are
-added to the <code>indexWriter</code>. Take a look inside <a
-href="api/demo/org/apache/lucene/demo/FileDocument.html">FileDocument</a>. It's not particularly
-complicated. It just adds fields to the <a
-href="api/core/org/apache/lucene/document/Document.html">Document</a>.
+Lucene <a href="api/core/org/apache/lucene/store/Directory.html">Directory</a>s are used by the
+<code>IndexWriter</code> to store information in the index. In addition to the
+<a href="api/core/org/apache/lucene/store/FSDirectory.html">FSDirectory</a> implementation we are using,
+there are several other <code>Directory</code> subclasses that can write to RAM, to databases, etc.
+</p>
+
+<p>
+Lucene <a href="api/core/org/apache/lucene/analysis/Analyzer.html">Analyzer</a>s are processing pipelines
+that break up text into indexed tokens, a.k.a. terms, and optionally perform other operations on these
+tokens, e.g. downcasing, synonym insertion, filtering out unwanted tokens, etc. The <code>Analyzer</code>
+we are using is <code>StandardAnalyzer</code>, which creates tokens using the Word Break rules from the
+Unicode Text Segmentation algorithm specified in <a href="http://unicode.org/reports/tr29/">Unicode
+Standard Annex #29</a>; converts tokens to lowercase; and then filters out stopwords. Stopwords are
+common language words such as articles (a, an, the, etc.) and other tokens that may have less value for
+searching. It should be noted that there are different rules for every language, and you should use the
+proper analyzer for each. Lucene currently provides Analyzers for a number of different languages (see
+the javadocs under
+<a href="api/module-analysis-common/org/apache/lucene/analysis/package-summary.html"
+>modules/analysis/common/src/java/org/apache/lucene/analysis</a>).
</p>
<p>
-As you can see there isn't much to creating an index. The devil is in the details. You may also
-wish to examine the other samples in this directory, particularly the <a
-href="api/demo/org/apache/lucene/demo/IndexHTML.html">IndexHTML</a> class. It is a bit more
-complex but builds upon this example.
+The <code>IndexWriterConfig</code> instance holds all configuration for <code>IndexWriter</code>. For
+example, we set the <code>OpenMode</code> to use here based on the value of the <code>-update</code>
+command-line parameter.
+</p>
+
+<p>
+Looking further down in the file, after <code>IndexWriter</code> is instantiated, you should see the
+<code>indexDocs()</code> code. This recursive function crawls the directories and creates
+<a href="api/core/org/apache/lucene/document/Document.html">Document</a> objects. The
+<code>Document</code> is simply a data object to represent the text content from the file as well as
+its creation time and location. These instances are added to the <code>IndexWriter</code>. If
+the <code>-update</code> command-line parameter is given, the <code>IndexWriter</code>
+<code>OpenMode</code> will be set to <code>OpenMode.CREATE_OR_APPEND</code>, and rather than
+adding documents to the index, the <code>IndexWriter</code> will <strong>update</strong> them
+in the index by attempting to find an already-indexed document with the same identifier (in our
+case, the file path serves as the identifier); deleting it from the index if it exists; and then
+adding the new document to the index.
</p>
</section>
@@ -102,43 +125,30 @@ complex but builds upon this example.
<section id="Searching Files"><title>Searching Files</title>
<p>
-The <a href="api/demo/org/apache/lucene/demo/SearchFiles.html">SearchFiles</a> class is
-quite simple. It primarily collaborates with an <a
-href="api/core/org/apache/lucene/search/IndexSearcher.html">IndexSearcher</a>, <a
-href="api/core/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>
-(which is used in the <a
-href="api/core/org/apache/lucene/demo/IndexFiles.html">IndexFiles</a> class as well) and a
-<a href="api/core/org/apache/lucene/queryParser/QueryParser.html">QueryParser</a>. The
+The <a href="api/contrib-demo/org/apache/lucene/demo/SearchFiles.html">SearchFiles</a> class is
+quite simple. It primarily collaborates with an
+<a href="api/core/org/apache/lucene/search/IndexSearcher.html">IndexSearcher</a>,
+<a href="api/modules-analysis-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html"
+>StandardAnalyzer</a> (which is used in the
+<a href="api/contrib-demo/org/apache/lucene/demo/IndexFiles.html">IndexFiles</a> class as well)
+and a <a href="api/core/org/apache/lucene/queryParser/QueryParser.html">QueryParser</a>. The
query parser is constructed with an analyzer used to interpret your query text in the same way the
-documents are interpreted: finding the end of words and removing useless words like 'a', 'an' and
-'the'. The <a href="api/core/org/apache/lucene/search/Query.html">Query</a> object contains
-the results from the <a
-href="api/core/org/apache/lucene/queryParser/QueryParser.html">QueryParser</a> which is passed to
-the searcher. Note that it's also possible to programmatically construct a rich <a
-href="api/core/org/apache/lucene/search/Query.html">Query</a> object without using the query
+documents are interpreted: finding word boundaries, downcasing, and removing useless words like
+'a', 'an' and 'the'. The <a href="api/core/org/apache/lucene/search/Query.html">Query</a>
+object contains the results from the
+<a href="api/core/org/apache/lucene/queryParser/QueryParser.html">QueryParser</a> which is passed
+to the searcher. Note that it's also possible to programmatically construct a rich
+<a href="api/core/org/apache/lucene/search/Query.html">Query</a> object without using the query
parser. The query parser just enables decoding the <a href="queryparsersyntax.html">Lucene query
-syntax</a> into the corresponding <a
-href="api/core/org/apache/lucene/search/Query.html">Query</a> object. Search can be executed in
-two different ways:
-<ul>
-<li>Streaming: A <a href="api/core/org/apache/lucene/search/Collector.html">Collector</a> subclass
-simply prints out the document ID and score for each matching document.</li>
-<li>Paging: Using a <a href="api/core/org/apache/lucene/search/TopScoreDocCollector.html">TopScoreDocCollector</a>
- the search results are printed in pages, sorted by score (i. e. relevance).</li>
-</ul>
+syntax</a> into the corresponding <a href="api/core/org/apache/lucene/search/Query.html">Query</a>
+object.
</p>
-</section>
-
-<section id="The Web example..."><title>The Web example...</title>
-
<p>
-<a href="demo3.html">read on>>></a>
+<code>SearchFiles</code> uses the <code>IndexSearcher.search(query,n)</code> method that returns
+<a href="api/core/org/apache/lucene/search/TopDocs.html">TopDocs</a> with max <code>n</code> hits.
+The results are printed in pages, sorted by score (i.e. relevance).
</p>
-
</section>
-
</body>
</document>
-
-
Modified: lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/fileformats.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/fileformats.xml?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/fileformats.xml (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/fileformats.xml Tue Mar 15 21:35:17 2011
@@ -87,6 +87,10 @@
uncompressed). See issue LUCENE-1960 for details.
</p>
+ <p>
+ In version 3.1, segments records the code version
+ that created them. See LUCENE-2720 for details.
+ </p>
</section>
<section id="Definitions"><title>Definitions</title>
@@ -928,8 +932,8 @@
written twice.
</p>
<p>
- <b>2.9</b>
- Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField,
+ <b>3.1</b>
+ Segments --> Format, Version, NameCounter, SegCount, <SegVersion, SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField,
NormGen<sup>NumField</sup>,
IsCompoundFile, DeletionCount, HasProx, Diagnostics><sup>SegCount</sup>, CommitUserData, Checksum
</p>
@@ -944,7 +948,7 @@
</p>
<p>
- SegName, DocStoreSegment --> String
+ SegVersion, SegName, DocStoreSegment --> String
</p>
<p>
@@ -974,6 +978,10 @@
</p>
<p>
+ SegVersion is the code version that created the segment.
+ </p>
+
+ <p>
SegName is the name of the segment, and is used as the file name prefix
for all of the files that compose the segment's index.
</p>
Modified: lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/gettingstarted.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/gettingstarted.xml?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/gettingstarted.xml (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/gettingstarted.xml Tue Mar 15 21:35:17 2011
@@ -14,7 +14,7 @@
<title>Getting Started</title>
<p>
This document is intended as a "getting started" guide. It has three audiences: first-time users
-looking to install Apache Lucene in their application or web server; developers looking to modify or base
+looking to install Apache Lucene in their application; developers looking to modify or base
the applications they develop on Lucene; and developers looking to become involved in and contribute
to the development of Lucene. This document is written in tutorial and walk-through format. The
goal is to help you "get started". It does not go into great depth on some of the conceptual or
@@ -33,20 +33,6 @@ may wish to skip sections.
<li><a href="demo2.html">About the sources and implementation for the command-line Lucene
demo</a>. This section walks through the implementation details (sources) of the
command-line Lucene demo. This section is intended for developers.</li> <p/>
-
- <li><a href="demo3.html">About installing and configuring the demo template web
- application</a>. While this walk-through assumes Tomcat as your container of choice,
- there is no reason you can't (provided you have the requisite knowledge) adapt the
- instructions to your container. This section is intended for those responsible for the
- development or deployment of Lucene-based web applications.</li> <p/>
-
- <li><a href="demo4.html">About the sources used to construct the demo template web
- application</a>. Please note the template application is designed to highlight features of
- Lucene and is <b>not</b> an example of best practices. (One would hopefully use MVC
- architecture such as provided by Jakarta Struts and taglibs, but showing you how to do that
- would be WAY beyond the scope of this guide.) This section is intended for developers and
- those wishing to customize the demo template web application to their needs. </li>
-
</ul>
</section>
Modified: lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml Tue Mar 15 21:35:17 2011
@@ -59,6 +59,12 @@
<p>The benchmark contribution contains tools for benchmarking Lucene using standard, freely available corpora.</p>
<p>See <a href="../api/contrib-benchmark/index.html">benchmark javadoc</a></p>
</section>
+
+ <section id="demo"><title>demo</title>
+ <p>The demo contrib contains the Lucene demo: IndexFiles and SearchFiles, described under
+ <a href="../gettingstarted.html">Getting Started</a>.</p>
+ <p>See <a href="../api/contrib-demo/index.html">demo javadoc</a></p>
+ </section>
<section id="db"><title>db</title>
<p>Provides integration with Berkley DB.</p>
Modified: lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/site.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/site.xml?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/site.xml (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/site/src/documentation/content/xdocs/site.xml Tue Mar 15 21:35:17 2011
@@ -50,12 +50,13 @@ See http://forrest.apache.org/docs/linki
<javadoc label="Javadocs">
<javadoc-all label="All" href="ext:javadocs-all"/>
<javadoc-core label="Core" href="ext:javadocs-core"/>
- <javadoc-demo label="Demo" href="ext:javadocs-demo"/>
+ <javadoc-test-framework label="Test Framework" href="ext:javadocs-test-framework"/>
<javadoc-contrib label="Contrib">
<javadoc-contrib-ant label="Ant" href="ext:javadocs-contrib-ant"/>
<javadoc-contrib-bdb label="Bdb" href="ext:javadocs-contrib-bdb"/>
<javadoc-contrib-bdb-je label="Bdb-je" href="ext:javadocs-contrib-bdb-je"/>
<javadoc-contrib-benchmark label="Benchmark" href="ext:javadocs-contrib-benchmark"/>
+ <javadoc-contrib-demo label="Demo" href="ext:javadocs-contrib-demo"/>
<javadoc-contrib-highlighter label="Highlighter" href="ext:javadocs-contrib-highlighter"/>
<javadoc-contrib-instantiated label="Instantiated" href="ext:javadocs-contrib-instantiated"/>
<javadoc-contrib-lucli label="Lucli" href="ext:javadocs-contrib-lucli"/>
@@ -98,11 +99,12 @@ See http://forrest.apache.org/docs/linki
<changes-contrib href="changes/Contrib-Changes.html" />
<javadocs-all href="api/all/index.html"/>
<javadocs-core href="api/core/index.html"/>
- <javadocs-demo href="api/demo/index.html"/>
+ <javadocs-test-framework href="api/test-framework/index.html"/>
<javadocs-contrib-ant href="api/contrib-ant/index.html"/>
<javadocs-contrib-bdb href="api/contrib-bdb/index.html"/>
<javadocs-contrib-bdb-je href="api/contrib-bdb-je/index.html"/>
<javadocs-contrib-benchmark href="api/contrib-benchmark/index.html"/>
+ <javadocs-contrib-demo href="api/contrib-demo/index.html"/>
<javadocs-contrib-highlighter href="api/contrib-highlighter/index.html"/>
<javadocs-contrib-instantiated href="api/contrib-instantiated/index.html"/>
<javadocs-contrib-lucli href="api/contrib-lucli/index.html"/>
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockAnalyzer.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockAnalyzer.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockAnalyzer.java Tue Mar 15 21:35:17 2011
@@ -36,6 +36,10 @@ public final class MockAnalyzer extends
private final boolean payload;
private int positionIncrementGap;
+ /**
+ * Calls {@link #MockAnalyzer(CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean, boolean)
+ * MockAnalyzer(runAutomaton, lowerCase, filter, enablePositionIncrements, true}).
+ */
public MockAnalyzer(CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
this(runAutomaton, lowerCase, filter, enablePositionIncrements, true);
}
@@ -47,7 +51,7 @@ public final class MockAnalyzer extends
* @param lowerCase true if the tokenizer should lowercase terms
* @param filter DFA describing how terms should be filtered (set of stopwords, etc)
* @param enablePositionIncrements true if position increments should reflect filtered terms.
- * @param payload if payloads should be added
+ * @param payload if payloads should be added containing the positions (for testing)
*/
public MockAnalyzer(CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter, boolean enablePositionIncrements, boolean payload) {
this.runAutomaton = runAutomaton;
@@ -58,21 +62,26 @@ public final class MockAnalyzer extends
}
/**
- * Creates a new MockAnalyzer, with no filtering.
- *
- * @param runAutomaton DFA describing how tokenization should happen (e.g. [a-zA-Z]+)
- * @param lowerCase true if the tokenizer should lowercase terms
+ * Calls {@link #MockAnalyzer(CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean, boolean)
+ * MockAnalyzer(runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false, true}).
*/
public MockAnalyzer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {
this(runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false, true);
}
+ /**
+ * Calls {@link #MockAnalyzer(CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean, boolean)
+ * MockAnalyzer(runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false, payload}).
+ */
public MockAnalyzer(CharacterRunAutomaton runAutomaton, boolean lowerCase, boolean payload) {
this(runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false, payload);
}
/**
- * Create a Whitespace-lowercasing analyzer with no stopwords removal
+ * Create a Whitespace-lowercasing analyzer with no stopwords removal.
+ * <p>
+ * Calls {@link #MockAnalyzer(CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean, boolean)
+ * MockAnalyzer(MockTokenizer.WHITESPACE, true, MockTokenFilter.EMPTY_STOPSET, false, true}).
*/
public MockAnalyzer() {
this(MockTokenizer.WHITESPACE, true);
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenFilter.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenFilter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenFilter.java Tue Mar 15 21:35:17 2011
@@ -60,6 +60,13 @@ public final class MockTokenFilter exten
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ /**
+ * Create a new MockTokenFilter.
+ *
+ * @param input TokenStream to filter
+ * @param filter DFA representing the terms that should be removed.
+ * @param enablePositionIncrements true if the removal should accumulate position increments.
+ */
public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
super(input);
this.filter = filter;
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java Tue Mar 15 21:35:17 2011
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
@@ -46,14 +46,14 @@ public class MockTokenizer extends CharT
private int state;
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
- super(Version.LUCENE_CURRENT, factory, input);
+ super(LuceneTestCase.TEST_VERSION_CURRENT, factory, input);
this.runAutomaton = runAutomaton;
this.lowerCase = lowerCase;
this.state = runAutomaton.getInitialState();
}
public MockTokenizer(Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
- super(Version.LUCENE_CURRENT, input);
+ super(LuceneTestCase.TEST_VERSION_CURRENT, input);
this.runAutomaton = runAutomaton;
this.lowerCase = lowerCase;
this.state = runAutomaton.getInitialState();
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java Tue Mar 15 21:35:17 2011
@@ -24,6 +24,7 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexWriter; // javadoc
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
@@ -91,6 +92,10 @@ public class RandomIndexWriter implement
}
}
+ /**
+ * Adds a Document.
+ * @see IndexWriter#addDocument(Document)
+ */
public void addDocument(Document doc) throws IOException {
w.addDocument(doc);
if (docCount++ == flushAt) {
@@ -102,6 +107,21 @@ public class RandomIndexWriter implement
}
}
+ /**
+ * Updates a document.
+ * @see IndexWriter#updateDocument(Term, Document)
+ */
+ public void updateDocument(Term t, Document doc) throws IOException {
+ w.updateDocument(t, doc);
+ if (docCount++ == flushAt) {
+ if (LuceneTestCase.VERBOSE) {
+ System.out.println("RIW.updateDocument: now doing a commit");
+ }
+ w.commit();
+ flushAt += _TestUtil.nextInt(r, 10, 1000);
+ }
+ }
+
public void addIndexes(Directory... dirs) throws CorruptIndexException, IOException {
w.addIndexes(dirs);
}
@@ -127,17 +147,21 @@ public class RandomIndexWriter implement
}
public IndexReader getReader() throws IOException {
+ return getReader(true);
+ }
+
+ public IndexReader getReader(boolean applyDeletions) throws IOException {
getReaderCalled = true;
if (r.nextInt(4) == 2)
w.optimize();
// If we are writing with PreFlexRW, force a full
// IndexReader.open so terms are sorted in codepoint
// order during searching:
- if (!w.codecs.getDefaultFieldCodec().equals("PreFlex") && r.nextBoolean()) {
+ if (!applyDeletions || !w.codecs.getDefaultFieldCodec().equals("PreFlex") && r.nextBoolean()) {
if (LuceneTestCase.VERBOSE) {
System.out.println("RIW.getReader: use NRT reader");
}
- return w.getReader();
+ return w.getReader(applyDeletions);
} else {
if (LuceneTestCase.VERBOSE) {
System.out.println("RIW.getReader: open new reader");
@@ -147,6 +171,10 @@ public class RandomIndexWriter implement
}
}
+ /**
+ * Close this writer.
+ * @see IndexWriter#close()
+ */
public void close() throws IOException {
// if someone isn't using getReader() API, we want to be sure to
// maybeOptimize since presumably they might open a reader on the dir.
@@ -156,6 +184,13 @@ public class RandomIndexWriter implement
w.close();
}
+ /**
+ * Forces an optimize.
+ * <p>
+ * NOTE: this should be avoided in tests unless absolutely necessary,
+ * as it will result in less test coverage.
+ * @see IndexWriter#optimize()
+ */
public void optimize() throws IOException {
w.optimize();
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java Tue Mar 15 21:35:17 2011
@@ -130,7 +130,7 @@ public class MockFixedIntBlockCodec exte
success = false;
try {
- FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+ FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -174,7 +174,6 @@ public class MockFixedIntBlockCodec exte
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java Tue Mar 15 21:35:17 2011
@@ -150,7 +150,7 @@ public class MockVariableIntBlockCodec e
success = false;
try {
- FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+ FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -195,7 +195,6 @@ public class MockVariableIntBlockCodec e
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java Tue Mar 15 21:35:17 2011
@@ -205,7 +205,7 @@ public class MockRandomCodec extends Cod
success = false;
try {
- FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+ FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -306,7 +306,6 @@ public class MockRandomCodec extends Cod
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUnicodeComparator(),
termsCacheSize,
state.codecId);
success = true;
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java Tue Mar 15 21:35:17 2011
@@ -70,7 +70,7 @@ public class MockSepCodec extends Codec
success = false;
try {
- FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+ FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -114,7 +114,6 @@ public class MockSepCodec extends Codec
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java Tue Mar 15 21:35:17 2011
@@ -27,7 +27,7 @@ import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.index.BulkPostingsEnum;
/** Reads IndexInputs written with {@link
- * SingleIntIndexOutput}. NOTE: this class is just for
+ * MockSingleIntIndexOutput}. NOTE: this class is just for
* demonstration puprposes (it is a very slow way to read a
* block of ints).
*
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java Tue Mar 15 21:35:17 2011
@@ -39,8 +39,8 @@ public class CheckHits {
/**
* Tests that all documents up to maxDoc which are *not* in the
- * expected result set, have an explanation which indicates no match
- * (ie: Explanation value of 0.0f)
+ * expected result set, have an explanation which indicates that
+ * the document does not match
*/
public static void checkNoMatchExplanations(Query q, String defaultFieldName,
IndexSearcher searcher, int[] results)
@@ -59,9 +59,9 @@ public class CheckHits {
Explanation exp = searcher.explain(q, doc);
Assert.assertNotNull("Explanation of [["+d+"]] for #"+doc+" is null",
exp);
- Assert.assertEquals("Explanation of [["+d+"]] for #"+doc+
- " doesn't indicate non-match: " + exp.toString(),
- 0.0f, exp.getValue(), 0.0f);
+ Assert.assertFalse("Explanation of [["+d+"]] for #"+doc+
+ " doesn't indicate non-match: " + exp.toString(),
+ exp.isMatch());
}
}
@@ -268,7 +268,7 @@ public class CheckHits {
* query corresponds with the true score.
*
* @see ExplanationAsserter
- * @see #checkExplanations(Query, String, Searcher, boolean) for a
+ * @see #checkExplanations(Query, String, IndexSearcher, boolean) for a
* "deep" testing of the explanation details.
*
* @param query the query to test
@@ -484,6 +484,9 @@ public class CheckHits {
Assert.assertNotNull("Explanation of [["+d+"]] for #"+doc+" is null", exp);
verifyExplanation(d,doc,scorer.score(),deep,exp);
+ Assert.assertTrue("Explanation of [["+d+"]] for #"+ doc +
+ " does not indicate match: " + exp.toString(),
+ exp.isMatch());
}
@Override
public void setNextReader(AtomicReaderContext context) {
Modified: lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/search/QueryUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/search/QueryUtils.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/search/QueryUtils.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/search/QueryUtils.java Tue Mar 15 21:35:17 2011
@@ -2,14 +2,13 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.Random;
-import java.lang.reflect.Method;
import junit.framework.Assert;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiReader;
@@ -19,6 +18,7 @@ import org.apache.lucene.store.MockDirec
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.ReaderUtil;
+import org.apache.lucene.util._TestUtil;
import static org.apache.lucene.util.LuceneTestCase.TEST_VERSION_CURRENT;
@@ -172,16 +172,7 @@ public class QueryUtils {
}
w.commit();
w.deleteDocuments( new MatchAllDocsQuery() );
- try {
- // Carefully invoke what is a package-private (test
- // only, internal) method on IndexWriter:
- Method m = IndexWriter.class.getDeclaredMethod("keepFullyDeletedSegments");
- m.setAccessible(true);
- m.invoke(w);
- } catch (Exception e) {
- // Should not happen?
- throw new RuntimeException(e);
- }
+ _TestUtil.keepFullyDeletedSegments(w);
w.commit();
if (0 < numDeletedDocs)