You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/09/23 02:52:36 UTC
svn commit: r1388935 [2/2] - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/util/fst/
core/src/test/org/apache/lucene/util/fst/
misc/src/java/org/apache/lucene/util/
misc/src/java/org/apache/lucene/util/fst/ misc/src/test/org/apache/lu...
Added: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java?rev=1388935&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java (added)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java Sun Sep 23 00:52:36 2012
@@ -0,0 +1,832 @@
+package org.apache.lucene.util.fst;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
/** Helper class to test FSTs. */
public class FSTTester<T> {

  // Source of all randomness, so runs are reproducible from the seed.
  final Random random;
  // Input/output pairs the FST must accept.  Must be sorted by input
  // (Collections.binarySearch is run against this list during verification).
  final List<InputOutput<T>> pairs;
  // 0: inputs are UTF-8 byte labels (BYTE1); otherwise UTF-32 code points (BYTE4).
  final int inputMode;
  // Output algebra under test (add/common/NO_OUTPUT).
  final Outputs<T> outputs;
  // Scratch directory used to round-trip the FST through save/load.
  final Directory dir;
  // If true, outputs are Longs and Util.getByOutput is also verified
  // (fst is cast to FST<Long> in verifyUnPruned).
  final boolean doReverseLookup;

  public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs, boolean doReverseLookup) {
    this.random = random;
    this.dir = dir;
    this.inputMode = inputMode;
    this.pairs = pairs;
    this.outputs = outputs;
    this.doReverseLookup = doReverseLookup;
  }
+
+ static String inputToString(int inputMode, IntsRef term) {
+ return inputToString(inputMode, term, true);
+ }
+
+ static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
+ if (!isValidUnicode) {
+ return term.toString();
+ } else if (inputMode == 0) {
+ // utf8
+ return toBytesRef(term).utf8ToString() + " " + term;
+ } else {
+ // utf32
+ return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
+ }
+ }
+
+ private static BytesRef toBytesRef(IntsRef ir) {
+ BytesRef br = new BytesRef(ir.length);
+ for(int i=0;i<ir.length;i++) {
+ int x = ir.ints[ir.offset+i];
+ assert x >= 0 && x <= 255;
+ br.bytes[i] = (byte) x;
+ }
+ br.length = ir.length;
+ return br;
+ }
+
+ static String getRandomString(Random random) {
+ final String term;
+ if (random.nextBoolean()) {
+ term = _TestUtil.randomRealisticUnicodeString(random);
+ } else {
+ // we want to mix in limited-alphabet symbols so
+ // we get more sharing of the nodes given how few
+ // terms we are testing...
+ term = simpleRandomString(random);
+ }
+ return term;
+ }
+
+ static String simpleRandomString(Random r) {
+ final int end = r.nextInt(10);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ final char[] buffer = new char[end];
+ for (int i = 0; i < end; i++) {
+ buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
+ }
+ return new String(buffer, 0, end);
+ }
+
+ static IntsRef toIntsRef(String s, int inputMode) {
+ return toIntsRef(s, inputMode, new IntsRef(10));
+ }
+
+ static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
+ if (inputMode == 0) {
+ // utf8
+ return toIntsRef(new BytesRef(s), ir);
+ } else {
+ // utf32
+ return toIntsRefUTF32(s, ir);
+ }
+ }
+
+ static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
+ final int charLength = s.length();
+ int charIdx = 0;
+ int intIdx = 0;
+ while(charIdx < charLength) {
+ if (intIdx == ir.ints.length) {
+ ir.grow(intIdx+1);
+ }
+ final int utf32 = s.codePointAt(charIdx);
+ ir.ints[intIdx] = utf32;
+ charIdx += Character.charCount(utf32);
+ intIdx++;
+ }
+ ir.length = intIdx;
+ return ir;
+ }
+
+ static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
+ if (br.length > ir.ints.length) {
+ ir.grow(br.length);
+ }
+ for(int i=0;i<br.length;i++) {
+ ir.ints[i] = br.bytes[br.offset+i]&0xFF;
+ }
+ ir.length = br.length;
+ return ir;
+ }
+
+ /** Holds one input/output pair. */
+ public static class InputOutput<T> implements Comparable<InputOutput<T>> {
+ public final IntsRef input;
+ public final T output;
+
+ public InputOutput(IntsRef input, T output) {
+ this.input = input;
+ this.output = output;
+ }
+
+ public int compareTo(InputOutput<T> other) {
+ if (other instanceof InputOutput) {
+ return input.compareTo((other).input);
+ } else {
+ throw new IllegalArgumentException();
+ }
+ }
+ }
+
+ public void doTest(boolean testPruning) throws IOException {
+ // no pruning
+ doTest(0, 0, true);
+
+ if (testPruning) {
+ // simple pruning
+ doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
+
+ // leafy pruning
+ doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
+ }
+ }
+
  // runs the term, returning the output, or null if term
  // isn't accepted. if prefixLength is non-null it must be
  // length 1 int array; prefixLength[0] is set to the length
  // of the term prefix that matches
  private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException {
    assert prefixLength == null || prefixLength.length == 1;
    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
    final T NO_OUTPUT = fst.outputs.getNoOutput();
    T output = NO_OUTPUT;
    final FST.BytesReader fstReader = fst.getBytesReader(0);

    // walk one label per iteration; the extra i == term.length step
    // matches the virtual end-of-input label to test acceptance
    for(int i=0;i<=term.length;i++) {
      final int label;
      if (i == term.length) {
        label = FST.END_LABEL;
      } else {
        label = term.ints[term.offset+i];
      }
      // System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
      if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
        // System.out.println(" not found");
        if (prefixLength != null) {
          // caller asked for prefix mode: report how far we matched and
          // return the output accumulated over that prefix
          prefixLength[0] = i;
          return output;
        } else {
          return null;
        }
      }
      // accumulate this arc's output onto the running output
      output = fst.outputs.add(output, arc.output);
    }

    if (prefixLength != null) {
      prefixLength[0] = term.length;
    }

    return output;
  }
+
  // Does a random walk from the start node to a final node, filling
  // "in" with the labels along the way, and returns the accumulated
  // output of the accepted word.
  private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException {
    FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());

    final List<FST.Arc<T>> arcs = new ArrayList<FST.Arc<T>>();
    in.length = 0;
    in.offset = 0;
    final T NO_OUTPUT = fst.outputs.getNoOutput();
    T output = NO_OUTPUT;
    final FST.BytesReader fstReader = fst.getBytesReader(0);

    while(true) {
      // read all arcs:
      fst.readFirstTargetArc(arc, arc, fstReader);
      arcs.add(new FST.Arc<T>().copyFrom(arc));
      while(!arc.isLast()) {
        fst.readNextArc(arc, fstReader);
        arcs.add(new FST.Arc<T>().copyFrom(arc));
      }

      // pick one
      arc = arcs.get(random.nextInt(arcs.size()));
      arcs.clear();

      // accumulate output
      output = fst.outputs.add(output, arc.output);

      // append label
      if (arc.label == FST.END_LABEL) {
        // reached a final state: the word collected so far is accepted
        break;
      }

      if (in.ints.length == in.length) {
        in.grow(1+in.length);
      }
      in.ints[in.length++] = arc.label;
    }

    return output;
  }
+
+
  // Builds the FST from pairs with the given pruning thresholds, optionally
  // round-trips it through save/load, verifies it (un-pruned or pruned
  // checks depending on the thresholds), optionally packs and re-verifies,
  // and returns the (possibly reloaded) FST; null if fully pruned away.
  FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
    if (LuceneTestCase.VERBOSE) {
      System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
    }

    final boolean willRewrite = random.nextBoolean();

    final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
                                              prune1, prune2,
                                              prune1==0 && prune2==0,
                                              allowRandomSuffixSharing ? random.nextBoolean() : true,
                                              allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
                                              outputs,
                                              null,
                                              willRewrite);

    for(InputOutput<T> pair : pairs) {
      if (pair.output instanceof List) {
        // multiple outputs for one input: add each value separately
        @SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output;
        @SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
        for(Long value : longValues) {
          builderObject.add(pair.input, value);
        }
      } else {
        builder.add(pair.input, pair.output);
      }
    }
    FST<T> fst = builder.finish();

    // randomly exercise save/load round-trip (not valid for willRewrite,
    // since packing happens below on the in-memory FST)
    if (random.nextBoolean() && fst != null && !willRewrite) {
      IOContext context = LuceneTestCase.newIOContext(random);
      IndexOutput out = dir.createOutput("fst.bin", context);
      fst.save(out);
      out.close();
      IndexInput in = dir.openInput("fst.bin", context);
      try {
        fst = new FST<T>(in, outputs);
      } finally {
        in.close();
        dir.deleteFile("fst.bin");
      }
    }

    if (LuceneTestCase.VERBOSE && pairs.size() <= 20 && fst != null) {
      Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
      Util.toDot(fst, w, false, false);
      w.close();
      System.out.println("SAVED out.dot");
    }

    if (LuceneTestCase.VERBOSE) {
      if (fst == null) {
        System.out.println(" fst has 0 nodes (fully pruned)");
      } else {
        System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs");
      }
    }

    if (prune1 == 0 && prune2 == 0) {
      verifyUnPruned(inputMode, fst);
    } else {
      verifyPruned(inputMode, fst, prune1, prune2);
    }

    // if requested, pack the FST and verify the packed form too
    if (willRewrite && fst != null) {
      if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: now rewrite");
      }
      final FST<T> packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat());
      if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: now verify packed FST");
      }
      if (prune1 == 0 && prune2 == 0) {
        verifyUnPruned(inputMode, packed);
      } else {
        verifyPruned(inputMode, packed, prune1, prune2);
      }
    }

    return fst;
  }
+
  /** Hook for subclasses to customize output equality used by the assertions. */
  protected boolean outputsEqual(T a, T b) {
    return a.equals(b);
  }
+
  // FST is complete: every pair must be accepted with its exact output.
  // Verifies direct lookup, enum iteration, random accepted walks,
  // reverse (output -> input) lookup when enabled, seekExact/Floor/Ceil,
  // and mixed next/seek sequences.
  private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {

    final FST<Long> fstLong;
    final Set<Long> validOutputs;
    long minLong = Long.MAX_VALUE;
    long maxLong = Long.MIN_VALUE;

    if (doReverseLookup) {
      // reverse lookup requires Long outputs; collect their range and set
      @SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
      fstLong = fstLong0;
      validOutputs = new HashSet<Long>();
      for(InputOutput<T> pair: pairs) {
        Long output = (Long) pair.output;
        maxLong = Math.max(maxLong, output);
        minLong = Math.min(minLong, output);
        validOutputs.add(output);
      }
    } else {
      fstLong = null;
      validOutputs = null;
    }

    if (pairs.size() == 0) {
      // no inputs -> builder must have produced no FST at all
      assertNull(fst);
      return;
    }

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: now verify " + pairs.size() + " terms");
      for(InputOutput<T> pair : pairs) {
        assertNotNull(pair);
        assertNotNull(pair.input);
        assertNotNull(pair.output);
        System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
      }
    }

    assertNotNull(fst);

    // visit valid pairs in order -- make sure all words
    // are accepted, and FSTEnum's next() steps through
    // them correctly
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: check valid terms/next()");
    }
    {
      IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
      for(InputOutput<T> pair : pairs) {
        IntsRef term = pair.input;
        if (LuceneTestCase.VERBOSE) {
          System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
        }
        T output = run(fst, term, null);
        assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
        assertTrue(outputsEqual(pair.output, output));

        // verify enum's next
        IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
        assertNotNull(t);
        assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
        assertTrue(outputsEqual(pair.output, t.output));
      }
      assertNull(fstEnum.next());
    }

    final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>();
    for(InputOutput<T> pair : pairs) {
      termsMap.put(pair.input, pair.output);
    }

    if (doReverseLookup && maxLong > minLong) {
      // Do random lookups so we test null (output doesn't
      // exist) case:
      assertNull(Util.getByOutput(fstLong, minLong-7));
      assertNull(Util.getByOutput(fstLong, maxLong+7));

      final int num = LuceneTestCase.atLeast(random, 100);
      for(int iter=0;iter<num;iter++) {
        Long v = _TestUtil.nextLong(random, minLong, maxLong);
        IntsRef input = Util.getByOutput(fstLong, v);
        // either v is a real output, or the lookup must miss
        assertTrue(validOutputs.contains(v) || input == null);
      }
    }

    // find random matching word and make sure it's valid
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: verify random accepted terms");
    }
    final IntsRef scratch = new IntsRef(10);
    int num = LuceneTestCase.atLeast(random, 500);
    for(int iter=0;iter<num;iter++) {
      T output = randomAcceptedWord(fst, scratch);
      assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
      assertTrue(outputsEqual(termsMap.get(scratch), output));

      if (doReverseLookup) {
        //System.out.println("lookup output=" + output + " outs=" + fst.outputs);
        IntsRef input = Util.getByOutput(fstLong, (Long) output);
        assertNotNull(input);
        //System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
        assertEquals(scratch, input);
      }
    }

    // test IntsRefFSTEnum.seek:
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: verify seek");
    }
    IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
    num = LuceneTestCase.atLeast(random, 100);
    for(int iter=0;iter<num;iter++) {
      if (LuceneTestCase.VERBOSE) {
        System.out.println(" iter=" + iter);
      }
      if (random.nextBoolean()) {
        // seek to term that doesn't exist:
        while(true) {
          final IntsRef term = toIntsRef(getRandomString(random), inputMode);
          int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
          if (pos < 0) {
            // binarySearch encodes the insertion point as -(pos+1)
            pos = -(pos+1);
            // ok doesn't exist
            //System.out.println(" seek " + inputToString(inputMode, term));
            final IntsRefFSTEnum.InputOutput<T> seekResult;
            if (random.nextInt(3) == 0) {
              if (LuceneTestCase.VERBOSE) {
                System.out.println(" do non-exist seekExact term=" + inputToString(inputMode, term));
              }
              seekResult = fstEnum.seekExact(term);
              // exact seek of a missing term must return null
              pos = -1;
            } else if (random.nextBoolean()) {
              if (LuceneTestCase.VERBOSE) {
                System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term));
              }
              seekResult = fstEnum.seekFloor(term);
              // floor lands on the entry just before the insertion point
              pos--;
            } else {
              if (LuceneTestCase.VERBOSE) {
                System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term));
              }
              seekResult = fstEnum.seekCeil(term);
            }

            if (pos != -1 && pos < pairs.size()) {
              //System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
              assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
              if (LuceneTestCase.VERBOSE) {
                System.out.println(" got " + inputToString(inputMode, seekResult.input));
              }
              assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
              assertTrue(outputsEqual(pairs.get(pos).output, seekResult.output));
            } else {
              // seeked before start or beyond end
              //System.out.println("seek=" + seekTerm);
              assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
              if (LuceneTestCase.VERBOSE) {
                System.out.println(" got null");
              }
            }

            break;
          }
        }
      } else {
        // seek to term that does exist:
        InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
        final IntsRefFSTEnum.InputOutput<T> seekResult;
        if (random.nextInt(3) == 2) {
          if (LuceneTestCase.VERBOSE) {
            System.out.println(" do exists seekExact term=" + inputToString(inputMode, pair.input));
          }
          seekResult = fstEnum.seekExact(pair.input);
        } else if (random.nextBoolean()) {
          if (LuceneTestCase.VERBOSE) {
            System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input));
          }
          seekResult = fstEnum.seekFloor(pair.input);
        } else {
          if (LuceneTestCase.VERBOSE) {
            System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input));
          }
          seekResult = fstEnum.seekCeil(pair.input);
        }
        assertNotNull(seekResult);
        assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
        assertTrue(outputsEqual(pair.output, seekResult.output));
      }
    }

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: mixed next/seek");
    }

    // test mixed next/seek
    num = LuceneTestCase.atLeast(random, 100);
    for(int iter=0;iter<num;iter++) {
      if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: iter " + iter);
      }
      // reset:
      fstEnum = new IntsRefFSTEnum<T>(fst);
      // upto tracks where the enum should be positioned in pairs; -1 = before start
      int upto = -1;
      while(true) {
        boolean isDone = false;
        if (upto == pairs.size()-1 || random.nextBoolean()) {
          // next
          upto++;
          if (LuceneTestCase.VERBOSE) {
            System.out.println(" do next");
          }
          isDone = fstEnum.next() == null;
        } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
          // forward seek to a term that does NOT exist (try up to 10 random terms)
          int attempt = 0;
          for(;attempt<10;attempt++) {
            IntsRef term = toIntsRef(getRandomString(random), inputMode);
            if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
              int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
              assert pos < 0;
              upto = -(pos+1);

              if (random.nextBoolean()) {
                upto--;
                assertTrue(upto != -1);
                if (LuceneTestCase.VERBOSE) {
                  System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
                }
                isDone = fstEnum.seekFloor(term) == null;
              } else {
                if (LuceneTestCase.VERBOSE) {
                  System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
                }
                isDone = fstEnum.seekCeil(term) == null;
              }

              break;
            }
          }
          if (attempt == 10) {
            // couldn't find a suitable missing term; retry the loop
            continue;
          }

        } else {
          // forward seek to an existing term at or after the current position
          final int inc = random.nextInt(pairs.size() - upto - 1);
          upto += inc;
          if (upto == -1) {
            upto = 0;
          }

          if (random.nextBoolean()) {
            if (LuceneTestCase.VERBOSE) {
              System.out.println(" do seekCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
            }
            isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
          } else {
            if (LuceneTestCase.VERBOSE) {
              System.out.println(" do seekFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
            }
            isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
          }
        }
        if (LuceneTestCase.VERBOSE) {
          if (!isDone) {
            System.out.println(" got " + inputToString(inputMode, fstEnum.current().input));
          } else {
            System.out.println(" got null");
          }
        }

        if (upto == pairs.size()) {
          assertTrue(isDone);
          break;
        } else {
          assertFalse(isDone);
          assertEquals(pairs.get(upto).input, fstEnum.current().input);
          assertTrue(outputsEqual(pairs.get(upto).output, fstEnum.current().output));

          /*
          if (upto < pairs.size()-1) {
          int tryCount = 0;
          while(tryCount < 10) {
          final IntsRef t = toIntsRef(getRandomString(), inputMode);
          if (pairs.get(upto).input.compareTo(t) < 0) {
          final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
          if (LuceneTestCase.VERBOSE) {
          System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
          }
          assertEquals(expected, fstEnum.beforeNext(t));
          break;
          }
          tryCount++;
          }
          }
          */
        }
      }
    }
  }
+
  // Per-prefix statistics used by verifyPruned to brute-force simulate
  // the builder's pruning over all term prefixes.
  private static class CountMinOutput<T> {
    int count;              // how many input terms share this prefix
    T output;               // "common" (shared) output of all terms with this prefix
    T finalOutput;          // output when this prefix is itself an input term
    boolean isLeaf = true;  // cleared when a kept prefix extends this one
    boolean isFinal;        // true when this prefix is exactly one of the input terms
  }
+
  // FST is pruned: brute-force recompute the expected surviving prefix set
  // and assert the FST matches it exactly.
  private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
      for(InputOutput<T> pair : pairs) {
        System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
      }
    }

    // To validate the FST, we brute-force compute all prefixes
    // in the terms, matched to their "common" outputs, prune that
    // set according to the prune thresholds, then assert the FST
    // matches that same set.

    // NOTE: Crazy RAM intensive!!

    //System.out.println("TEST: tally prefixes");

    // build all prefixes
    final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>();
    final IntsRef scratch = new IntsRef(10);
    for(InputOutput<T> pair: pairs) {
      scratch.copyInts(pair.input);
      // every prefix of the term, including the empty prefix and the full term
      for(int idx=0;idx<=pair.input.length;idx++) {
        scratch.length = idx;
        CountMinOutput<T> cmo = prefixes.get(scratch);
        if (cmo == null) {
          cmo = new CountMinOutput<T>();
          cmo.count = 1;
          cmo.output = pair.output;
          prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
        } else {
          cmo.count++;
          // canonicalize NO_OUTPUT-equal values to the singleton before common()
          T output1 = cmo.output;
          if (output1.equals(outputs.getNoOutput())) {
            output1 = outputs.getNoOutput();
          }
          T output2 = pair.output;
          if (output2.equals(outputs.getNoOutput())) {
            output2 = outputs.getNoOutput();
          }
          cmo.output = outputs.common(output1, output2);
        }
        if (idx == pair.input.length) {
          cmo.isFinal = true;
          cmo.finalOutput = cmo.output;
        }
      }
    }

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: now prune");
    }

    // prune 'em
    final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator();
    while(it.hasNext()) {
      Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next();
      final IntsRef prefix = ent.getKey();
      final CountMinOutput<T> cmo = ent.getValue();
      if (LuceneTestCase.VERBOSE) {
        System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
      }
      final boolean keep;
      if (prune1 > 0) {
        keep = cmo.count >= prune1;
      } else {
        assert prune2 > 0;
        if (prune2 > 1 && cmo.count >= prune2) {
          keep = true;
        } else if (prefix.length > 0) {
          // consult our parent
          scratch.length = prefix.length-1;
          System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length);
          final CountMinOutput<T> cmo2 = prefixes.get(scratch);
          //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
          keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
        } else if (cmo.count >= prune2) {
          keep = true;
        } else {
          keep = false;
        }
      }

      if (!keep) {
        it.remove();
        //System.out.println(" remove");
      } else {
        // clear isLeaf for all ancestors
        //System.out.println(" keep");
        scratch.copyInts(prefix);
        scratch.length--;
        while(scratch.length >= 0) {
          final CountMinOutput<T> cmo2 = prefixes.get(scratch);
          if (cmo2 != null) {
            //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
            cmo2.isLeaf = false;
          }
          scratch.length--;
        }
      }
    }

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: after prune");
      for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
        System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
        if (ent.getValue().isFinal) {
          System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
        }
      }
    }

    // only the (always present) empty prefix survived: FST must be fully pruned
    if (prefixes.size() <= 1) {
      assertNull(fst);
      return;
    }

    assertNotNull(fst);

    // make sure FST only enums valid prefixes
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: check pruned enum");
    }
    IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
    IntsRefFSTEnum.InputOutput<T> current;
    while((current = fstEnum.next()) != null) {
      if (LuceneTestCase.VERBOSE) {
        System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
      }
      final CountMinOutput<T> cmo = prefixes.get(current.input);
      assertNotNull(cmo);
      assertTrue(cmo.isLeaf || cmo.isFinal);
      //if (cmo.isFinal && !cmo.isLeaf) {
      if (cmo.isFinal) {
        assertEquals(cmo.finalOutput, current.output);
      } else {
        assertEquals(cmo.output, current.output);
      }
    }

    // make sure all non-pruned prefixes are present in the FST
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: verify all prefixes");
    }
    final int[] stopNode = new int[1];
    for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
      if (ent.getKey().length > 0) {
        final CountMinOutput<T> cmo = ent.getValue();
        final T output = run(fst, ent.getKey(), stopNode);
        if (LuceneTestCase.VERBOSE) {
          System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
        }
        // if (cmo.isFinal && !cmo.isLeaf) {
        if (cmo.isFinal) {
          assertEquals(cmo.finalOutput, output);
        } else {
          assertEquals(cmo.output, output);
        }
        // the whole prefix must have matched
        assertEquals(ent.getKey().length, stopNode[0]);
      }
    }
  }
}
Added: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/fst/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/fst/package.html?rev=1388935&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/fst/package.html (added)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/fst/package.html Sun Sep 23 00:52:36 2012
@@ -0,0 +1,25 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Support for FST testing.
+</body>
+</html>