You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/01/16 00:42:03 UTC
svn commit: r1231796 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/
lucene/src/java/org/apache/lucene/util/fst/
lucene/src/test/org/apache/lucene/index/ lucene/src/test/org/a...
Author: mikemccand
Date: Sun Jan 15 23:42:02 2012
New Revision: 1231796
URL: http://svn.apache.org/viewvc?rev=1231796&view=rev
Log:
LUCENE-3695: move some confusing FST sugar out
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Sun Jan 15 23:42:02 2012
@@ -47,6 +47,9 @@ Changes in backwards compatibility polic
has been removed and replaced with the experimental getFieldInfos
API. All IndexReader subclasses must implement getFieldInfos.
(Mike McCandless)
+
+* LUCENE-3695: Move confusing add(X) methods out of FST.Builder into
+ FST.Util. (Robert Muir, Mike McCandless)
Security fixes
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java Sun Jan 15 23:42:02 2012
@@ -33,9 +33,11 @@ import org.apache.lucene.store.ByteArray
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Util;
/**
* A map of synonyms, keys and values are phrases.
@@ -263,6 +265,8 @@ public class SynonymMap {
CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
+ final IntsRef scratchIntsRef = new IntsRef();
+
//System.out.println("fmap.build");
for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
CharsRef input = sortedKeys[keyIdx];
@@ -307,7 +311,7 @@ public class SynonymMap {
scratch.length = scratchOutput.getPosition() - scratch.offset;
//System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
- builder.add(input, BytesRef.deepCopyOf(scratch));
+ builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
}
FST<BytesRef> fst = builder.finish();
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java Sun Jan 15 23:42:02 2012
@@ -19,7 +19,6 @@ package org.apache.lucene.util.fst;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
@@ -272,54 +271,6 @@ public class Builder<T> {
}
}
- private final IntsRef scratchIntsRef = new IntsRef(10);
-
- public void add(BytesRef input, T output) throws IOException {
- assert fst.getInputType() == FST.INPUT_TYPE.BYTE1;
- scratchIntsRef.grow(input.length);
- for(int i=0;i<input.length;i++) {
- scratchIntsRef.ints[i] = input.bytes[i+input.offset] & 0xFF;
- }
- scratchIntsRef.length = input.length;
- add(scratchIntsRef, output);
- }
-
- /** Sugar: adds the UTF32 codepoints from char[] slice. FST
- * must be FST.INPUT_TYPE.BYTE4! */
- public void add(char[] s, int offset, int length, T output) throws IOException {
- assert fst.getInputType() == FST.INPUT_TYPE.BYTE4;
- int charIdx = offset;
- int intIdx = 0;
- final int charLimit = offset + length;
- while(charIdx < charLimit) {
- scratchIntsRef.grow(intIdx+1);
- final int utf32 = Character.codePointAt(s, charIdx);
- scratchIntsRef.ints[intIdx] = utf32;
- charIdx += Character.charCount(utf32);
- intIdx++;
- }
- scratchIntsRef.length = intIdx;
- add(scratchIntsRef, output);
- }
-
- /** Sugar: adds the UTF32 codepoints from CharSequence. FST
- * must be FST.INPUT_TYPE.BYTE4! */
- public void add(CharSequence s, T output) throws IOException {
- assert fst.getInputType() == FST.INPUT_TYPE.BYTE4;
- int charIdx = 0;
- int intIdx = 0;
- final int charLimit = s.length();
- while(charIdx < charLimit) {
- scratchIntsRef.grow(intIdx+1);
- final int utf32 = Character.codePointAt(s, charIdx);
- scratchIntsRef.ints[intIdx] = utf32;
- charIdx += Character.charCount(utf32);
- intIdx++;
- }
- scratchIntsRef.length = intIdx;
- add(scratchIntsRef, output);
- }
-
/** It's OK to add the same input twice in a row with
* different outputs, as long as outputs impls the merge
* method. */
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java Sun Jan 15 23:42:02 2012
@@ -31,10 +31,8 @@ public final class Util {
}
/** Looks up the output for this input, or null if the
- * input is not accepted. FST must be
- * INPUT_TYPE.BYTE4. */
+ * input is not accepted. */
public static<T> T get(FST<T> fst, IntsRef input) throws IOException {
- assert fst.inputType == FST.INPUT_TYPE.BYTE4;
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
@@ -59,77 +57,7 @@ public final class Util {
}
}
- /** Logically casts input to UTF32 ints then looks up the output
- * or null if the input is not accepted. FST must be
- * INPUT_TYPE.BYTE4. */
- public static<T> T get(FST<T> fst, char[] input, int offset, int length) throws IOException {
- assert fst.inputType == FST.INPUT_TYPE.BYTE4;
-
- // TODO: would be nice not to alloc this on every lookup
- final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
-
- int charIdx = offset;
- final int charLimit = offset + length;
-
- // Accumulate output as we go
- final T NO_OUTPUT = fst.outputs.getNoOutput();
- T output = NO_OUTPUT;
- while(charIdx < charLimit) {
- final int utf32 = Character.codePointAt(input, charIdx);
- charIdx += Character.charCount(utf32);
-
- if (fst.findTargetArc(utf32, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output);
- }
- }
-
- if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- return fst.outputs.add(output, arc.output);
- } else {
- return output;
- }
- }
-
-
- /** Logically casts input to UTF32 ints then looks up the output
- * or null if the input is not accepted. FST must be
- * INPUT_TYPE.BYTE4. */
- public static<T> T get(FST<T> fst, CharSequence input) throws IOException {
- assert fst.inputType == FST.INPUT_TYPE.BYTE4;
-
- // TODO: would be nice not to alloc this on every lookup
- final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
-
- int charIdx = 0;
- final int charLimit = input.length();
-
- // Accumulate output as we go
- final T NO_OUTPUT = fst.outputs.getNoOutput();
- T output = NO_OUTPUT;
-
- while(charIdx < charLimit) {
- final int utf32 = Character.codePointAt(input, charIdx);
- charIdx += Character.charCount(utf32);
-
- if (fst.findTargetArc(utf32, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output);
- }
- }
-
- if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- return fst.outputs.add(output, arc.output);
- } else {
- return output;
- }
- }
+ // TODO: maybe a CharsRef version for BYTE2
/** Looks up the output for this input, or null if the
* input is not accepted */
@@ -381,4 +309,51 @@ public final class Util {
return "0x" + Integer.toHexString(label);
}
}
+
+ /** Decodes the Unicode codepoints from the provided
+ * CharSequence and places them in the provided scratch
+ * IntsRef, which must not be null, returning it. */
+ public static IntsRef toUTF32(CharSequence s, IntsRef scratch) {
+ int charIdx = 0;
+ int intIdx = 0;
+ final int charLimit = s.length();
+ while(charIdx < charLimit) {
+ scratch.grow(intIdx+1);
+ final int utf32 = Character.codePointAt(s, charIdx);
+ scratch.ints[intIdx] = utf32;
+ charIdx += Character.charCount(utf32);
+ intIdx++;
+ }
+ scratch.length = intIdx;
+ return scratch;
+ }
+
+ /** Decodes the Unicode codepoints from the provided
+ * char[] and places them in the provided scratch
+ * IntsRef, which must not be null, returning it. */
+ public static IntsRef toUTF32(char[] s, int offset, int length, IntsRef scratch) {
+ int charIdx = offset;
+ int intIdx = 0;
+ final int charLimit = offset + length;
+ while(charIdx < charLimit) {
+ scratch.grow(intIdx+1);
+ final int utf32 = Character.codePointAt(s, charIdx);
+ scratch.ints[intIdx] = utf32;
+ charIdx += Character.charCount(utf32);
+ intIdx++;
+ }
+ scratch.length = intIdx;
+ return scratch;
+ }
+
+ /** Just takes unsigned byte values from the BytesRef and
+ * converts into an IntsRef. */
+ public static IntsRef toIntsRef(BytesRef input, IntsRef scratch) {
+ scratch.grow(input.length);
+ for(int i=0;i<input.length;i++) {
+ scratch.ints[i] = input.bytes[i+input.offset] & 0xFF;
+ }
+ scratch.length = input.length;
+ return scratch;
+ }
}
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java Sun Jan 15 23:42:02 2012
@@ -19,29 +19,30 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
-import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.ThreadInterruptedException;
-import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.lucene.util._TestUtil;
public class TestIndexWriterReader extends LuceneTestCase {
static PrintStream infoStream = VERBOSE ? System.out : null;
@@ -787,7 +788,8 @@ public class TestIndexWriterReader exten
assertEquals(0, excs.size());
r.close();
- assertEquals(0, dir1.getOpenDeletedFiles().size());
+ final Collection<String> openDeletedFiles = dir1.getOpenDeletedFiles();
+ assertEquals("openDeleted=" + openDeletedFiles, 0, openDeletedFiles.size());
writer.close();
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java Sun Jan 15 23:42:02 2012
@@ -24,6 +24,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
+import java.io.StringWriter;
import java.io.Writer;
import java.util.*;
@@ -1045,6 +1046,7 @@ public class TestFSTs extends LuceneTest
System.out.println("FST stores docFreq");
}
}
+ final IntsRef scratchIntsRef = new IntsRef();
TermEnum termEnum = r.terms(new Term("body", ""));
if (VERBOSE) {
System.out.println("TEST: got termEnum=" + termEnum);
@@ -1380,7 +1382,7 @@ public class TestFSTs extends LuceneTest
public void testSingleString() throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
- b.add(new BytesRef("foobar"), outputs.getNoOutput());
+ b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRef()), outputs.getNoOutput());
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish());
assertNull(fstEnum.seekFloor(new BytesRef("foo")));
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
@@ -1402,9 +1404,9 @@ public class TestFSTs extends LuceneTest
final BytesRef b = new BytesRef("b");
final BytesRef c = new BytesRef("c");
- builder.add(a, outputs.get(17));
- builder.add(b, outputs.get(42));
- builder.add(c, outputs.get(13824324872317238L));
+ builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17));
+ builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42));
+ builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L));
final FST<Long> fst = builder.finish();
@@ -1628,13 +1630,14 @@ public class TestFSTs extends LuceneTest
int line = 0;
final BytesRef term = new BytesRef();
+ final IntsRef scratchIntsRef = new IntsRef();
while (line < lines.length) {
String w = lines[line++];
if (w == null) {
break;
}
term.copyChars(w);
- b.add(term, nothing);
+ b.add(Util.toIntsRef(term, scratchIntsRef), nothing);
}
return b.finish();
@@ -1694,6 +1697,36 @@ public class TestFSTs extends LuceneTest
s.verifyStateAndBelow(fst, arc, 1);
}
+ public void testFinalOutputOnEndState() throws Exception {
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+
+ final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs);
+ builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17));
+ builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10));
+ final FST<Long> fst = builder.finish();
+ //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
+ StringWriter w = new StringWriter();
+ Util.toDot(fst, w, false, false);
+ w.close();
+ //System.out.println(w.toString());
+ assertTrue(w.toString().indexOf("label=\"t/[7]\"") != -1);
+ }
+
+ public void testInternalFinalState() throws Exception {
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+
+ final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs);
+ builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput());
+ builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput());
+ final FST<Long> fst = builder.finish();
+ StringWriter w = new StringWriter();
+ //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
+ Util.toDot(fst, w, false, false);
+ w.close();
+ //System.out.println(w.toString());
+ assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1);
+ }
+
// Make sure raw FST can differentiate between final vs
// non-final end nodes
public void testNonFinalStopNodes() throws Exception {