Posted to commits@lucene.apache.org by jp...@apache.org on 2013/02/01 11:06:53 UTC
svn commit: r1441367 - in /lucene/dev/trunk/lucene:
codecs/src/test/org/apache/lucene/codecs/simpletext/
core/src/test/org/apache/lucene/codecs/compressing/
core/src/test/org/apache/lucene/codecs/lucene40/
core/src/test/org/apache/lucene/index/ core/sr...
Author: jpountz
Date: Fri Feb 1 10:06:53 2013
New Revision: 1441367
URL: http://svn.apache.org/viewvc?rev=1441367&view=rev
Log:
LUCENE-4733: Refactor term vectors formats tests around a BaseTermVectorsFormatTestCase.
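In practice this refactoring reduces each per-format term vectors test to a tiny
subclass of BaseTermVectorsFormatTestCase, as the files added below show. A minimal
sketch of the pattern (the codec name here is a hypothetical placeholder for
whatever Codec wires in the format under test):

    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.index.BaseTermVectorsFormatTestCase;

    public class TestMyTermVectorsFormat extends BaseTermVectorsFormatTestCase {
      @Override
      protected Codec getCodec() {
        return new MyCodec(); // hypothetical codec that uses the format under test
      }
    }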
Added:
lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextTermVectorsFormat.java (with props)
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java (with props)
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40TermVectorsFormat.java (with props)
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java (with props)
Modified:
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermVectors.java
Added: lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextTermVectorsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextTermVectorsFormat.java?rev=1441367&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextTermVectorsFormat.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextTermVectorsFormat.java Fri Feb 1 10:06:53 2013
@@ -0,0 +1,30 @@
+package org.apache.lucene.codecs.simpletext;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
+
+public class TestSimpleTextTermVectorsFormat extends BaseTermVectorsFormatTestCase {
+
+ @Override
+ protected Codec getCodec() {
+ return new SimpleTextCodec();
+ }
+
+}
Added: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java?rev=1441367&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java (added)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java Fri Feb 1 10:06:53 2013
@@ -0,0 +1,30 @@
+package org.apache.lucene.codecs.compressing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
+
+public class TestCompressingTermVectorsFormat extends BaseTermVectorsFormatTestCase {
+
+ @Override
+ protected Codec getCodec() {
+ return CompressingCodec.randomInstance(random());
+ }
+
+}
Added: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40TermVectorsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40TermVectorsFormat.java?rev=1441367&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40TermVectorsFormat.java (added)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40TermVectorsFormat.java Fri Feb 1 10:06:53 2013
@@ -0,0 +1,30 @@
+package org.apache.lucene.codecs.lucene40;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
+
+public class TestLucene40TermVectorsFormat extends BaseTermVectorsFormatTestCase {
+
+ @Override
+ protected Codec getCodec() {
+ return new Lucene40Codec();
+ }
+
+}
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java?rev=1441367&r1=1441366&r2=1441367&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java Fri Feb 1 10:06:53 2013
@@ -17,38 +17,21 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import java.io.IOException;
import java.io.StringReader;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.IntField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util._TestUtil;
-
-import com.carrotsearch.randomizedtesting.generators.RandomInts;
-import com.carrotsearch.randomizedtesting.generators.RandomPicks;
public class TestPayloadsOnVectors extends LuceneTestCase {
@@ -158,314 +141,5 @@ public class TestPayloadsOnVectors exten
writer.close();
dir.close();
}
-
- // custom impl to test cases that are forbidden by the default OffsetAttribute impl
- static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute {
-
- int start, end;
-
- @Override
- public int startOffset() {
- return start;
- }
-
- @Override
- public int endOffset() {
- return end;
- }
-
- @Override
- public void setOffset(int startOffset, int endOffset) {
- // no check!
- start = startOffset;
- end = endOffset;
- }
-
- @Override
- public void clear() {
- start = end = 0;
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == this) {
- return true;
- }
-
- if (other instanceof PermissiveOffsetAttributeImpl) {
- PermissiveOffsetAttributeImpl o = (PermissiveOffsetAttributeImpl) other;
- return o.start == start && o.end == end;
- }
-
- return false;
- }
-
- @Override
- public int hashCode() {
- return start + 31 * end;
- }
-
- @Override
- public void copyTo(AttributeImpl target) {
- OffsetAttribute t = (OffsetAttribute) target;
- t.setOffset(start, end);
- }
-
- }
-
- static BytesRef randomPayload() {
- final int len = random().nextInt(5);
- if (len == 0) {
- return null;
- }
- final BytesRef payload = new BytesRef(len);
- random().nextBytes(payload.bytes);
- payload.length = len;
- return payload;
- }
-
- class RandomTokenStream extends TokenStream {
-
- final String[] terms;
- final int[] positionsIncrements;
- final int[] positions;
- final int[] startOffsets, endOffsets;
- final BytesRef[] payloads;
-
- final Map<Integer, Set<Integer>> positionToTerms;
- final Map<Integer, Set<Integer>> startOffsetToTerms;
- final CharTermAttribute termAtt;
- final PositionIncrementAttribute piAtt;
- final OffsetAttribute oAtt;
- final PayloadAttribute pAtt;
- int i = 0;
-
- RandomTokenStream(int len, String[] sampleTerms, boolean weird) {
- terms = new String[len];
- positionsIncrements = new int[len];
- positions = new int[len];
- startOffsets = new int[len];
- endOffsets = new int[len];
- payloads = new BytesRef[len];
- for (int i = 0; i < len; ++i) {
- terms[i] = RandomPicks.randomFrom(random(), sampleTerms);
- if (weird) {
- positionsIncrements[i] = _TestUtil.nextInt(random(), 1, 1 << 18);
- startOffsets[i] = random().nextInt();
- endOffsets[i] = random().nextInt();
- } else if (i == 0) {
- positionsIncrements[i] = _TestUtil.nextInt(random(), 1, 1 << 5);
- startOffsets[i] = _TestUtil.nextInt(random(), 0, 1 << 16);
- endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
- } else {
- positionsIncrements[i] = _TestUtil.nextInt(random(), 0, 1 << 5);
- startOffsets[i] = startOffsets[i-1] + _TestUtil.nextInt(random(), 0, 1 << 16);
- endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
- }
- }
- for (int i = 0; i < len; ++i) {
- if (i == 0) {
- positions[i] = positionsIncrements[i] - 1;
- } else {
- positions[i] = positions[i - 1] + positionsIncrements[i];
- }
- }
- if (rarely()) {
- Arrays.fill(payloads, randomPayload());
- } else {
- for (int i = 0; i < len; ++i) {
- payloads[i] = randomPayload();
- }
- }
-
- positionToTerms = new HashMap<Integer, Set<Integer>>();
- startOffsetToTerms = new HashMap<Integer, Set<Integer>>();
- for (int i = 0; i < len; ++i) {
- if (!positionToTerms.containsKey(positions[i])) {
- positionToTerms.put(positions[i], new HashSet<Integer>(1));
- }
- positionToTerms.get(positions[i]).add(i);
- if (!startOffsetToTerms.containsKey(startOffsets[i])) {
- startOffsetToTerms.put(startOffsets[i], new HashSet<Integer>(1));
- }
- startOffsetToTerms.get(startOffsets[i]).add(i);
- }
-
- addAttributeImpl(new PermissiveOffsetAttributeImpl());
-
- termAtt = addAttribute(CharTermAttribute.class);
- piAtt = addAttribute(PositionIncrementAttribute.class);
- oAtt = addAttribute(OffsetAttribute.class);
- pAtt = addAttribute(PayloadAttribute.class);
- }
-
- @Override
- public final boolean incrementToken() throws IOException {
- if (i < terms.length) {
- termAtt.setLength(0).append(terms[i]);
- piAtt.setPositionIncrement(positionsIncrements[i]);
- oAtt.setOffset(startOffsets[i], endOffsets[i]);
- pAtt.setPayload(payloads[i]);
- ++i;
- return true;
- } else {
- return false;
- }
- }
-
- }
-
- static FieldType randomFieldType() {
- FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
- ft.setStoreTermVectors(true);
- ft.setStoreTermVectorPositions(random().nextBoolean());
- ft.setStoreTermVectorOffsets(random().nextBoolean());
- if (random().nextBoolean()) {
- ft.setStoreTermVectorPositions(true);
- ft.setStoreTermVectorPayloads(true);
- }
- ft.freeze();
- return ft;
- }
-
- public void testRandomVectors() throws IOException {
- Directory dir = newDirectory();
- IndexWriterConfig iwConf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
- iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30));
- RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
- String[] sampleTerms = new String[RandomInts.randomIntBetween(random(), 20, 50)];
- for (int i = 0; i < sampleTerms.length; ++i) {
- sampleTerms[i] = _TestUtil.randomUnicodeString(random());
- }
- FieldType ft = randomFieldType();
- // generate random documents and index them
- final String[] fieldNames = new String[_TestUtil.nextInt(random(), 1, 200)];
- for (int i = 0; i < fieldNames.length; ++i) {
- String fieldName;
- do {
- fieldName = _TestUtil.randomSimpleString(random());
- } while ("id".equals(fieldName));
- fieldNames[i] = fieldName;
- }
- final int numDocs = _TestUtil.nextInt(random(), 10, 100);
- @SuppressWarnings("unchecked")
- final Map<String, RandomTokenStream>[] fieldValues = new Map[numDocs];
- for (int i = 0; i < numDocs; ++i) {
- fieldValues[i] = new HashMap<String, RandomTokenStream>();
- final int numFields = _TestUtil.nextInt(random(), 0, rarely() ? fieldNames.length : 5);
- for (int j = 0; j < numFields; ++j) {
- final String fieldName = fieldNames[(i+j*31) % fieldNames.length];
- final int tokenStreamLen = _TestUtil.nextInt(random(), 1, rarely() ? 300 : 5);
- fieldValues[i].put(fieldName, new RandomTokenStream(tokenStreamLen, sampleTerms, rarely()));
- }
- }
-
- // index them
- for (int i = 0; i < numDocs; ++i) {
- Document doc = new Document();
- doc.add(new IntField("id", i, Store.YES));
- for (Map.Entry<String, RandomTokenStream> entry : fieldValues[i].entrySet()) {
- doc.add(new Field(entry.getKey(), entry.getValue(), ft));
- }
- iw.addDocument(doc);
- }
-
- iw.commit();
- // make sure the format can merge
- iw.forceMerge(2);
-
- // read term vectors
- final DirectoryReader reader = DirectoryReader.open(dir);
- for (int i = 0; i < 100; ++i) {
- final int docID = random().nextInt(numDocs);
- final Map<String, RandomTokenStream> fvs = fieldValues[reader.document(docID).getField("id").numericValue().intValue()];
- final Fields fields = reader.getTermVectors(docID);
- if (fvs.isEmpty()) {
- assertNull(fields);
- } else {
- Set<String> fns = new HashSet<String>();
- for (String field : fields) {
- fns.add(field);
- }
- assertEquals(fields.size(), fns.size());
- assertEquals(fvs.keySet(), fns);
- for (String field : fields) {
- final RandomTokenStream tk = fvs.get(field);
- assert tk != null;
- final Terms terms = fields.terms(field);
- assertEquals(ft.storeTermVectorPositions(), terms.hasPositions());
- assertEquals(ft.storeTermVectorOffsets(), terms.hasOffsets());
- assertEquals(1, terms.getDocCount());
- final TermsEnum termsEnum = terms.iterator(null);
- while (termsEnum.next() != null) {
- assertEquals(1, termsEnum.docFreq());
- final DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
- final DocsEnum docsEnum = docsAndPositionsEnum == null ? termsEnum.docs(null, null) : docsAndPositionsEnum;
- if (ft.storeTermVectorOffsets() || ft.storeTermVectorPositions()) {
- assertNotNull(docsAndPositionsEnum);
- }
- assertEquals(0, docsEnum.nextDoc());
- if (terms.hasPositions() || terms.hasOffsets()) {
- final int freq = docsEnum.freq();
- assertTrue(freq >= 1);
- if (docsAndPositionsEnum != null) {
- for (int k = 0; k < freq; ++k) {
- final int position = docsAndPositionsEnum.nextPosition();
- final Set<Integer> indexes;
- if (terms.hasPositions()) {
- indexes = tk.positionToTerms.get(position);
- assertNotNull(tk.positionToTerms.keySet().toString() + " does not contain " + position, indexes);
- } else {
- indexes = tk.startOffsetToTerms.get(docsAndPositionsEnum.startOffset());
- assertNotNull(indexes);
- }
- if (terms.hasPositions()) {
- boolean foundPosition = false;
- for (int index : indexes) {
- if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && tk.positions[index] == position) {
- foundPosition = true;
- break;
- }
- }
- assertTrue(foundPosition);
- }
- if (terms.hasOffsets()) {
- boolean foundOffset = false;
- for (int index : indexes) {
- if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && tk.startOffsets[index] == docsAndPositionsEnum.startOffset() && tk.endOffsets[index] == docsAndPositionsEnum.endOffset()) {
- foundOffset = true;
- break;
- }
- }
- assertTrue(foundOffset);
- }
- if (terms.hasPayloads()) {
- boolean foundPayload = false;
- for (int index : indexes) {
- if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && equals(tk.payloads[index], docsAndPositionsEnum.getPayload())) {
- foundPayload = true;
- break;
- }
- }
- assertTrue(foundPayload);
- }
- }
- }
- }
- assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
- }
- }
- }
- }
- IOUtils.close(reader, iw, dir);
- }
-
- private static boolean equals(Object o1, Object o2) {
- if (o1 == null) {
- return o2 == null;
- } else {
- return o1.equals(o2);
- }
- }
}
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermVectors.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermVectors.java?rev=1441367&r1=1441366&r2=1441367&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermVectors.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermVectors.java Fri Feb 1 10:06:53 2013
@@ -18,9 +18,6 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
@@ -28,19 +25,24 @@ import org.apache.lucene.document.Docume
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.*;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
public class TestTermVectors extends LuceneTestCase {
- private static IndexSearcher searcher;
private static IndexReader reader;
private static Directory directory;
@@ -75,7 +77,6 @@ public class TestTermVectors extends Luc
}
reader = writer.getReader();
writer.close();
- searcher = newSearcher(reader);
}
@AfterClass
@@ -84,300 +85,8 @@ public class TestTermVectors extends Luc
directory.close();
reader = null;
directory = null;
- searcher = null;
}
- public void test() {
- assertTrue(searcher != null);
- }
-
- public void testTermVectors() throws IOException {
- Query query = new TermQuery(new Term("field", "seventy"));
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(100, hits.length);
-
- for (int i = 0; i < hits.length; i++) {
- Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
- assertNotNull(vectors);
- assertEquals("doc=" + hits[i].doc + " tv=" + vectors, 1, vectors.size());
- }
- Terms vector;
- vector = searcher.reader.getTermVectors(hits[0].doc).terms("noTV");
- assertNull(vector);
- }
-
- public void testTermVectorsFieldOrder() throws IOException {
- Directory dir = newDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
- Document doc = new Document();
- FieldType ft = new FieldType(TextField.TYPE_STORED);
- ft.setStoreTermVectors(true);
- ft.setStoreTermVectorOffsets(true);
- ft.setStoreTermVectorPositions(true);
- doc.add(newField("c", "some content here", ft));
- doc.add(newField("a", "some content here", ft));
- doc.add(newField("b", "some content here", ft));
- doc.add(newField("x", "some content here", ft));
- writer.addDocument(doc);
- IndexReader reader = writer.getReader();
- writer.close();
- Fields v = reader.getTermVectors(0);
- assertEquals(4, v.size());
- String[] expectedFields = new String[]{"a", "b", "c", "x"};
- int[] expectedPositions = new int[]{1, 2, 0};
- Iterator<String> fieldsEnum = v.iterator();
- for(int i=0;i<expectedFields.length;i++) {
- assertEquals(expectedFields[i], fieldsEnum.next());
- assertEquals(3, v.terms(expectedFields[i]).size());
-
- DocsAndPositionsEnum dpEnum = null;
- Terms terms = v.terms(expectedFields[i]);
- assertNotNull(terms);
- TermsEnum termsEnum = terms.iterator(null);
- assertEquals("content", termsEnum.next().utf8ToString());
- dpEnum = termsEnum.docsAndPositions(null, dpEnum);
- assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
- assertEquals(1, dpEnum.freq());
- assertEquals(expectedPositions[0], dpEnum.nextPosition());
-
- assertEquals("here", termsEnum.next().utf8ToString());
- dpEnum = termsEnum.docsAndPositions(null, dpEnum);
- assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
- assertEquals(1, dpEnum.freq());
- assertEquals(expectedPositions[1], dpEnum.nextPosition());
-
- assertEquals("some", termsEnum.next().utf8ToString());
- dpEnum = termsEnum.docsAndPositions(null, dpEnum);
- assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
- assertEquals(1, dpEnum.freq());
- assertEquals(expectedPositions[2], dpEnum.nextPosition());
-
- assertNull(termsEnum.next());
- }
- reader.close();
- dir.close();
- }
-
- public void testTermPositionVectors() throws IOException {
- Query query = new TermQuery(new Term("field", "zero"));
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
-
- DocsAndPositionsEnum dpEnum = null;
- for (int i = 0; i < hits.length; i++) {
- Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
- assertNotNull(vectors);
- assertEquals(1, vectors.size());
-
- TermsEnum termsEnum = vectors.terms("field").iterator(null);
- assertNotNull(termsEnum.next());
-
- boolean shouldBePosVector = hits[i].doc % 2 == 0;
- boolean shouldBeOffVector = hits[i].doc % 3 == 0;
-
- if (shouldBePosVector || shouldBeOffVector) {
- while(true) {
- dpEnum = termsEnum.docsAndPositions(null, dpEnum);
- assertNotNull(dpEnum);
- assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
-
- dpEnum.nextPosition();
-
- if (shouldBeOffVector) {
- assertTrue(dpEnum.startOffset() != -1);
- assertTrue(dpEnum.endOffset() != -1);
- }
-
- if (termsEnum.next() == null) {
- break;
- }
- }
- } else {
- fail();
- }
- }
- }
-
- public void testTermOffsetVectors() throws IOException {
- Query query = new TermQuery(new Term("field", "fifty"));
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(100, hits.length);
-
- for (int i = 0; i < hits.length; i++) {
- Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
- assertNotNull(vectors);
- assertEquals(1, vectors.size());
- }
- }
-
- public void testKnownSetOfDocuments() throws IOException {
- String test1 = "eating chocolate in a computer lab"; //6 terms
- String test2 = "computer in a computer lab"; //5 terms
- String test3 = "a chocolate lab grows old"; //5 terms
- String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
- Map<String,Integer> test4Map = new HashMap<String,Integer>();
- test4Map.put("chocolate", Integer.valueOf(3));
- test4Map.put("lab", Integer.valueOf(2));
- test4Map.put("eating", Integer.valueOf(1));
- test4Map.put("computer", Integer.valueOf(1));
- test4Map.put("with", Integer.valueOf(1));
- test4Map.put("a", Integer.valueOf(1));
- test4Map.put("colored", Integer.valueOf(1));
- test4Map.put("in", Integer.valueOf(1));
- test4Map.put("an", Integer.valueOf(1));
- test4Map.put("computer", Integer.valueOf(1));
- test4Map.put("old", Integer.valueOf(1));
-
- Document testDoc1 = new Document();
- setupDoc(testDoc1, test1);
- Document testDoc2 = new Document();
- setupDoc(testDoc2, test2);
- Document testDoc3 = new Document();
- setupDoc(testDoc3, test3);
- Document testDoc4 = new Document();
- setupDoc(testDoc4, test4);
-
- Directory dir = newDirectory();
-
- RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
- newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
- .setOpenMode(OpenMode.CREATE)
- .setMergePolicy(newLogMergePolicy())
- .setSimilarity(new DefaultSimilarity()));
- writer.addDocument(testDoc1);
- writer.addDocument(testDoc2);
- writer.addDocument(testDoc3);
- writer.addDocument(testDoc4);
- IndexReader reader = writer.getReader();
- writer.close();
- IndexSearcher knownSearcher = newSearcher(reader);
- knownSearcher.setSimilarity(new DefaultSimilarity());
- Fields fields = MultiFields.getFields(knownSearcher.reader);
-
- DocsEnum docs = null;
- for (String fieldName : fields) {
- Terms terms = fields.terms(fieldName);
- assertNotNull(terms); // NOTE: kinda sketchy assumptions, but ideally we would fix fieldsenum api...
- TermsEnum termsEnum = terms.iterator(null);
-
- while (termsEnum.next() != null) {
- String text = termsEnum.term().utf8ToString();
- docs = _TestUtil.docs(random(), termsEnum, MultiFields.getLiveDocs(knownSearcher.reader), docs, DocsEnum.FLAG_FREQS);
-
- while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
- int docId = docs.docID();
- int freq = docs.freq();
- //System.out.println("Doc Id: " + docId + " freq " + freq);
- Terms vector = knownSearcher.reader.getTermVectors(docId).terms("field");
- //float tf = sim.tf(freq);
- //float idf = sim.idf(knownSearcher.docFreq(term), knownSearcher.maxDoc());
- //float qNorm = sim.queryNorm()
- //This is fine since we don't have stop words
- //float lNorm = sim.lengthNorm("field", vector.getTerms().length);
- //float coord = sim.coord()
- //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
- assertNotNull(vector);
- TermsEnum termsEnum2 = vector.iterator(null);
-
- while(termsEnum2.next() != null) {
- if (text.equals(termsEnum2.term().utf8ToString())) {
- assertEquals(freq, termsEnum2.totalTermFreq());
- }
- }
- }
- }
- //System.out.println("--------");
- }
- Query query = new TermQuery(new Term("field", "chocolate"));
- ScoreDoc[] hits = knownSearcher.search(query, null, 1000).scoreDocs;
- //doc 3 should be the first hit b/c it is the shortest match
- assertTrue(hits.length == 3);
- /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
- System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
- System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
- System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
- System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
- System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
- assertTrue(hits[0].doc == 2);
- assertTrue(hits[1].doc == 3);
- assertTrue(hits[2].doc == 0);
- Terms vector = knownSearcher.reader.getTermVectors(hits[1].doc).terms("field");
- assertNotNull(vector);
- //System.out.println("Vector: " + vector);
- assertEquals(10, vector.size());
- TermsEnum termsEnum = vector.iterator(null);
- while(termsEnum.next() != null) {
- String term = termsEnum.term().utf8ToString();
- //System.out.println("Term: " + term);
- int freq = (int) termsEnum.totalTermFreq();
- assertTrue(test4.indexOf(term) != -1);
- Integer freqInt = test4Map.get(term);
- assertTrue(freqInt != null);
- assertEquals(freqInt.intValue(), freq);
- }
- reader.close();
- dir.close();
- }
-
- private void setupDoc(Document doc, String text)
- {
- FieldType ft = new FieldType(TextField.TYPE_STORED);
- ft.setStoreTermVectors(true);
- ft.setStoreTermVectorOffsets(true);
- ft.setStoreTermVectorPositions(true);
- FieldType ft2 = new FieldType(TextField.TYPE_STORED);
- ft2.setStoreTermVectors(true);
- doc.add(newField("field2", text, ft));
- doc.add(newField("field", text, ft2));
- //System.out.println("Document: " + doc);
- }
-
- // Test only a few docs having vectors
- public void testRareVectors() throws IOException {
- RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
- newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
- .setOpenMode(OpenMode.CREATE));
- if (VERBOSE) {
- System.out.println("TEST: now add non-vectors");
- }
- for (int i = 0; i < 100; i++) {
- Document doc = new Document();
- doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES));
- writer.addDocument(doc);
- }
- if (VERBOSE) {
- System.out.println("TEST: now add vectors");
- }
- FieldType ft = new FieldType(TextField.TYPE_STORED);
- ft.setStoreTermVectors(true);
- ft.setStoreTermVectorOffsets(true);
- ft.setStoreTermVectorPositions(true);
- for(int i=0;i<10;i++) {
- Document doc = new Document();
- doc.add(newField("field", English.intToEnglish(100+i), ft));
- writer.addDocument(doc);
- }
-
- if (VERBOSE) {
- System.out.println("TEST: now getReader");
- }
- IndexReader reader = writer.getReader();
- writer.close();
- IndexSearcher searcher = newSearcher(reader);
-
- Query query = new TermQuery(new Term("field", "hundred"));
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(10, hits.length);
- for (int i = 0; i < hits.length; i++) {
-
- Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
- assertNotNull(vectors);
- assertEquals(1, vectors.size());
- }
- reader.close();
- }
-
-
// In a single doc, for the same field, mix the term
// vectors up
public void testMixedVectrosVectors() throws IOException {
Added: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java?rev=1441367&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java (added)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java Fri Feb 1 10:06:53 2013
@@ -0,0 +1,632 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.TermsEnum.SeekStatus;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+
+/**
+ * Base class aiming at testing {@link TermVectorsFormat term vectors formats}.
+ * To test a new format, all you need to do is register a new {@link Codec} that
+ * uses it, extend this class, and override {@link #getCodec()}.
+ * @lucene.experimental
+ */
+public abstract class BaseTermVectorsFormatTestCase extends LuceneTestCase {
+
+ private Codec savedCodec;
+
+ /**
+ * Returns the Codec to run tests against
+ */
+ protected abstract Codec getCodec();
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ // set the default codec, so adding test cases to this isn't fragile
+ savedCodec = Codec.getDefault();
+ Codec.setDefault(getCodec());
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ Codec.setDefault(savedCodec); // restore
+ super.tearDown();
+ }
+
+ /**
+ * A combination of term vectors options.
+ */
+ protected enum Options {
+ NONE(false, false, false),
+ POSITIONS(true, false, false),
+ OFFSETS(false, true, false),
+ POSITIONS_AND_OFFSETS(true, true, false),
+ POSITIONS_AND_PAYLOADS(true, false, true),
+ POSITIONS_AND_OFFSETS_AND_PAYLOADS(true, true, true);
+ final boolean positions, offsets, payloads;
+ private Options(boolean positions, boolean offsets, boolean payloads) {
+ this.positions = positions;
+ this.offsets = offsets;
+ this.payloads = payloads;
+ }
+ }
+
+ protected Set<Options> validOptions() {
+ return EnumSet.allOf(Options.class);
+ }
+
+ protected Options randomOptions() {
+ return RandomPicks.randomFrom(random(), new ArrayList<Options>(validOptions()));
+ }
+
+ protected FieldType fieldType(Options options) {
+ FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+ ft.setStoreTermVectors(true);
+ ft.setStoreTermVectorPositions(options.positions);
+ ft.setStoreTermVectorOffsets(options.offsets);
+ ft.setStoreTermVectorPayloads(options.payloads);
+ ft.freeze();
+ return ft;
+ }
+
+ protected BytesRef randomPayload() {
+ final int len = random().nextInt(5);
+ if (len == 0) {
+ return null;
+ }
+ final BytesRef payload = new BytesRef(len);
+ random().nextBytes(payload.bytes);
+ payload.length = len;
+ return payload;
+ }
+
+ // custom impl to test cases that are forbidden by the default OffsetAttribute impl
+ private static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute {
+
+ int start, end;
+
+ @Override
+ public int startOffset() {
+ return start;
+ }
+
+ @Override
+ public int endOffset() {
+ return end;
+ }
+
+ @Override
+ public void setOffset(int startOffset, int endOffset) {
+ // no check!
+ start = startOffset;
+ end = endOffset;
+ }
+
+ @Override
+ public void clear() {
+ start = end = 0;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
+ }
+
+ if (other instanceof PermissiveOffsetAttributeImpl) {
+ PermissiveOffsetAttributeImpl o = (PermissiveOffsetAttributeImpl) other;
+ return o.start == start && o.end == end;
+ }
+
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+ return start + 31 * end;
+ }
+
+ @Override
+ public void copyTo(AttributeImpl target) {
+ OffsetAttribute t = (OffsetAttribute) target;
+ t.setOffset(start, end);
+ }
+
+ }
+
+ // TODO: use CannedTokenStream?
+ protected class RandomTokenStream extends TokenStream {
+
+ final String[] terms;
+ final BytesRef[] termBytes;
+ final int[] positionsIncrements;
+ final int[] positions;
+ final int[] startOffsets, endOffsets;
+ final BytesRef[] payloads;
+
+ final Map<String, Integer> freqs;
+ final Map<Integer, Set<Integer>> positionToTerms;
+ final Map<Integer, Set<Integer>> startOffsetToTerms;
+
+ final CharTermAttribute termAtt;
+ final PositionIncrementAttribute piAtt;
+ final OffsetAttribute oAtt;
+ final PayloadAttribute pAtt;
+ int i = 0;
+
+ protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
+ this(len, sampleTerms, sampleTermBytes, rarely());
+ }
+
+ protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
+ terms = new String[len];
+ termBytes = new BytesRef[len];
+ positionsIncrements = new int[len];
+ positions = new int[len];
+ startOffsets = new int[len];
+ endOffsets = new int[len];
+ payloads = new BytesRef[len];
+ for (int i = 0; i < len; ++i) {
+ final int o = random().nextInt(sampleTerms.length);
+ terms[i] = sampleTerms[o];
+ termBytes[i] = sampleTermBytes[o];
+ positionsIncrements[i] = _TestUtil.nextInt(random(), i == 0 ? 1 : 0, 10);
+ if (offsetsGoBackwards) {
+ startOffsets[i] = random().nextInt();
+ endOffsets[i] = random().nextInt();
+ } else {
+ if (i == 0) {
+ startOffsets[i] = _TestUtil.nextInt(random(), 0, 1 << 16);
+ } else {
+ startOffsets[i] = startOffsets[i-1] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
+ }
+ endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
+ }
+ }
+
+ for (int i = 0; i < len; ++i) {
+ if (i == 0) {
+ positions[i] = positionsIncrements[i] - 1;
+ } else {
+ positions[i] = positions[i - 1] + positionsIncrements[i];
+ }
+ }
+ if (rarely()) {
+ Arrays.fill(payloads, randomPayload());
+ } else {
+ for (int i = 0; i < len; ++i) {
+ payloads[i] = randomPayload();
+ }
+ }
+
+ positionToTerms = new HashMap<Integer, Set<Integer>>(len);
+ startOffsetToTerms = new HashMap<Integer, Set<Integer>>(len);
+ for (int i = 0; i < len; ++i) {
+ if (!positionToTerms.containsKey(positions[i])) {
+ positionToTerms.put(positions[i], new HashSet<Integer>(1));
+ }
+ positionToTerms.get(positions[i]).add(i);
+ if (!startOffsetToTerms.containsKey(startOffsets[i])) {
+ startOffsetToTerms.put(startOffsets[i], new HashSet<Integer>(1));
+ }
+ startOffsetToTerms.get(startOffsets[i]).add(i);
+ }
+
+ freqs = new HashMap<String, Integer>();
+ for (String term : terms) {
+ if (freqs.containsKey(term)) {
+ freqs.put(term, freqs.get(term) + 1);
+ } else {
+ freqs.put(term, 1);
+ }
+ }
+
+ addAttributeImpl(new PermissiveOffsetAttributeImpl());
+
+ termAtt = addAttribute(CharTermAttribute.class);
+ piAtt = addAttribute(PositionIncrementAttribute.class);
+ oAtt = addAttribute(OffsetAttribute.class);
+ pAtt = addAttribute(PayloadAttribute.class);
+ }
+
+ public boolean hasPayloads() {
+ for (BytesRef payload : payloads) {
+ if (payload != null && payload.length > 0) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (i < terms.length) {
+ termAtt.setLength(0).append(terms[i]);
+ piAtt.setPositionIncrement(positionsIncrements[i]);
+ oAtt.setOffset(startOffsets[i], endOffsets[i]);
+ pAtt.setPayload(payloads[i]);
+ ++i;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ }
+
+ protected class RandomDocument {
+
+ private final String[] fieldNames;
+ private final FieldType[] fieldTypes;
+ private final RandomTokenStream[] tokenStreams;
+
+ protected RandomDocument(int fieldCount, int maxTermCount, Options options, String[] fieldNames, String[] sampleTerms, BytesRef[] sampleTermBytes) {
+ if (fieldCount > fieldNames.length) {
+ throw new IllegalArgumentException();
+ }
+ this.fieldNames = new String[fieldCount];
+ fieldTypes = new FieldType[fieldCount];
+ tokenStreams = new RandomTokenStream[fieldCount];
+ Arrays.fill(fieldTypes, fieldType(options));
+ final Set<String> usedFieldNames = new HashSet<String>();
+ for (int i = 0; i < fieldCount; ++i) {
+ do {
+ this.fieldNames[i] = RandomPicks.randomFrom(random(), fieldNames);
+ } while (usedFieldNames.contains(this.fieldNames[i]));
+ usedFieldNames.add(this.fieldNames[i]);
+ tokenStreams[i] = new RandomTokenStream(_TestUtil.nextInt(random(), 1, maxTermCount), sampleTerms, sampleTermBytes);
+ }
+ }
+
+ public Document toDocument() {
+ final Document doc = new Document();
+ for (int i = 0; i < fieldNames.length; ++i) {
+ doc.add(new Field(fieldNames[i], tokenStreams[i], fieldTypes[i]));
+ }
+ return doc;
+ }
+
+ }
+
+ protected class RandomDocumentFactory {
+
+ private final String[] fieldNames;
+ private final String[] terms;
+ private final BytesRef[] termBytes;
+
+ protected RandomDocumentFactory(int distinctFieldNames, int distinctTerms) {
+ final Set<String> fieldNames = new HashSet<String>();
+ while (fieldNames.size() < distinctFieldNames) {
+ fieldNames.add(_TestUtil.randomSimpleString(random()));
+ fieldNames.remove("id");
+ }
+ this.fieldNames = fieldNames.toArray(new String[0]);
+ terms = new String[distinctTerms];
+ termBytes = new BytesRef[distinctTerms];
+ for (int i = 0; i < distinctTerms; ++i) {
+ terms[i] = _TestUtil.randomRealisticUnicodeString(random());
+ termBytes[i] = new BytesRef(terms[i]);
+ }
+ }
+
+ public RandomDocument newDocument(int fieldCount, int maxTermCount, Options options) {
+ return new RandomDocument(fieldCount, maxTermCount, options, fieldNames, terms, termBytes);
+ }
+
+ }
+
+ protected void assertEquals(RandomDocument doc, Fields fields) throws IOException {
+ // compare field names
+ assertEquals(doc == null, fields == null);
+ assertEquals(doc.fieldNames.length, fields.size());
+ final Set<String> fields1 = new HashSet<String>();
+ final Set<String> fields2 = new HashSet<String>();
+ for (int i = 0; i < doc.fieldNames.length; ++i) {
+ fields1.add(doc.fieldNames[i]);
+ }
+ for (String field : fields) {
+ fields2.add(field);
+ }
+ assertEquals(fields1, fields2);
+
+ for (int i = 0; i < doc.fieldNames.length; ++i) {
+ assertEquals(doc.tokenStreams[i], doc.fieldTypes[i], fields.terms(doc.fieldNames[i]));
+ }
+ }
+
+ protected static boolean equals(Object o1, Object o2) {
+ if (o1 == null) {
+ return o2 == null;
+ } else {
+ return o1.equals(o2);
+ }
+ }
+
+ // to test reuse
+ private TermsEnum termsEnum = null;
+ private DocsEnum docsEnum = null;
+ private DocsAndPositionsEnum docsAndPositionsEnum = null;
+
+ protected void assertEquals(RandomTokenStream tk, FieldType ft, Terms terms) throws IOException {
+ assertEquals(1, terms.getDocCount());
+ final int termCount = new HashSet<String>(Arrays.asList(tk.terms)).size();
+ assertEquals(termCount, terms.size());
+ assertEquals(termCount, terms.getSumDocFreq());
+ assertEquals(ft.storeTermVectorPositions(), terms.hasPositions());
+ assertEquals(ft.storeTermVectorOffsets(), terms.hasOffsets());
+ assertEquals(ft.storeTermVectorPayloads() && tk.hasPayloads(), terms.hasPayloads());
+ final Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
+ for (String term : tk.freqs.keySet()) {
+ uniqueTerms.add(new BytesRef(term));
+ }
+ final BytesRef[] sortedTerms = uniqueTerms.toArray(new BytesRef[0]);
+ Arrays.sort(sortedTerms, terms.getComparator());
+ termsEnum = terms.iterator(random().nextBoolean() ? null : termsEnum);
+ for (int i = 0; i < sortedTerms.length; ++i) {
+ final BytesRef nextTerm = termsEnum.next();
+ assertEquals(sortedTerms[i], nextTerm);
+ assertEquals(sortedTerms[i], termsEnum.term());
+ assertEquals(1, termsEnum.docFreq());
+
+ final FixedBitSet bits = new FixedBitSet(1);
+ docsEnum = termsEnum.docs(bits, random().nextBoolean() ? null : docsEnum);
+ assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
+ bits.set(0);
+
+ docsEnum = termsEnum.docs(random().nextBoolean() ? bits : null, random().nextBoolean() ? null : docsEnum);
+ assertNotNull(docsEnum);
+ assertEquals(0, docsEnum.nextDoc());
+ assertEquals(0, docsEnum.docID());
+ assertEquals(tk.freqs.get(termsEnum.term().utf8ToString()), (Integer) docsEnum.freq());
+ assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
+
+ bits.clear(0);
+ docsAndPositionsEnum = termsEnum.docsAndPositions(bits, random().nextBoolean() ? null : docsAndPositionsEnum);
+ assertEquals(ft.storeTermVectorOffsets() || ft.storeTermVectorPositions(), docsAndPositionsEnum != null);
+ if (docsAndPositionsEnum != null) {
+ assertEquals(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
+ }
+ bits.set(0);
+
+ docsAndPositionsEnum = termsEnum.docsAndPositions(random().nextBoolean() ? bits : null, random().nextBoolean() ? null : docsAndPositionsEnum);
+ assertEquals(ft.storeTermVectorOffsets() || ft.storeTermVectorPositions(), docsAndPositionsEnum != null);
+ if (terms.hasPositions() || terms.hasOffsets()) {
+ assertEquals(0, docsAndPositionsEnum.nextDoc());
+ final int freq = docsAndPositionsEnum.freq();
+ assertEquals(tk.freqs.get(termsEnum.term().utf8ToString()), (Integer) freq);
+ if (docsAndPositionsEnum != null) {
+ for (int k = 0; k < freq; ++k) {
+ final int position = docsAndPositionsEnum.nextPosition();
+ final Set<Integer> indexes;
+ if (terms.hasPositions()) {
+ indexes = tk.positionToTerms.get(position);
+ assertNotNull(indexes);
+ } else {
+ indexes = tk.startOffsetToTerms.get(docsAndPositionsEnum.startOffset());
+ assertNotNull(indexes);
+ }
+ if (terms.hasPositions()) {
+ boolean foundPosition = false;
+ for (int index : indexes) {
+ if (tk.termBytes[index].equals(termsEnum.term()) && tk.positions[index] == position) {
+ foundPosition = true;
+ break;
+ }
+ }
+ assertTrue(foundPosition);
+ }
+ if (terms.hasOffsets()) {
+ boolean foundOffset = false;
+ for (int index : indexes) {
+ if (tk.termBytes[index].equals(termsEnum.term()) && tk.startOffsets[index] == docsAndPositionsEnum.startOffset() && tk.endOffsets[index] == docsAndPositionsEnum.endOffset()) {
+ foundOffset = true;
+ break;
+ }
+ }
+ assertTrue(foundOffset);
+ }
+ if (terms.hasPayloads()) {
+ boolean foundPayload = false;
+ for (int index : indexes) {
+ if (tk.termBytes[index].equals(termsEnum.term()) && equals(tk.payloads[index], docsAndPositionsEnum.getPayload())) {
+ foundPayload = true;
+ break;
+ }
+ }
+ assertTrue(foundPayload);
+ }
+ }
+ try {
+ docsAndPositionsEnum.nextPosition();
+ fail();
+ } catch (Exception e) {
+ // ok
+ } catch (AssertionError e) {
+ // ok
+ }
+ }
+ assertEquals(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
+ }
+ }
+ assertNull(termsEnum.next());
+ for (int i = 0; i < 5; ++i) {
+ if (random().nextBoolean()) {
+ assertTrue(termsEnum.seekExact(RandomPicks.randomFrom(random(), tk.termBytes), random().nextBoolean()));
+ } else {
+ assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(RandomPicks.randomFrom(random(), tk.termBytes), random().nextBoolean()));
+ }
+ }
+ }
+
+ protected Document addId(Document doc, String id) {
+ doc.add(new StringField("id", id, Store.NO));
+ return doc;
+ }
+
+ protected int docID(IndexReader reader, String id) throws IOException {
+ return new IndexSearcher(reader).search(new TermQuery(new Term("id", id)), 1).scoreDocs[0].doc;
+ }
+
+ // only one doc with vectors
+ public void testRareVectors() throws IOException {
+ final RandomDocumentFactory docFactory = new RandomDocumentFactory(10, 20);
+ for (Options options : validOptions()) {
+ final int numDocs = _TestUtil.nextInt(random(), 10, 10000);
+ final int docWithVectors = random().nextInt(numDocs);
+ final Document emptyDoc = new Document();
+ final Directory dir = newDirectory();
+ final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+ final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 3), 20, options);
+ for (int i = 0; i < numDocs; ++i) {
+ if (i == docWithVectors) {
+ writer.addDocument(addId(doc.toDocument(), "42"));
+ } else {
+ writer.addDocument(emptyDoc);
+ }
+ }
+ final IndexReader reader = writer.getReader();
+ final int docWithVectorsID = docID(reader, "42");
+ for (int i = 0; i < 10; ++i) {
+ final int docID = random().nextInt(numDocs);
+ final Fields fields = reader.getTermVectors(docID);
+ if (docID == docWithVectorsID) {
+ assertEquals(doc, fields);
+ } else {
+ assertNull(fields);
+ }
+ }
+ final Fields fields = reader.getTermVectors(docWithVectorsID);
+ assertEquals(doc, fields);
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+ }
+
+ public void testHighFreqs() throws IOException {
+ final RandomDocumentFactory docFactory = new RandomDocumentFactory(3, 5);
+ for (Options options : validOptions()) {
+ if (options == Options.NONE) {
+ continue;
+ }
+ final Directory dir = newDirectory();
+ final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+ final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 2), _TestUtil.nextInt(random(), 50000, 100000), options);
+ writer.addDocument(doc.toDocument());
+ final IndexReader reader = writer.getReader();
+ assertEquals(doc, reader.getTermVectors(0));
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+ }
+
+ public void testLotsOfFields() throws IOException {
+ final RandomDocumentFactory docFactory = new RandomDocumentFactory(5000, 10);
+ for (Options options : validOptions()) {
+ final Directory dir = newDirectory();
+ final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+ final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 500, 1000), 5, options);
+ writer.addDocument(doc.toDocument());
+ final IndexReader reader = writer.getReader();
+ assertEquals(doc, reader.getTermVectors(0));
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+ }
+
+ // different options for the same field
+ public void testMixedOptions() throws IOException {
+ final int numFields = _TestUtil.nextInt(random(), 1, 3);
+ final RandomDocumentFactory docFactory = new RandomDocumentFactory(numFields, 10);
+ for (Options options1 : validOptions()) {
+ for (Options options2 : validOptions()) {
+ if (options1 == options2) {
+ continue;
+ }
+ final Directory dir = newDirectory();
+ final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+ final RandomDocument doc1 = docFactory.newDocument(numFields, 20, options1);
+ final RandomDocument doc2 = docFactory.newDocument(numFields, 20, options2);
+ writer.addDocument(addId(doc1.toDocument(), "1"));
+ writer.addDocument(addId(doc2.toDocument(), "2"));
+ final IndexReader reader = writer.getReader();
+ final int doc1ID = docID(reader, "1");
+ assertEquals(doc1, reader.getTermVectors(doc1ID));
+ final int doc2ID = docID(reader, "2");
+ assertEquals(doc2, reader.getTermVectors(doc2ID));
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+ }
+ }
+
+ public void testRandom() throws IOException {
+ final RandomDocumentFactory docFactory = new RandomDocumentFactory(5, 20);
+ final int numDocs = _TestUtil.nextInt(random(), 100, 1000);
+ final RandomDocument[] docs = new RandomDocument[numDocs];
+ for (int i = 0; i < numDocs; ++i) {
+ docs[i] = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 3), _TestUtil.nextInt(random(), 10, 50), randomOptions());
+ }
+ final Directory dir = newDirectory();
+ final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+ for (int i = 0; i < numDocs; ++i) {
+ writer.addDocument(docs[i].toDocument());
+ }
+ final IndexReader reader = writer.getReader();
+ for (int i = 0; i < numDocs; ++i) {
+ assertEquals(docs[i], reader.getTermVectors(i));
+ }
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+
+}
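A note on extending the base class: formats that do not support every term
vectors option can narrow what gets exercised by overriding validOptions(). A
hedged sketch for a format without payload support (the subclass and codec
names are hypothetical; Options and validOptions() come from
BaseTermVectorsFormatTestCase above):

    import java.util.EnumSet;
    import java.util.Set;

    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.index.BaseTermVectorsFormatTestCase;

    public class TestNoPayloadsTermVectorsFormat extends BaseTermVectorsFormatTestCase {

      @Override
      protected Codec getCodec() {
        return new MyNoPayloadsCodec(); // hypothetical codec whose format skips payloads
      }

      @Override
      protected Set<Options> validOptions() {
        // keep only the payload-free combinations from the Options enum
        return EnumSet.of(Options.NONE, Options.POSITIONS, Options.OFFSETS,
            Options.POSITIONS_AND_OFFSETS);
      }
    }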