You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/02/17 00:09:19 UTC
svn commit: r1446977 - in /lucene/dev/branches/lucene4765/lucene:
codecs/src/java/org/apache/lucene/codecs/simpletext/
core/src/test/org/apache/lucene/
Author: rmuir
Date: Sat Feb 16 23:09:19 2013
New Revision: 1446977
URL: http://svn.apache.org/r1446977
Log:
add simpletext sorted_set
Modified:
lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java
lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
lucene/dev/branches/lucene4765/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java
Modified: lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java?rev=1446977&r1=1446976&r2=1446977&view=diff
==============================================================================
--- lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java (original)
+++ lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java Sat Feb 16 23:09:19 2013
@@ -82,6 +82,31 @@ import org.apache.lucene.index.SegmentWr
* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
* a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
* an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
+ *
+ * for sorted set this is a fixed-width file very similar to the SORTED case, for example:
+ * <pre>
+ * field myField
+ * type SORTED_SET
+ * numvalues 10
+ * maxLength 8
+ * pattern 0
+ * ordpattern XXXXX
+ * length 6
+ * foobar[space][space]
+ * length 3
+ * baz[space][space][space][space][space]
+ * ...
+ * 0,3,5
+ * 1,2
+ *
+ * 10
+ * ...
+ * </pre>
+ * so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
+ * a document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
+ * this is a comma-separated list, and it's padded with spaces to be fixed width. so trim() and split() it.
+ * and beware the empty string!
+ * an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
*
* the reader can just scan this file when it opens, skipping over the data blocks
* and saving the offset/etc for each field.
Modified: lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java?rev=1446977&r1=1446976&r2=1446977&view=diff
==============================================================================
--- lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java (original)
+++ lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java Sat Feb 16 23:09:19 2013
@@ -60,7 +60,7 @@ class SimpleTextDocValuesReader extends
int maxLength;
boolean fixedLength;
long minValue;
- int numValues;
+ long numValues;
};
final int maxDoc;
@@ -110,10 +110,10 @@ class SimpleTextDocValuesReader extends
field.pattern = stripPrefix(PATTERN);
field.dataStartFilePointer = data.getFilePointer();
data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength) * maxDoc);
- } else if (dvType == DocValuesType.SORTED) {
+ } else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
readLine();
assert startsWith(NUMVALUES);
- field.numValues = Integer.parseInt(stripPrefix(NUMVALUES));
+ field.numValues = Long.parseLong(stripPrefix(NUMVALUES));
readLine();
assert startsWith(MAXLENGTH);
field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
@@ -280,14 +280,87 @@ class SimpleTextDocValuesReader extends
@Override
public int getValueCount() {
- return field.numValues;
+ return (int)field.numValues;
}
};
}
@Override
- public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
- throw new UnsupportedOperationException(); // nocommit
+ public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException { // returns a view that decodes this field's fixed-width values and per-document ord lists on demand
+ final OneField field = fields.get(fieldInfo.name);
+
+ // SegmentCoreReaders already verifies this field is
+ // valid:
+ assert field != null;
+
+ final IndexInput in = data.clone(); // every seek/read below happens on this clone, not on data
+ final BytesRef scratch = new BytesRef(); // line buffer reused by each readLine call
+ final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT)); // parses the "length" lines using the field's pattern
+
+ return new SortedSetDocValues() {
+ String[] currentOrds = new String[0]; // ords of the current document, still as decimal text
+ int currentIndex = 0; // position of the next ord to hand out from currentOrds
+
+ @Override
+ public long nextOrd() {
+ if (currentIndex == currentOrds.length) {
+ return NO_MORE_ORDS;
+ } else {
+ return Long.parseLong(currentOrds[currentIndex++]);
+ }
+ }
+
+ @Override
+ public void setDocument(int docID) {
+ if (docID < 0 || docID >= maxDoc) {
+ throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
+ }
+ try {
+ in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + (long) docID * (1 + field.ordPattern.length())); // skip the values section, then docID fixed-width rows; (long) keeps the row offset out of int arithmetic, which would overflow for large docID
+ SimpleTextUtil.readLine(in, scratch);
+ String ordList = scratch.utf8ToString().trim(); // trim() drops the padding that makes each row fixed width
+ if (ordList.isEmpty()) {
+ currentOrds = new String[0]; // "".split(",") would yield one empty string, so handle the no-ords row explicitly
+ } else {
+ currentOrds = ordList.split(",");
+ }
+ currentIndex = 0;
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe); // setDocument declares no checked exception
+ }
+ }
+
+ @Override
+ public void lookupOrd(long ord, BytesRef result) {
+ try {
+ if (ord < 0 || ord >= field.numValues) {
+ throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
+ }
+ in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength)); // value entries are fixed width; ord is a long, so this product is already 64-bit
+ SimpleTextUtil.readLine(in, scratch);
+ assert StringHelper.startsWith(scratch, LENGTH): "got " + scratch.utf8ToString() + " in=" + in;
+ int len;
+ try {
+ len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue(); // text after the LENGTH prefix is the value's byte length
+ } catch (ParseException pe) {
+ CorruptIndexException e = new CorruptIndexException("failed to parse int length");
+ e.initCause(pe);
+ throw e;
+ }
+ result.bytes = new byte[len]; // fresh array per lookup; result does not alias scratch
+ result.offset = 0;
+ result.length = len;
+ in.readBytes(result.bytes, 0, len); // reads only len bytes, so the padding after the value is never copied
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+ }
+
+ @Override
+ public long getValueCount() {
+ return field.numValues;
+ }
+ };
+ }
@Override
Modified: lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java?rev=1446977&r1=1446976&r2=1446977&view=diff
==============================================================================
--- lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java (original)
+++ lucene/dev/branches/lucene4765/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java Sat Feb 16 23:09:19 2013
@@ -22,6 +22,7 @@ import java.math.BigInteger;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
@@ -252,7 +253,111 @@ class SimpleTextDocValuesWriter extends
@Override
public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
- throw new UnsupportedOperationException(); // nocommit
+ assert fieldSeen(field.name);
+ assert field.getDocValuesType() == DocValuesType.SORTED_SET;
+ writeFieldEntry(field, FieldInfo.DocValuesType.SORTED_SET);
+
+ long valueCount = 0;
+ int maxLength = 0;
+ for(BytesRef value : values) { // first pass over values: count them and find the longest byte length
+ maxLength = Math.max(maxLength, value.length);
+ valueCount++;
+ }
+
+ // write numValues (the number of entries in values)
+ SimpleTextUtil.write(data, NUMVALUES);
+ SimpleTextUtil.write(data, Long.toString(valueCount), scratch);
+ SimpleTextUtil.writeNewline(data);
+
+ // write maxLength (every value is padded to this many bytes below)
+ SimpleTextUtil.write(data, MAXLENGTH);
+ SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
+ SimpleTextUtil.writeNewline(data);
+
+ int maxBytesLength = Integer.toString(maxLength).length(); // number of decimal digits in maxLength
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < maxBytesLength; i++) {
+ sb.append('0'); // one '0' per digit, so every formatted length has the same width
+ }
+
+ // write our pattern for encoding lengths
+ SimpleTextUtil.write(data, PATTERN);
+ SimpleTextUtil.write(data, sb.toString(), scratch);
+ SimpleTextUtil.writeNewline(data);
+ final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
+
+ // compute ord pattern: dry run that builds every document's comma-separated ord list only to find the longest one
+ int maxOrdListLength = 0;
+ StringBuilder sb2 = new StringBuilder();
+ Iterator<Number> ordStream = ords.iterator();
+ for (Number n : docToOrdCount) {
+ sb2.setLength(0);
+ int count = n.intValue(); // how many entries of ords this document consumes
+ for (int i = 0; i < count; i++) {
+ long ord = ordStream.next().longValue();
+ if (sb2.length() > 0) {
+ sb2.append(",");
+ }
+ sb2.append(Long.toString(ord));
+ }
+ maxOrdListLength = Math.max(maxOrdListLength, sb2.length());
+ }
+
+ sb2.setLength(0);
+ for (int i = 0; i < maxOrdListLength; i++) {
+ sb2.append('X'); // the pattern is maxOrdListLength 'X' characters, one per character of the longest ord list
+ }
+
+ // write our pattern for ord lists
+ SimpleTextUtil.write(data, ORDPATTERN);
+ SimpleTextUtil.write(data, sb2.toString(), scratch);
+ SimpleTextUtil.writeNewline(data);
+
+ // for asserts:
+ long valuesSeen = 0;
+
+ for(BytesRef value : values) { // second pass over values: emit one length line plus one padded value line each
+ // write length
+ SimpleTextUtil.write(data, LENGTH);
+ SimpleTextUtil.write(data, encoder.format(value.length), scratch);
+ SimpleTextUtil.writeNewline(data);
+
+ // write bytes -- don't use SimpleText.write
+ // because it escapes:
+ data.writeBytes(value.bytes, value.offset, value.length);
+
+ // pad with spaces up to maxLength bytes
+ for (int i = value.length; i < maxLength; i++) {
+ data.writeByte((byte)' ');
+ }
+ SimpleTextUtil.writeNewline(data);
+ valuesSeen++;
+ assert valuesSeen <= valueCount;
+ }
+
+ assert valuesSeen == valueCount; // both passes over values must have seen the same number of entries
+
+ ordStream = ords.iterator(); // restart ords from the beginning; the dry run above consumed the first iterator
+
+ // write the ords for each doc comma-separated
+ for(Number n : docToOrdCount) {
+ sb2.setLength(0);
+ int count = n.intValue();
+ for (int i = 0; i < count; i++) {
+ long ord = ordStream.next().longValue();
+ if (sb2.length() > 0) {
+ sb2.append(",");
+ }
+ sb2.append(Long.toString(ord));
+ }
+ // now pad to fit: these are numbers so spaces work well. reader calls trim()
+ int numPadding = maxOrdListLength - sb2.length();
+ for (int i = 0; i < numPadding; i++) {
+ sb2.append(' ');
+ }
+ SimpleTextUtil.write(data, sb2.toString(), scratch);
+ SimpleTextUtil.writeNewline(data);
+ }
}
/** write the header for this field */
Modified: lucene/dev/branches/lucene4765/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4765/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java?rev=1446977&r1=1446976&r2=1446977&view=diff
==============================================================================
--- lucene/dev/branches/lucene4765/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java (original)
+++ lucene/dev/branches/lucene4765/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java Sat Feb 16 23:09:19 2013
@@ -55,7 +55,7 @@ import static org.apache.lucene.index.So
*/
// nocommit: should only be Lucene40 and Lucene41
// nocommit: move to BaseDocValuesTestCase, but allow these to be assume()d (for 4.0 and 4.1)
-@SuppressCodecs({ "Lucene40", "Lucene41", "SimpleText" })
+@SuppressCodecs({ "Lucene40", "Lucene41" })
public class TestDemoDocValue extends LuceneTestCase {
public void testSortedSetOneValue() throws IOException {