You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/11/15 18:05:56 UTC
svn commit: r1202305 - in /lucene/dev/branches/lucene2621/lucene/src: java/org/apache/lucene/index/codecs/simpletext/ test/org/apache/lucene/index/

Author: rmuir
Date: Tue Nov 15 17:05:56 2011
New Revision: 1202305

URL: http://svn.apache.org/viewvc?rev=1202305&view=rev
Log:
LUCENE-2621: SimpleText term vectors

Added:
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsFormat.java   (with props)
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsReader.java   (with props)
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsWriter.java   (with props)
Modified:
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java
    lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
    lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestTermVectorsReader.java

Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java?rev=1202305&r1=1202304&r2=1202305&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java Tue Nov 15 17:05:56 2011
@@ -19,7 +19,6 @@ package org.apache.lucene.index.codecs.s
 
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.DefaultDocValuesFormat;
-import org.apache.lucene.index.codecs.DefaultTermVectorsFormat;
 import org.apache.lucene.index.codecs.DocValuesFormat;
 import org.apache.lucene.index.codecs.FieldInfosFormat;
 import org.apache.lucene.index.codecs.PostingsFormat;
@@ -38,9 +37,7 @@ public final class SimpleTextCodec exten
   private final StoredFieldsFormat storedFields = new SimpleTextStoredFieldsFormat();
   private final SegmentInfosFormat segmentInfos = new SimpleTextSegmentInfosFormat();
   private final FieldInfosFormat fieldInfosFormat = new SimpleTextFieldInfosFormat();
-
-  // nocommit: need a plain-text impl
-  private final TermVectorsFormat vectorsFormat = new DefaultTermVectorsFormat();
+  private final TermVectorsFormat vectorsFormat = new SimpleTextTermVectorsFormat();
   // TODO: need a plain-text impl
   private final DocValuesFormat docValues = new DefaultDocValuesFormat();
   

Added: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsFormat.java?rev=1202305&view=auto
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsFormat.java (added)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsFormat.java Tue Nov 15 17:05:56 2011
@@ -0,0 +1,53 @@
+package org.apache.lucene.index.codecs.simpletext;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.codecs.TermVectorsFormat;
+import org.apache.lucene.index.codecs.TermVectorsReader;
+import org.apache.lucene.index.codecs.TermVectorsWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+
+/**
+ * plain text term vectors format.
+ * <p>
+ * <b><font color="red">FOR RECREATIONAL USE ONLY</font></B>
+ * @lucene.experimental
+ */
+public class SimpleTextTermVectorsFormat extends TermVectorsFormat {
+
+  @Override
+  public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
+    return new SimpleTextTermVectorsReader(directory, segmentInfo, fieldInfos, context);
+  }
+
+  @Override
+  public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
+    return new SimpleTextTermVectorsWriter(directory, segment, context);
+  }
+
+  @Override
+  public void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
+    SimpleTextTermVectorsReader.files(dir, info, files);
+  }
+}

Added: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsReader.java?rev=1202305&view=auto
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsReader.java (added)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsReader.java Tue Nov 15 17:05:56 2011
@@ -0,0 +1,526 @@
+package org.apache.lucene.index.codecs.simpletext;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.codecs.TermVectorsReader;
+import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.UnicodeUtil;
+
+import static org.apache.lucene.index.codecs.simpletext.SimpleTextTermVectorsWriter.*;
+
+/**
+ * Reads plain-text term vectors.
+ * <p>
+ * <b><font color="red">FOR RECREATIONAL USE ONLY</font></B>
+ * @lucene.experimental
+ */
+public class SimpleTextTermVectorsReader extends TermVectorsReader {
+  private ArrayList<Long> offsets; /* docid -> offset in .vec file */
+  private IndexInput in;
+  private BytesRef scratch = new BytesRef();
+  private CharsRef scratchUTF16 = new CharsRef();
+  
+  public SimpleTextTermVectorsReader(Directory directory, SegmentInfo si, FieldInfos fieldInfos, IOContext context) throws IOException {
+    boolean success = false;
+    try {
+      in = directory.openInput(IndexFileNames.segmentFileName(si.name, "", VECTORS_EXTENSION), context);
+      success = true;
+    } finally {
+      if (!success) {
+        close();
+      }
+    }
+    readIndex();
+  }
+  
+  // used by clone
+  SimpleTextTermVectorsReader(ArrayList<Long> offsets, IndexInput in) {
+    this.offsets = offsets;
+    this.in = in;
+  }
+  
+  // we don't actually write a .tvx-like index, instead we read the 
+  // vectors file in entirety up-front and save the offsets 
+  // so we can seek to the data later.
+  private void readIndex() throws IOException {
+    offsets = new ArrayList<Long>();
+    while (!scratch.equals(END)) {
+      readLine();
+      if (scratch.startsWith(DOC)) {
+        offsets.add(in.getFilePointer());
+      }
+    }
+  }
+  
+  @Override
+  public Fields get(int doc) throws IOException {
+    // TestTV tests for this in testBadParams... but is this
+    // really guaranteed by the API?
+    if (doc < 0 || doc >= offsets.size()) {
+      throw new IllegalArgumentException("doc id out of range");
+    }
+
+    SortedMap<String,SimpleTVTerms> fields = new TreeMap<String,SimpleTVTerms>();
+    in.seek(offsets.get(doc));
+    readLine();
+    assert scratch.startsWith(NUMFIELDS);
+    int numFields = parseIntAt(NUMFIELDS.length);
+    if (numFields == 0) {
+      return null; // no vectors for this doc
+    }
+    for (int i = 0; i < numFields; i++) {
+      readLine();
+      assert scratch.startsWith(FIELD);
+      int fieldNumber = parseIntAt(FIELD.length);
+      
+      readLine();
+      assert scratch.startsWith(FIELDNAME);
+      String fieldName = readString(FIELDNAME.length, scratch);
+      
+      readLine();
+      assert scratch.startsWith(FIELDPOSITIONS);
+      boolean positions = Boolean.parseBoolean(readString(FIELDPOSITIONS.length, scratch));
+      
+      readLine();
+      assert scratch.startsWith(FIELDOFFSETS);
+      boolean offsets = Boolean.parseBoolean(readString(FIELDOFFSETS.length, scratch));
+      
+      readLine();
+      assert scratch.startsWith(FIELDTERMCOUNT);
+      int termCount = parseIntAt(FIELDTERMCOUNT.length);
+      
+      SimpleTVTerms terms = new SimpleTVTerms();
+      fields.put(fieldName, terms);
+      
+      for (int j = 0; j < termCount; j++) {
+        readLine();
+        assert scratch.startsWith(TERMTEXT);
+        BytesRef term = new BytesRef();
+        int termLength = scratch.length - TERMTEXT.length;
+        term.grow(termLength);
+        term.length = termLength;
+        System.arraycopy(scratch.bytes, scratch.offset+TERMTEXT.length, term.bytes, term.offset, termLength);
+        
+        SimpleTVPostings postings = new SimpleTVPostings();
+        terms.terms.put(term, postings);
+        
+        readLine();
+        assert scratch.startsWith(TERMFREQ);
+        postings.freq = parseIntAt(TERMFREQ.length);
+        
+        if (positions || offsets) {
+          if (positions) {
+            postings.positions = new int[postings.freq];
+          }
+        
+          if (offsets) {
+            postings.startOffsets = new int[postings.freq];
+            postings.endOffsets = new int[postings.freq];
+          }
+          
+          for (int k = 0; k < postings.freq; k++) {
+            if (positions) {
+              readLine();
+              assert scratch.startsWith(POSITION);
+              postings.positions[k] = parseIntAt(POSITION.length);
+            }
+            
+            if (offsets) {
+              readLine();
+              assert scratch.startsWith(STARTOFFSET);
+              postings.startOffsets[k] = parseIntAt(STARTOFFSET.length);
+              
+              readLine();
+              assert scratch.startsWith(ENDOFFSET);
+              postings.endOffsets[k] = parseIntAt(ENDOFFSET.length);
+            }
+          }
+        }
+      }
+    }
+    return new SimpleTVFields(fields);
+  }
+
+  @Override
+  public TermVectorsReader clone() {
+    if (in == null) {
+      throw new AlreadyClosedException("this TermVectorsReader is closed");
+    }
+    return new SimpleTextTermVectorsReader(offsets, (IndexInput) in.clone());
+  }
+  
+  @Override
+  public void close() throws IOException {
+    try {
+      IOUtils.close(in); 
+    } finally {
+      in = null;
+      offsets = null;
+    }
+  }
+  
+  public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
+    if (info.getHasVectors()) {
+      files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_EXTENSION));
+    }
+  }
+  
+  private void readLine() throws IOException {
+    SimpleTextUtil.readLine(in, scratch);
+  }
+  
+  private int parseIntAt(int offset) throws IOException {
+    UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+offset, scratch.length-offset, scratchUTF16);
+    return ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
+  }
+  
+  private String readString(int offset, BytesRef scratch) {
+    UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+offset, scratch.length-offset, scratchUTF16);
+    return scratchUTF16.toString();
+  }
+  
+  private class SimpleTVFields extends Fields {
+    private final SortedMap<String,SimpleTVTerms> fields;
+    
+    SimpleTVFields(SortedMap<String,SimpleTVTerms> fields) throws IOException {
+      this.fields = fields;
+    }
+
+    @Override
+    public FieldsEnum iterator() throws IOException {
+      return new FieldsEnum() {
+        private Iterator<Map.Entry<String,SimpleTVTerms>> iterator = fields.entrySet().iterator();
+        private Map.Entry<String,SimpleTVTerms> current = null;
+        
+        @Override
+        public String next() throws IOException {
+          if (!iterator.hasNext()) {
+            return null;
+          } else {
+            current = iterator.next();
+            return current.getKey();
+          }
+        }
+
+        @Override
+        public Terms terms() throws IOException {
+          return current.getValue();
+        }
+      };
+    }
+
+    @Override
+    public Terms terms(String field) throws IOException {
+      return fields.get(field);
+    }
+
+    @Override
+    public int getUniqueFieldCount() throws IOException {
+      return fields.size();
+    }
+  }
+  
+  private static class SimpleTVTerms extends Terms {
+    final SortedMap<BytesRef,SimpleTVPostings> terms;
+    
+    SimpleTVTerms() {
+      terms = new TreeMap<BytesRef,SimpleTVPostings>();
+    }
+    
+    @Override
+    public TermsEnum iterator(TermsEnum reuse) throws IOException {
+      // TODO: reuse
+      return new SimpleTVTermsEnum(terms);
+    }
+
+    @Override
+    public Comparator<BytesRef> getComparator() throws IOException {
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
+    }
+
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return terms.size();
+    }
+
+    @Override
+    public long getSumTotalTermFreq() throws IOException {
+      return -1;
+    }
+
+    @Override
+    public long getSumDocFreq() throws IOException {
+      return terms.size();
+    }
+
+    @Override
+    public int getDocCount() throws IOException {
+      return 1;
+    }
+  }
+  
+  private static class SimpleTVPostings {
+    private int freq;
+    private int positions[];
+    private int startOffsets[];
+    private int endOffsets[];
+  }
+  
+  private static class SimpleTVTermsEnum extends TermsEnum {
+    SortedMap<BytesRef,SimpleTVPostings> terms;
+    Iterator<Map.Entry<BytesRef,SimpleTextTermVectorsReader.SimpleTVPostings>> iterator;
+    Map.Entry<BytesRef,SimpleTextTermVectorsReader.SimpleTVPostings> current;
+    
+    SimpleTVTermsEnum(SortedMap<BytesRef,SimpleTVPostings> terms) {
+      this.terms = terms;
+      this.iterator = terms.entrySet().iterator();
+    }
+    
+    @Override
+    public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException {
+      iterator = terms.tailMap(text).entrySet().iterator();
+      if (!iterator.hasNext()) {
+        return SeekStatus.END;
+      } else {
+        return next().equals(text) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
+      }
+    }
+
+    @Override
+    public void seekExact(long ord) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public BytesRef next() throws IOException {
+      if (!iterator.hasNext()) {
+        return null;
+      } else {
+        current = iterator.next();
+        return current.getKey();
+      }
+    }
+
+    @Override
+    public BytesRef term() throws IOException {
+      return current.getKey();
+    }
+
+    @Override
+    public long ord() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int docFreq() throws IOException {
+      return 1;
+    }
+
+    @Override
+    public long totalTermFreq() throws IOException {
+      return current.getValue().freq;
+    }
+
+    @Override
+    public DocsEnum docs(Bits liveDocs, DocsEnum reuse) throws IOException {
+      // TODO: reuse
+      SimpleTVDocsEnum e = new SimpleTVDocsEnum();
+      e.reset(liveDocs, current.getValue().freq);
+      return e;
+    }
+
+    @Override
+    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
+      SimpleTVPostings postings = current.getValue();
+      if (postings.positions == null && postings.startOffsets == null) {
+        return null;
+      }
+      // TODO: reuse
+      SimpleTVDocsAndPositionsEnum e = new SimpleTVDocsAndPositionsEnum(postings.startOffsets != null);
+      e.reset(liveDocs, postings.positions, postings.startOffsets, postings.endOffsets);
+      return e;
+    }
+
+    @Override
+    public Comparator<BytesRef> getComparator() throws IOException {
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
+    }
+  }
+  
+  // note: these two enum classes are exactly like the Default impl...
+  private static class SimpleTVDocsEnum extends DocsEnum {
+    private boolean didNext;
+    private int freq;
+    private Bits liveDocs;
+
+    @Override
+    public int freq() {
+      return freq;
+    }
+
+    @Override
+    public int docID() {
+      return 0;
+    }
+
+    @Override
+    public int nextDoc() {
+      if (!didNext && (liveDocs == null || liveDocs.get(0))) {
+        didNext = true;
+        return 0;
+      } else {
+        return NO_MORE_DOCS;
+      }
+    }
+
+    @Override
+    public int advance(int target) {
+      if (!didNext && target == 0) {
+        return nextDoc();
+      } else {
+        return NO_MORE_DOCS;
+      }
+    }
+
+    public void reset(Bits liveDocs, int freq) {
+      this.liveDocs = liveDocs;
+      this.freq = freq;
+      didNext = false;
+    }
+  }
+  
+  private static class SimpleTVDocsAndPositionsEnum extends DocsAndPositionsEnum {
+    private final OffsetAttribute offsetAtt;
+    private boolean didNext;
+    private int nextPos;
+    private Bits liveDocs;
+    private int[] positions;
+    private int[] startOffsets;
+    private int[] endOffsets;
+
+    public SimpleTVDocsAndPositionsEnum(boolean storeOffsets) {
+      if (storeOffsets) {
+        offsetAtt = attributes().addAttribute(OffsetAttribute.class);
+      } else {
+        offsetAtt = null;
+      }
+    }
+
+    public boolean canReuse(boolean storeOffsets) {
+      return storeOffsets == (offsetAtt != null);
+    }
+
+    @Override
+    public int freq() {
+      if (positions != null) {
+        return positions.length;
+      } else {
+        assert startOffsets != null;
+        return startOffsets.length;
+      }
+    }
+
+    @Override
+    public int docID() {
+      return 0;
+    }
+
+    @Override
+    public int nextDoc() {
+      if (!didNext && (liveDocs == null || liveDocs.get(0))) {
+        didNext = true;
+        return 0;
+      } else {
+        return NO_MORE_DOCS;
+      }
+    }
+
+    @Override
+    public int advance(int target) {
+      if (!didNext && target == 0) {
+        return nextDoc();
+      } else {
+        return NO_MORE_DOCS;
+      }
+    }
+
+    public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
+      this.liveDocs = liveDocs;
+      this.positions = positions;
+      this.startOffsets = startOffsets;
+      assert (offsetAtt != null) == (startOffsets != null);
+      this.endOffsets = endOffsets;
+      didNext = false;
+      nextPos = 0;
+    }
+
+    @Override
+    public BytesRef getPayload() {
+      return null;
+    }
+
+    @Override
+    public boolean hasPayload() {
+      return false;
+    }
+
+    @Override
+    public int nextPosition() {
+      assert (positions != null && nextPos < positions.length) ||
+        startOffsets != null && nextPos < startOffsets.length;
+
+      if (startOffsets != null) {
+        offsetAtt.setOffset(startOffsets[nextPos],
+                            endOffsets[nextPos]);
+      }
+      if (positions != null) {
+        return positions[nextPos++];
+      } else {
+        nextPos++;
+        return -1;
+      }
+    }
+  }
+}

Added: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsWriter.java?rev=1202305&view=auto
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsWriter.java (added)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextTermVectorsWriter.java Tue Nov 15 17:05:56 2011
@@ -0,0 +1,179 @@
+package org.apache.lucene.index.codecs.simpletext;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.codecs.TermVectorsWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Writes plain-text term vectors.
+ * <p>
+ * <b><font color="red">FOR RECREATIONAL USE ONLY</font></B>
+ * @lucene.experimental
+ */
+public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
+  
+  static final BytesRef END                = new BytesRef("END");
+  static final BytesRef DOC                = new BytesRef("doc ");
+  static final BytesRef NUMFIELDS          = new BytesRef("  numfields ");
+  static final BytesRef FIELD              = new BytesRef("  field ");
+  static final BytesRef FIELDNAME          = new BytesRef("    name ");
+  static final BytesRef FIELDPOSITIONS     = new BytesRef("    positions ");
+  static final BytesRef FIELDOFFSETS       = new BytesRef("    offsets   ");
+  static final BytesRef FIELDTERMCOUNT     = new BytesRef("    numterms ");
+  static final BytesRef TERMTEXT           = new BytesRef("    term ");
+  static final BytesRef TERMFREQ           = new BytesRef("      freq ");
+  static final BytesRef POSITION           = new BytesRef("      position ");
+  static final BytesRef STARTOFFSET        = new BytesRef("        startoffset ");
+  static final BytesRef ENDOFFSET          = new BytesRef("        endoffset ");
+
+  static final String VECTORS_EXTENSION = "vec";
+  
+  private final Directory directory;
+  private final String segment;
+  private IndexOutput out;
+  private int numDocsWritten = 0;
+  private final BytesRef scratch = new BytesRef();
+  private boolean offsets;
+  private boolean positions;
+
+  public SimpleTextTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
+    this.directory = directory;
+    this.segment = segment;
+    out = directory.createOutput(IndexFileNames.segmentFileName(segment, "", VECTORS_EXTENSION), context);
+  }
+  
+  @Override
+  public void startDocument(int numVectorFields) throws IOException {
+    write(DOC);
+    write(Integer.toString(numDocsWritten));
+    newLine();
+    
+    write(NUMFIELDS);
+    write(Integer.toString(numVectorFields));
+    newLine();
+    numDocsWritten++;
+  }
+
+  @Override
+  public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException {  
+    write(FIELD);
+    write(Integer.toString(info.number));
+    newLine();
+    
+    write(FIELDNAME);
+    write(info.name);
+    newLine();
+    
+    write(FIELDPOSITIONS);
+    write(Boolean.toString(positions));
+    newLine();
+    
+    write(FIELDOFFSETS);
+    write(Boolean.toString(offsets));
+    newLine();
+    
+    write(FIELDTERMCOUNT);
+    write(Integer.toString(numTerms));
+    newLine();
+    
+    this.positions = positions;
+    this.offsets = offsets;
+  }
+
+  @Override
+  public void startTerm(BytesRef term, int freq) throws IOException {
+    write(TERMTEXT);
+    write(term);
+    newLine();
+    
+    write(TERMFREQ);
+    write(Integer.toString(freq));
+    newLine();
+  }
+
+  @Override
+  public void addPosition(int position, int startOffset, int endOffset) throws IOException {
+    assert positions || offsets;
+    
+    if (positions) {
+      write(POSITION);
+      write(Integer.toString(position));
+      newLine();
+    }
+    
+    if (offsets) {
+      write(STARTOFFSET);
+      write(Integer.toString(startOffset));
+      newLine();
+      
+      write(ENDOFFSET);
+      write(Integer.toString(endOffset));
+      newLine();
+    }
+  }
+
+  @Override
+  public void abort() {
+    try {
+      close();
+    } catch (IOException ignored) {}
+    
+    try {
+      directory.deleteFile(IndexFileNames.segmentFileName(segment, "", VECTORS_EXTENSION));
+    } catch (IOException ignored) {}
+  }
+
+  @Override
+  public void finish(int numDocs) throws IOException {
+    if (numDocsWritten != numDocs) {
+      throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + numDocs + " but vec numDocs is " + numDocsWritten + " file=" + out.toString() + "; now aborting this merge to prevent index corruption");
+    }
+    write(END);
+    newLine();
+  }
+  
+  @Override
+  public void close() throws IOException {
+    try {
+      IOUtils.close(out);
+    } finally {
+      out = null;
+    }
+  }
+  
+  private void write(String s) throws IOException {
+    SimpleTextUtil.write(out, s, scratch);
+  }
+  
+  private void write(BytesRef bytes) throws IOException {
+    SimpleTextUtil.write(out, bytes);
+  }
+  
+  private void newLine() throws IOException {
+    SimpleTextUtil.writeNewline(out);
+  }
+}

Modified: lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=1202305&r1=1202304&r2=1202305&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java Tue Nov 15 17:05:56 2011
@@ -1075,7 +1075,7 @@ public class TestIndexWriter extends Luc
     assertEquals(100, dpEnum.nextPosition());
 
     assertNotNull(termsEnum.next());
-    termsEnum.docsAndPositions(null, dpEnum);
+    dpEnum = termsEnum.docsAndPositions(null, dpEnum);
     assertNotNull(dpEnum);
     assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
     assertEquals(1, dpEnum.freq());

Modified: lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestTermVectorsReader.java?rev=1202305&r1=1202304&r2=1202305&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestTermVectorsReader.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/test/org/apache/lucene/index/TestTermVectorsReader.java Tue Nov 15 17:05:56 2011
@@ -33,6 +33,7 @@ import org.apache.lucene.document.FieldT
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.DefaultTermVectorsReader;
+import org.apache.lucene.index.codecs.TermVectorsReader;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
@@ -197,7 +198,7 @@ public class TestTermVectorsReader exten
   }
 
   public void testReader() throws IOException {
-    DefaultTermVectorsReader reader = new DefaultTermVectorsReader(dir, seg, fieldInfos, newIOContext(random));
+    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg, fieldInfos, newIOContext(random));
     for (int j = 0; j < 5; j++) {
       Terms vector = reader.get(j).terms(testFields[0]);
       assertNotNull(vector);
@@ -216,7 +217,7 @@ public class TestTermVectorsReader exten
   }
 
   public void testPositionReader() throws IOException {
-    DefaultTermVectorsReader reader = new DefaultTermVectorsReader(dir, seg, fieldInfos, newIOContext(random));
+    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg, fieldInfos, newIOContext(random));
     BytesRef[] terms;
     Terms vector = reader.get(0).terms(testFields[0]);
     assertNotNull(vector);
@@ -269,7 +270,7 @@ public class TestTermVectorsReader exten
   }
 
   public void testOffsetReader() throws IOException {
-    DefaultTermVectorsReader reader = new DefaultTermVectorsReader(dir, seg, fieldInfos, newIOContext(random));
+    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg, fieldInfos, newIOContext(random));
     Terms vector = reader.get(0).terms(testFields[0]);
     assertNotNull(vector);
     TermsEnum termsEnum = vector.iterator(null);
@@ -311,9 +312,9 @@ public class TestTermVectorsReader exten
    * Make sure exceptions and bad params are handled appropriately
    */
   public void testBadParams() throws IOException {
-    DefaultTermVectorsReader reader = null;
+    TermVectorsReader reader = null;
     try {
-      reader = new DefaultTermVectorsReader(dir, seg, fieldInfos, newIOContext(random));
+      reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg, fieldInfos, newIOContext(random));
       //Bad document number, good field number
       reader.get(50);
       fail();
@@ -322,7 +323,7 @@ public class TestTermVectorsReader exten
     } finally {
       reader.close();
     }
-    reader = new DefaultTermVectorsReader(dir, seg, fieldInfos, newIOContext(random));
+    reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg, fieldInfos, newIOContext(random));
     //good document number, bad field
     Terms vector = reader.get(0).terms("f50");
     assertNull(vector);