Posted to java-commits@lucene.apache.org by gs...@apache.org on 2007/07/23 05:17:27 UTC

svn commit: r558592 - in /lucene/java/trunk: ./ contrib/memory/src/java/org/apache/lucene/index/memory/ src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/ src/test/org/apache/lucene/search/

Author: gsingers
Date: Sun Jul 22 20:17:25 2007
New Revision: 558592

URL: http://svn.apache.org/viewvc?view=rev&rev=558592
Log:
LUCENE-868: New Term Vector access mechanism.  Allows applications to define how they access term vector information instead of having to pack/unpack the term vector info returned by the old API.
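
A minimal usage sketch of the new entry points, assuming an index whose fields store term vectors; the method name dumpVectors is illustrative only, and everything else comes from classes added or declared in this revision.

    import java.io.IOException;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.SortedSet;

    import org.apache.lucene.index.FieldSortedTermVectorMapper;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.TermVectorEntry;
    import org.apache.lucene.index.TermVectorEntryFreqSortedComparator;

    /** Prints each field's terms for one document, most frequent first. */
    public static void dumpVectors(IndexReader reader, int docNumber) throws IOException {
      FieldSortedTermVectorMapper mapper =
          new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
      reader.getTermFreqVector(docNumber, mapper);    // maps all vectorized fields of the doc

      Map fieldToTerms = mapper.getFieldToTerms();    // field name -> SortedSet of TermVectorEntry
      for (Iterator fields = fieldToTerms.entrySet().iterator(); fields.hasNext();) {
        Map.Entry e = (Map.Entry) fields.next();
        for (Iterator terms = ((SortedSet) e.getValue()).iterator(); terms.hasNext();) {
          TermVectorEntry entry = (TermVectorEntry) terms.next();
          System.out.println(e.getKey() + ": " + entry.getTerm() + " (" + entry.getFrequency() + ")");
        }
      }
    }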

Added:
    lucene/java/trunk/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/index/SortedTermVectorMapper.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntry.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorMapper.java   (with props)
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexReader.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestTermVectorsReader.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Jul 22 20:17:25 2007
@@ -54,6 +54,10 @@
 
  2. LUCENE-960: Added a SpanQueryFilter and related classes to allow for not only filtering, but knowing where in a Document a Filter matches (Grant Ingersoll)
 
+ 3. LUCENE-868: Added new Term Vector access features.  A new callback mechanism allows applications to define how and where to read Term Vectors from disk.
+    This implementation contains several extensions of the new abstract TermVectorMapper class.  The new API should be backward-compatible.  No changes to the
+    actual storage of Term Vectors have taken place.
+
 Optimizations
 
  1. LUCENE-937: CachingTokenFilter now uses an iterator to access the 

Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Sun Jul 22 20:17:25 2007
@@ -17,6 +17,16 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
@@ -30,22 +40,13 @@
 import org.apache.lucene.index.TermFreqVector;
 import org.apache.lucene.index.TermPositionVector;
 import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.index.TermVectorMapper;
 import org.apache.lucene.search.HitCollector;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.Similarity;
 
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-
 /**
  * High-performance single-document main memory Apache Lucene fulltext search index. 
  * 
@@ -935,8 +936,47 @@
       }
       return vectors;
     }
-    
-    public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) {
+
+    public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+      if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVectors");
+
+      //      if (vectors.length == 0) return null;
+      for (Iterator iterator = fields.keySet().iterator(); iterator.hasNext();) {
+        String fieldName = (String) iterator.next();
+        getTermFreqVector(docNumber, fieldName, mapper);
+      }
+    }
+
+    public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+      if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
+      final Info info = getInfo(field);
+      if (info == null) {
+        return;
+      }
+      info.sortTerms();
+      mapper.setExpectations(field, info.sortedTerms.length, stride != 1, true);
+      for (int i = info.sortedTerms.length; --i >= 0;) {
+        ArrayIntList positions = (ArrayIntList) info.sortedTerms[i].getValue();
+        int size = positions.size();
+        org.apache.lucene.index.TermVectorOffsetInfo[] offsets =
+          new org.apache.lucene.index.TermVectorOffsetInfo[size / stride];
+
+        for (int k = 0, j = 1; j < size; k++, j += stride) {
+          int start = positions.get(j);
+          int end = positions.get(j + 1);
+          offsets[k] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end);
+        }
+        mapper.map((String) info.sortedTerms[i].getKey(),
+                   numPositions((ArrayIntList) info.sortedTerms[i].getValue()),
+                   offsets, ((ArrayIntList) info.sortedTerms[i].getValue()).toArray(stride));
+      }
+    }
+
+    public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) {
       if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
       final Info info = getInfo(fieldName);
       if (info == null) return null; // TODO: or return empty vector impl???
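
The same mapper callbacks also work against MemoryIndex's internal reader, for example (a sketch; the field name and text are made up, and the caller is assumed to handle IOException):

    // needs: org.apache.lucene.analysis.standard.StandardAnalyzer, org.apache.lucene.index.*,
    //        org.apache.lucene.index.memory.MemoryIndex
    MemoryIndex index = new MemoryIndex();
    index.addField("content", "one two two three three three", new StandardAnalyzer());
    IndexReader reader = index.createSearcher().getIndexReader();

    // Collect every term of the single in-memory document (doc 0), field-agnostic,
    // sorted by frequency with the comparator added in this revision.
    SortedTermVectorMapper mapper =
        new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
    reader.getTermFreqVector(0, mapper);
    System.out.println(mapper.getTermVectorEntrySet());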

Added: lucene/java/trunk/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java?view=auto&rev=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java Sun Jul 22 20:17:25 2007
@@ -0,0 +1,70 @@
+package org.apache.lucene.index;
+
+import java.util.*;
+
+/**
+ * Copyright 2007 The Apache Software Foundation
+ * <p/>
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * For each Field, store a sorted collection of {@link TermVectorEntry}s
+ * <p/>
+ * This is not thread-safe.
+ */
+public class FieldSortedTermVectorMapper extends TermVectorMapper{
+  private Map fieldToTerms = new HashMap();
+  private SortedSet currentSet;
+  private String currentField;
+  private Comparator comparator;
+
+  /**
+   *
+   * @param comparator A Comparator for sorting {@link TermVectorEntry}s
+   */
+  public FieldSortedTermVectorMapper(Comparator comparator) {
+    this(false, false, comparator);
+  }
+
+
+  public FieldSortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) {
+    super(ignoringPositions, ignoringOffsets);
+    this.comparator = comparator;
+  }
+
+  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+    TermVectorEntry entry = new TermVectorEntry(currentField, term, frequency, offsets, positions);
+    currentSet.add(entry);
+  }
+
+  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+    currentSet = new TreeSet(comparator);
+    currentField = field;
+    fieldToTerms.put(field, currentSet);
+  }
+
+  /**
+   * Get the mapping between fields and terms, sorted by the comparator
+   *
+   * @return A map between field names and {@link java.util.SortedSet}s per field.  SortedSet entries are {@link TermVectorEntry}
+   */
+  public Map getFieldToTerms() {
+    return fieldToTerms;
+  }
+
+
+  public Comparator getComparator() {
+    return comparator;
+  }
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FilterIndexReader.java Sun Jul 22 20:17:25 2007
@@ -115,6 +115,18 @@
     return in.getTermFreqVector(docNumber, field);
   }
 
+
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    in.getTermFreqVector(docNumber, field, mapper);
+
+  }
+
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    in.getTermFreqVector(docNumber, mapper);
+  }
+
   public int numDocs() {
     // Don't call ensureOpen() here (it could affect performance)
     return in.numDocs();

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java Sun Jul 22 20:17:25 2007
@@ -20,12 +20,7 @@
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.search.Similarity;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.Lock;
-import org.apache.lucene.store.LockObtainFailedException;
-import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.store.*;
 
 import java.io.File;
 import java.io.FileOutputStream;
@@ -384,6 +379,25 @@
    */
   abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
           throws IOException;
+
+  /**
+   * Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of
+   * the {@link TermFreqVector}.
+   * @param docNumber The number of the document to load the vector for
+   * @param field The name of the field to load
+   * @param mapper The {@link TermVectorMapper} to process the vector.  Must not be null
+   * @throws IOException if term vectors cannot be accessed or if they do not exist on the specified field and document.
+   * 
+   */
+  abstract public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException;
+
+  /**
+   * Map all the term vectors for all fields in a Document
+   * @param docNumber The number of the document to load the vector for
+   * @param mapper The {@link TermVectorMapper} to process the vector.  Must not be null
+   * @throws IOException if term vectors cannot be accessed or if they do not exist on the specified field and document.
+   */
+  abstract public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException;
 
   /**
    * Returns <code>true</code> if an index exists at the specified directory.

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/MultiReader.java Sun Jul 22 20:17:25 2007
@@ -85,6 +85,19 @@
     return subReaders[i].getTermFreqVector(n - starts[i], field);
   }
 
+
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    int i = readerIndex(docNumber);        // find segment num
+    subReaders[i].getTermFreqVector(docNumber - starts[i], field, mapper);
+  }
+
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    int i = readerIndex(docNumber);        // find segment num
+    subReaders[i].getTermFreqVector(docNumber - starts[i], mapper);
+  }
+
   public synchronized int numDocs() {
     // Don't call ensureOpen() here (it could affect performance)
     if (numDocs == -1) {        // check cache

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/ParallelReader.java Sun Jul 22 20:17:25 2007
@@ -194,6 +194,29 @@
     return reader==null ? null : reader.getTermFreqVector(n, field);
   }
 
+
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader != null) {
+      reader.getTermFreqVector(docNumber, field, mapper); 
+    }
+  }
+
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+
+    Iterator i = fieldToReader.entrySet().iterator();
+    while (i.hasNext()) {
+      Map.Entry e = (Map.Entry)i.next();
+      String field = (String)e.getKey();
+      IndexReader reader = (IndexReader)e.getValue();
+      reader.getTermFreqVector(docNumber, field, mapper);
+    }
+
+  }
+
   public boolean hasNorms(String field) throws IOException {
     ensureOpen();
     IndexReader reader = ((IndexReader)fieldToReader.get(field));

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java Sun Jul 22 20:17:25 2007
@@ -20,10 +20,10 @@
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.util.BitVector;
 
 import java.io.IOException;
@@ -642,6 +642,35 @@
     return termVectorsReader.get(docNumber, field);
   }
 
+
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    FieldInfo fi = fieldInfos.fieldInfo(field);
+    if (fi == null || !fi.storeTermVector || termVectorsReaderOrig == null)
+      throw new IOException("field does not contain term vectors");
+
+    TermVectorsReader termVectorsReader = getTermVectorsReader();
+    if (termVectorsReader == null)
+    {
+      throw new IOException("Cannot open a reader for the term vectors");
+    }
+
+
+    termVectorsReader.get(docNumber, field, mapper);
+  }
+
+
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    if (termVectorsReaderOrig == null)
+      return;
+
+    TermVectorsReader termVectorsReader = getTermVectorsReader();
+    if (termVectorsReader == null)
+      return;
+
+    termVectorsReader.get(docNumber, mapper);
+  }
 
   /** Return an array of term frequency vectors for the specified document.
    *  The array contains a vector for each vectorized field in the document.

Added: lucene/java/trunk/src/java/org/apache/lucene/index/SortedTermVectorMapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SortedTermVectorMapper.java?view=auto&rev=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SortedTermVectorMapper.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SortedTermVectorMapper.java Sun Jul 22 20:17:25 2007
@@ -0,0 +1,129 @@
+package org.apache.lucene.index;
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+
+/**
+ * Store a sorted collection of {@link org.apache.lucene.index.TermVectorEntry}s.  Collects all term information
+ * into a single SortedSet.
+ * <br/>
+ * NOTE: This Mapper ignores all Field information for the Document.  This means that if you are using offsets/positions you will not
+ * know what Fields they correlate with.
+ * <br/>
+ * This is not thread-safe.
+ */
+public class SortedTermVectorMapper extends TermVectorMapper{
+
+
+  private SortedSet currentSet;
+  private Map termToTVE = new HashMap();
+  private boolean storeOffsets;
+  private boolean storePositions;
+  /**
+   * Stand-in name for the field in {@link TermVectorEntry}.
+   */
+  public static final String ALL = "_ALL_";
+
+  /**
+   *
+   * @param comparator A Comparator for sorting {@link TermVectorEntry}s
+   */
+  public SortedTermVectorMapper(Comparator comparator) {
+    this(false, false, comparator);
+  }
+
+
+  public SortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) {
+    super(ignoringPositions, ignoringOffsets);
+    currentSet = new TreeSet(comparator);
+  }
+
+  /**
+   *
+   * @param term The term to map
+   * @param frequency The frequency of the term
+   * @param offsets Offset information, may be null
+   * @param positions Position information, may be null
+   */
+  //We need to combine any previous mentions of the term
+  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+    TermVectorEntry entry = (TermVectorEntry) termToTVE.get(term);
+    if (entry == null) {
+      entry = new TermVectorEntry(ALL, term, frequency, 
+              storeOffsets == true ? offsets : null,
+              storePositions == true ? positions : null);
+      termToTVE.put(term, entry);
+      currentSet.add(entry);
+    } else {
+      entry.setFrequency(entry.getFrequency() + frequency);
+      if (storeOffsets)
+      {
+        TermVectorOffsetInfo [] existingOffsets = entry.getOffsets();
+        //A few diff. cases here:  offsets is null, existing offsets is null, both are null, same for positions
+        if (existingOffsets != null && offsets != null && offsets.length > 0)
+        {
+          //copy over the existing offsets
+          TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[existingOffsets.length + offsets.length];
+          System.arraycopy(existingOffsets, 0, newOffsets, 0, existingOffsets.length);
+          System.arraycopy(offsets, 0, newOffsets, existingOffsets.length, offsets.length);
+          entry.setOffsets(newOffsets);
+        }
+        else if (existingOffsets == null && offsets != null && offsets.length > 0)
+        {
+          entry.setOffsets(offsets);
+        }
+        //else leave it alone
+      }
+      if (storePositions)
+      {
+        int [] existingPositions = entry.getPositions();
+        if (existingPositions != null && positions != null && positions.length > 0)
+        {
+          int [] newPositions = new int[existingPositions.length + positions.length];
+          System.arraycopy(existingPositions, 0, newPositions, 0, existingPositions.length);
+          System.arraycopy(positions, 0, newPositions, existingPositions.length, positions.length);
+          entry.setPositions(newPositions);
+        }
+        else if (existingPositions == null && positions != null && positions.length > 0)
+        {
+          entry.setPositions(positions);
+        }
+      }
+    }
+
+
+  }
+
+  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+
+    this.storeOffsets = storeOffsets;
+    this.storePositions = storePositions;
+  }
+
+  /**
+   * The TermVectorEntrySet.  A SortedSet of {@link TermVectorEntry} objects.  Sort is by the comparator passed into the constructor.
+   *<br/>
+   * This set will be empty until after the mapping process takes place.
+   *
+   * @return The SortedSet of {@link TermVectorEntry}.
+   */
+  public SortedSet getTermVectorEntrySet()
+  {
+    return currentSet;
+  }
+
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/SortedTermVectorMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native
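
When only term strings and frequencies are needed, a sketch like the following (assuming an IndexReader "reader" over an index with stored term vectors, a valid "docNumber", and a java.util.Iterator import) passes true for the two ignoring flags so the reader skips materializing offsets and positions:

    SortedTermVectorMapper mapper =
        new SortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
    reader.getTermFreqVector(docNumber, mapper);

    for (Iterator it = mapper.getTermVectorEntrySet().iterator(); it.hasNext();) {
      TermVectorEntry entry = (TermVectorEntry) it.next();
      // entry.getField() is SortedTermVectorMapper.ALL; offsets and positions are null here.
      System.out.println(entry.getTerm() + " " + entry.getFrequency());
    }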

Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntry.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntry.java?view=auto&rev=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntry.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntry.java Sun Jul 22 20:17:25 2007
@@ -0,0 +1,98 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2007 The Apache Software Foundation
+ * <p/>
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Convenience class for holding TermVector information.
+ */
+public class TermVectorEntry {
+  private String field;
+  private String term;
+  private int frequency;
+  private TermVectorOffsetInfo [] offsets;
+  int [] positions;
+
+
+  public TermVectorEntry() {
+  }
+
+  public TermVectorEntry(String field, String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+    this.field = field;
+    this.term = term;
+    this.frequency = frequency;
+    this.offsets = offsets;
+    this.positions = positions;
+  }
+
+
+  public String getField() {
+    return field;
+  }
+
+  public int getFrequency() {
+    return frequency;
+  }
+
+  public TermVectorOffsetInfo[] getOffsets() {
+    return offsets;
+  }
+
+  public int[] getPositions() {
+    return positions;
+  }
+
+  public String getTerm() {
+    return term;
+  }
+
+  //Keep package local
+  void setFrequency(int frequency) {
+    this.frequency = frequency;
+  }
+
+  void setOffsets(TermVectorOffsetInfo[] offsets) {
+    this.offsets = offsets;
+  }
+
+  void setPositions(int[] positions) {
+    this.positions = positions;
+  }
+
+
+  public boolean equals(Object o) {
+    if (this == o) return true;
+    if (o == null || getClass() != o.getClass()) return false;
+
+    TermVectorEntry that = (TermVectorEntry) o;
+
+    if (term != null ? !term.equals(that.term) : that.term != null) return false;
+
+    return true;
+  }
+
+  public int hashCode() {
+    return (term != null ? term.hashCode() : 0);
+  }
+
+  public String toString() {
+    return "TermVectorEntry{" +
+            "field='" + field + '\'' +
+            ", term='" + term + '\'' +
+            ", frequency=" + frequency +
+            '}';
+  }
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntry.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java?view=auto&rev=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java Sun Jul 22 20:17:25 2007
@@ -0,0 +1,42 @@
+package org.apache.lucene.index;
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.Comparator;
+
+/**
+ * Compares {@link org.apache.lucene.index.TermVectorEntry}s first by frequency and then by
+ * the term (case-sensitive)
+ *
+ **/
+public class TermVectorEntryFreqSortedComparator implements Comparator {
+  public int compare(Object object, Object object1) {
+    int result = 0;
+    TermVectorEntry entry = (TermVectorEntry) object;
+    TermVectorEntry entry1 = (TermVectorEntry) object1;
+    result = entry1.getFrequency() - entry.getFrequency();
+    if (result == 0)
+    {
+      result = entry.getTerm().compareTo(entry1.getTerm());
+      if (result == 0)
+      {
+        result = entry.getField().compareTo(entry1.getField());
+      }
+    }
+    return result;
+  }
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorMapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorMapper.java?view=auto&rev=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorMapper.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorMapper.java Sun Jul 22 20:17:25 2007
@@ -0,0 +1,88 @@
+package org.apache.lucene.index;
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * The TermVectorMapper can be used to map Term Vectors into your own
+ * structure instead of the parallel array structure used by
+ * {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
+ * <p/>
+ * It is up to the implementation to make sure it is thread-safe.
+ *
+ *
+ **/
+public abstract class TermVectorMapper {
+
+  private boolean ignoringPositions;
+  private boolean ignoringOffsets;
+
+
+  protected TermVectorMapper() {
+  }
+
+  /**
+   *
+   * @param ignoringPositions true if this mapper should tell Lucene to ignore positions even if they are stored
+   * @param ignoringOffsets similar to ignoringPositions
+   */
+  protected TermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets) {
+    this.ignoringPositions = ignoringPositions;
+    this.ignoringOffsets = ignoringOffsets;
+  }
+
+  /**
+   * Tell the mapper what to expect in regards to field, number of terms, offset and position storage.
+   * This method will be called once before retrieving the vector for a field.
+   *
+   * This method will be called before {@link #map(String,int,TermVectorOffsetInfo[],int[])}.
+   * @param field The field the vector is for
+   * @param numTerms The number of terms that need to be mapped
+   * @param storeOffsets true if the mapper should expect offset information
+   * @param storePositions true if the mapper should expect positions info
+   */
+  public abstract void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions);
+  /**
+   * Map the Term Vector information into your own structure
+   * @param term The term to add to the vector
+   * @param frequency The frequency of the term in the document
+   * @param offsets null if the offset is not specified, otherwise the offset into the field of the term
+   * @param positions null if the position is not specified, otherwise the position in the field of the term
+   */
+  public abstract void map(String term, int frequency, TermVectorOffsetInfo [] offsets, int [] positions);
+
+  /**
+   * Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they
+   * can be skipped over.  Derived classes should set this to true if they want to ignore positions.  The default
+   * is false, meaning positions will be loaded if they are stored.
+   * @return true if this mapper should ignore positions; false by default
+   */
+  public boolean isIgnoringPositions()
+  {
+    return ignoringPositions;
+  }
+
+  /**
+   *
+   * Same principle as {@link #isIgnoringPositions()}, but applied to offsets.  false by default.
+   * @return true if this mapper should ignore offsets; false by default
+   */
+  public boolean isIgnoringOffsets()
+  {
+    return ignoringOffsets;
+  }
+
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native
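
A hypothetical subclass, to illustrate what an implementation has to provide; the class name, threshold, and list-based accumulation below are illustrative only:

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.index.TermVectorMapper;
    import org.apache.lucene.index.TermVectorOffsetInfo;

    /** Collects the terms whose in-document frequency meets a threshold. */
    public class FrequentTermMapper extends TermVectorMapper {
      private int minFreq;
      private List terms = new ArrayList();   // accumulated across all mapped fields

      public FrequentTermMapper(int minFreq) {
        super(true, true);                    // ignore positions and offsets entirely
        this.minFreq = minFreq;
      }

      public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
        // Nothing to pre-size here; a real mapper might allocate buffers of length numTerms.
      }

      public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
        if (frequency >= minFreq) {
          terms.add(term);
        }
      }

      public List getTerms() {
        return terms;
      }
    }

Such a mapper would be handed to any of the new overloads, e.g. reader.getTermFreqVector(docNumber, "body", new FrequentTermMapper(2)) for an assumed field "body".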

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java Sun Jul 22 20:17:25 2007
@@ -17,9 +17,9 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.BufferedIndexInput;
 
 import java.io.IOException;
 
@@ -104,18 +104,9 @@
     return size;
   }
 
-  /**
-   * Retrieve the term vector for the given document and field
-   * @param docNum The document number to retrieve the vector for
-   * @param field The field within the document to retrieve
-   * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
-   * @throws IOException if there is an error reading the term vector files
-   */ 
-  TermFreqVector get(int docNum, String field) throws IOException {
-    // Check if no term vectors are available for this segment at all
-    int fieldNumber = fieldInfos.fieldNumber(field);
-    TermFreqVector result = null;
+  public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
     if (tvx != null) {
+      int fieldNumber = fieldInfos.fieldNumber(field);
       //We need to account for the FORMAT_SIZE at when seeking in the tvx
       //We don't need to do this in other seeks because we already have the
       // file pointer
@@ -137,7 +128,7 @@
           number = tvd.readVInt();
         else
           number += tvd.readVInt();
-        
+
         if (number == fieldNumber)
           found = i;
       }
@@ -150,14 +141,30 @@
         for (int i = 0; i <= found; i++)
           position += tvd.readVLong();
 
-        result = readTermVector(field, position);
+        readTermVector(field, position, mapper);
       } else {
         //System.out.println("Fieldable not found");
       }
     } else {
       //System.out.println("No tvx file");
     }
-    return result;
+  }
+
+
+
+  /**
+   * Retrieve the term vector for the given document and field
+   * @param docNum The document number to retrieve the vector for
+   * @param field The field within the document to retrieve
+   * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
+   * @throws IOException if there is an error reading the term vector files
+   */ 
+  TermFreqVector get(int docNum, String field) throws IOException {
+    // Check if no term vectors are available for this segment at all
+    ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
+    get(docNum, field, mapper);
+
+    return mapper.materializeVector();
   }
 
   /**
@@ -169,7 +176,6 @@
    */
   TermFreqVector[] get(int docNum) throws IOException {
     TermFreqVector[] result = null;
-    // Check if no term vectors are available for this segment at all
     if (tvx != null) {
       //We need to offset by
       tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
@@ -182,7 +188,7 @@
       if (fieldCount != 0) {
         int number = 0;
         String[] fields = new String[fieldCount];
-        
+
         for (int i = 0; i < fieldCount; i++) {
           if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
             number = tvd.readVInt();
@@ -208,24 +214,76 @@
     return result;
   }
 
+  public void get(int docNumber, TermVectorMapper mapper) throws IOException {
+    // Check if no term vectors are available for this segment at all
+    if (tvx != null) {
+      //We need to offset by
+      tvx.seek((docNumber * 8L) + TermVectorsWriter.FORMAT_SIZE);
+      long position = tvx.readLong();
+
+      tvd.seek(position);
+      int fieldCount = tvd.readVInt();
+
+      // No fields are vectorized for this document
+      if (fieldCount != 0) {
+        int number = 0;
+        String[] fields = new String[fieldCount];
+
+        for (int i = 0; i < fieldCount; i++) {
+          if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
+            number = tvd.readVInt();
+          else
+            number += tvd.readVInt();
+
+          fields[i] = fieldInfos.fieldName(number);
+        }
+
+        // Compute position in the tvf file
+        position = 0;
+        long[] tvfPointers = new long[fieldCount];
+        for (int i = 0; i < fieldCount; i++) {
+          position += tvd.readVLong();
+          tvfPointers[i] = position;
+        }
+
+        readTermVectors(fields, tvfPointers, mapper);
+      }
+    } else {
+      //System.out.println("No tvx file");
+    }
+  }
+
 
   private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])
           throws IOException {
     SegmentTermVector res[] = new SegmentTermVector[fields.length];
     for (int i = 0; i < fields.length; i++) {
-      res[i] = readTermVector(fields[i], tvfPointers[i]);
+      ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
+       readTermVector(fields[i], tvfPointers[i], mapper);
+      res[i] = (SegmentTermVector) mapper.materializeVector();
     }
     return res;
   }
 
+  private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
+          throws IOException {
+    for (int i = 0; i < fields.length; i++) {
+       readTermVector(fields[i], tvfPointers[i], mapper);
+    }
+
+  }
+
+
   /**
    * 
    * @param field The field to read in
    * @param tvfPointer The pointer within the tvf file where we should start reading
+   * @param mapper The mapper used to map the TermVector
    * @return The TermVector located at that position
    * @throws IOException
+
    */ 
-  private SegmentTermVector readTermVector(String field, long tvfPointer)
+  private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
           throws IOException {
 
     // Now read the data from specified position
@@ -236,7 +294,7 @@
     //System.out.println("Num Terms: " + numTerms);
     // If no terms - return a constant empty termvector. However, this should never occur!
     if (numTerms == 0) 
-      return new SegmentTermVector(field, null, null);
+      return;
     
     boolean storePositions;
     boolean storeOffsets;
@@ -251,18 +309,7 @@
       storePositions = false;
       storeOffsets = false;
     }
-
-    String terms[] = new String[numTerms];
-    int termFreqs[] = new int[numTerms];
-    
-    //  we may not need these, but declare them
-    int positions[][] = null;
-    TermVectorOffsetInfo offsets[][] = null;
-    if(storePositions)
-      positions = new int[numTerms][];
-    if(storeOffsets)
-      offsets = new TermVectorOffsetInfo[numTerms][];
-    
+    mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
     int start = 0;
     int deltaLength = 0;
     int totalLength = 0;
@@ -282,45 +329,54 @@
       }
       
       tvf.readChars(buffer, start, deltaLength);
-      terms[i] = new String(buffer, 0, totalLength);
+      String term = new String(buffer, 0, totalLength);
       previousBuffer = buffer;
       int freq = tvf.readVInt();
-      termFreqs[i] = freq;
-      
+      int [] positions = null;
       if (storePositions) { //read in the positions
-        int [] pos = new int[freq];
-        positions[i] = pos;
-        int prevPosition = 0;
-        for (int j = 0; j < freq; j++)
-        {
-          pos[j] = prevPosition + tvf.readVInt();
-          prevPosition = pos[j];
+        //does the mapper even care about positions?
+        if (mapper.isIgnoringPositions() == false) {
+          positions = new int[freq];
+          int prevPosition = 0;
+          for (int j = 0; j < freq; j++)
+          {
+            positions[j] = prevPosition + tvf.readVInt();
+            prevPosition = positions[j];
+          }
+        } else {
+          //we need to skip over the positions.  Since these are VInts, I don't believe there is any way to know for sure how far to skip
+          //
+          for (int j = 0; j < freq; j++)
+          {
+            tvf.readVInt();
+          }
         }
       }
-      
+      TermVectorOffsetInfo[] offsets = null;
       if (storeOffsets) {
-        TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
-        offsets[i] = offs;
-        int prevOffset = 0;
-        for (int j = 0; j < freq; j++) {
-          int startOffset = prevOffset + tvf.readVInt();
-          int endOffset = startOffset + tvf.readVInt();
-          offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
-          prevOffset = endOffset;
+        //does the mapper even care about offsets?
+        if (mapper.isIgnoringOffsets() == false) {
+          offsets = new TermVectorOffsetInfo[freq];
+          int prevOffset = 0;
+          for (int j = 0; j < freq; j++) {
+            int startOffset = prevOffset + tvf.readVInt();
+            int endOffset = startOffset + tvf.readVInt();
+            offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
+            prevOffset = endOffset;
+          }
+        } else {
+          for (int j = 0; j < freq; j++){
+            tvf.readVInt();
+            tvf.readVInt();
+          }
         }
       }
+      mapper.map(term, freq, offsets, positions);
     }
-    
-    SegmentTermVector tv;
-    if (storePositions || storeOffsets){
-      tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
-    }
-    else {
-      tv = new SegmentTermVector(field, terms, termFreqs);
-    }
-    return tv;
   }
 
+
+
   protected Object clone() {
     
     if (tvx == null || tvd == null || tvf == null)
@@ -337,4 +393,67 @@
     
     return clone;
   }
+
+
+
 }
+
+/**
+ * Models the existing parallel array structure
+ */
+class ParallelArrayTermVectorMapper extends TermVectorMapper
+{
+
+  private int numTerms;
+  private String[] terms;
+  private int[] termFreqs;
+  private int positions[][] = null;
+  private TermVectorOffsetInfo offsets[][] = null;
+  private int currentPosition;
+  private boolean storingOffsets;
+  private boolean storingPositions;
+  private String field;
+
+  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+    this.numTerms = numTerms;
+    this.field = field;
+    terms = new String[numTerms];
+    termFreqs = new int[numTerms];
+    this.storingOffsets = storeOffsets;
+    this.storingPositions = storePositions;
+    if(storePositions)
+      this.positions = new int[numTerms][];
+    if(storeOffsets)
+      this.offsets = new TermVectorOffsetInfo[numTerms][];
+  }
+
+  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+    terms[currentPosition] = term;
+    termFreqs[currentPosition] = frequency;
+    if (storingOffsets)
+    {
+      this.offsets[currentPosition] = offsets;
+    }
+    if (storingPositions)
+    {
+      this.positions[currentPosition] = positions; 
+    }
+    currentPosition++;
+  }
+
+  /**
+   * Construct the vector from the mapped data
+   * @return The {@link TermFreqVector} built from the mapped terms, or null if no field was mapped
+   */
+  public TermFreqVector materializeVector() {
+    SegmentTermVector tv = null;
+    if (field != null && terms != null) {
+      if (storingPositions || storingOffsets) {
+        tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
+      } else {
+        tv = new SegmentTermVector(field, terms, termFreqs);
+      }
+    }
+    return tv;
+  }
+}
\ No newline at end of file

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexReader.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexReader.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexReader.java Sun Jul 22 20:17:25 2007
@@ -21,29 +21,20 @@
 import junit.framework.TestCase;
 import junit.framework.TestSuite;
 import junit.textui.TestRunner;
-
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.store.LockObtainFailedException;
-import org.apache.lucene.store.AlreadyClosedException;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-
-import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.*;
 import org.apache.lucene.util._TestUtil;
 
-import java.util.Collection;
-import java.util.Arrays;
-import java.io.IOException;
-import java.io.FileNotFoundException;
 import java.io.File;
-
-import org.apache.lucene.store.MockRAMDirectory;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.*;
 
 public class TestIndexReader extends TestCase
 {
@@ -180,8 +171,43 @@
         d.close();
     }
 
+  public void testTermVectors() throws Exception {
+    RAMDirectory d = new MockRAMDirectory();
+    // set up writer
+    IndexWriter writer = new IndexWriter(d, new StandardAnalyzer(), true);
+    // want to get some more segments here
+    // new termvector fields
+    for (int i = 0; i < 5 * writer.getMergeFactor(); i++) {
+      Document doc = new Document();
+        doc.add(new Field("tvnot","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO));
+        doc.add(new Field("termvector","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
+        doc.add(new Field("tvoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS));
+        doc.add(new Field("tvposition","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
+        doc.add(new Field("tvpositionoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+
+        writer.addDocument(doc);
+    }
+    writer.close();
+    IndexReader reader = IndexReader.open(d);
+    FieldSortedTermVectorMapper mapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+    reader.getTermFreqVector(0, mapper);
+    Map map = mapper.getFieldToTerms();
+    assertTrue("map is null and it shouldn't be", map != null);
+    assertTrue("map Size: " + map.size() + " is not: " + 4, map.size() == 4);
+    Set set = (Set) map.get("termvector");
+    for (Iterator iterator = set.iterator(); iterator.hasNext();) {
+      TermVectorEntry entry = (TermVectorEntry) iterator.next();
+      assertTrue("entry is null and it shouldn't be", entry != null);
+      System.out.println("Entry: " + entry);
+    }
+
+
+
+
+    
+  }
 
-    private void assertTermDocsCount(String msg,
+  private void assertTermDocsCount(String msg,
                                      IndexReader reader,
                                      Term term,
                                      int expected)

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestTermVectorsReader.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestTermVectorsReader.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestTermVectorsReader.java Sun Jul 22 20:17:25 2007
@@ -22,16 +22,19 @@
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.SortedSet;
 
 public class TestTermVectorsReader extends TestCase {
   private TermVectorsWriter writer = null;
   //Must be lexicographically sorted, will do in setup, versus trying to maintain here
-  private String [] testFields = {"f1", "f2", "f3"};
-  private boolean [] testFieldsStorePos = {true, false, true, false};
-  private boolean [] testFieldsStoreOff = {true, false, false, true};  
-  private String [] testTerms = {"this", "is", "a", "test"};
-  private int [][] positions = new int[testTerms.length][];
-  private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][];
+  private String[] testFields = {"f1", "f2", "f3", "f4"};
+  private boolean[] testFieldsStorePos = {true, false, true, false};
+  private boolean[] testFieldsStoreOff = {true, false, false, true};
+  private String[] testTerms = {"this", "is", "a", "test"};
+  private int[][] positions = new int[testTerms.length][];
+  private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
   private RAMDirectory dir = new RAMDirectory();
   private String seg = "testSegment";
   private FieldInfos fieldInfos = new FieldInfos();
@@ -44,35 +47,37 @@
     for (int i = 0; i < testFields.length; i++) {
       fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
     }
-    
-    for (int i = 0; i < testTerms.length; i++)
-    {
+
+    for (int i = 0; i < testTerms.length; i++) {
       positions[i] = new int[3];
       for (int j = 0; j < positions[i].length; j++) {
        // positions are always sorted in increasing order
-        positions[i][j] = (int)(j * 10 + Math.random() * 10);
+        positions[i][j] = (int) (j * 10 + Math.random() * 10);
       }
       offsets[i] = new TermVectorOffsetInfo[3];
-      for (int j = 0; j < offsets[i].length; j++){
+      for (int j = 0; j < offsets[i].length; j++) {
        // offsets are always sorted in increasing order
         offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
-      }        
+      }
     }
     Arrays.sort(testTerms);
+    //Create 5 documents for testing, they all have the same terms
+    writer = new TermVectorsWriter(dir, seg, fieldInfos);
     for (int j = 0; j < 5; j++) {
-      writer = new TermVectorsWriter(dir, seg, fieldInfos);
+
       writer.openDocument();
 
       for (int k = 0; k < testFields.length; k++) {
         writer.openField(testFields[k]);
         for (int i = 0; i < testTerms.length; i++) {
-          writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);      
+          writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
         }
         writer.closeField();
       }
       writer.closeDocument();
-      writer.close();
+
     }
+    writer.close();
   }
 
   protected void tearDown() {
@@ -80,34 +85,38 @@
   }
 
   public void test() {
-      //Check to see the files were created properly in setup
-      assertTrue(writer.isDocumentOpen() == false);          
-      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
-      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
+    //Check to see the files were created properly in setup
+    assertTrue(writer.isDocumentOpen() == false);
+    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
+    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
   }
-  
+
   public void testReader() throws IOException {
     TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
     assertTrue(reader != null);
-    TermFreqVector vector = reader.get(0, testFields[0]);
-    assertTrue(vector != null);
-    String [] terms = vector.getTerms();
-    assertTrue(terms != null);
-    assertTrue(terms.length == testTerms.length);
-    for (int i = 0; i < terms.length; i++) {
-      String term = terms[i];
-      //System.out.println("Term: " + term);
-      assertTrue(term.equals(testTerms[i]));
+    for (int j = 0; j < 5; j++) {
+      TermFreqVector vector = reader.get(j, testFields[0]);
+      assertTrue(vector != null);
+      String[] terms = vector.getTerms();
+      assertTrue(terms != null);
+      assertTrue(terms.length == testTerms.length);
+      for (int i = 0; i < terms.length; i++) {
+        String term = terms[i];
+        //System.out.println("Term: " + term);
+        assertTrue(term.equals(testTerms[i]));
+      }
     }
-  }  
-  
+
+
+  }
+
   public void testPositionReader() throws IOException {
     TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
     assertTrue(reader != null);
     TermPositionVector vector;
-    String [] terms;
-    vector = (TermPositionVector)reader.get(0, testFields[0]);
-    assertTrue(vector != null);      
+    String[] terms;
+    vector = (TermPositionVector) reader.get(0, testFields[0]);
+    assertTrue(vector != null);
     terms = vector.getTerms();
     assertTrue(terms != null);
     assertTrue(terms.length == testTerms.length);
@@ -115,14 +124,14 @@
       String term = terms[i];
       //System.out.println("Term: " + term);
       assertTrue(term.equals(testTerms[i]));
-      int [] positions = vector.getTermPositions(i);
+      int[] positions = vector.getTermPositions(i);
       assertTrue(positions != null);
       assertTrue(positions.length == this.positions[i].length);
       for (int j = 0; j < positions.length; j++) {
         int position = positions[j];
         assertTrue(position == this.positions[i][j]);
       }
-      TermVectorOffsetInfo [] offset = vector.getOffsets(i);
+      TermVectorOffsetInfo[] offset = vector.getOffsets(i);
       assertTrue(offset != null);
       assertTrue(offset.length == this.offsets[i].length);
       for (int j = 0; j < offset.length; j++) {
@@ -130,9 +139,9 @@
         assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
       }
     }
-    
+
     TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
-    assertTrue(freqVector != null);      
+    assertTrue(freqVector != null);
     assertTrue(freqVector instanceof TermPositionVector == false);
     terms = freqVector.getTerms();
     assertTrue(terms != null);
@@ -140,30 +149,30 @@
     for (int i = 0; i < terms.length; i++) {
       String term = terms[i];
       //System.out.println("Term: " + term);
-      assertTrue(term.equals(testTerms[i]));        
+      assertTrue(term.equals(testTerms[i]));
     }
   }
-  
+
   public void testOffsetReader() throws IOException {
     TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
     assertTrue(reader != null);
-    TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]);
+    TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]);
     assertTrue(vector != null);
-    String [] terms = vector.getTerms();
+    String[] terms = vector.getTerms();
     assertTrue(terms != null);
     assertTrue(terms.length == testTerms.length);
     for (int i = 0; i < terms.length; i++) {
       String term = terms[i];
       //System.out.println("Term: " + term);
       assertTrue(term.equals(testTerms[i]));
-      int [] positions = vector.getTermPositions(i);
+      int[] positions = vector.getTermPositions(i);
       assertTrue(positions != null);
       assertTrue(positions.length == this.positions[i].length);
       for (int j = 0; j < positions.length; j++) {
         int position = positions[j];
         assertTrue(position == this.positions[i][j]);
       }
-      TermVectorOffsetInfo [] offset = vector.getOffsets(i);
+      TermVectorOffsetInfo[] offset = vector.getOffsets(i);
       assertTrue(offset != null);
       assertTrue(offset.length == this.offsets[i].length);
       for (int j = 0; j < offset.length; j++) {
@@ -172,18 +181,112 @@
       }
     }
   }
-  
+
+  public void testMapper() throws IOException {
+    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
+    assertTrue(reader != null);
+    SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+    reader.get(0, mapper);
+    SortedSet set = mapper.getTermVectorEntrySet();
+    assertTrue("set is null and it shouldn't be", set != null);
+    //three fields, 4 terms, all terms are the same
+    assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
+    //Check offsets and positions
+    for (Iterator iterator = set.iterator(); iterator.hasNext();) {
+      TermVectorEntry tve = (TermVectorEntry) iterator.next();
+      assertTrue("tve is null and it shouldn't be", tve != null);
+      assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
+      assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
+
+    }
+
+    mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+    reader.get(1, mapper);
+    set = mapper.getTermVectorEntrySet();
+    assertTrue("set is null and it shouldn't be", set != null);
+    //three fields, 4 terms, all terms are the same
+    assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
+    //Should have offsets and positions b/c we are munging all the fields together
+    for (Iterator iterator = set.iterator(); iterator.hasNext();) {
+      TermVectorEntry tve = (TermVectorEntry) iterator.next();
+      assertTrue("tve is null and it shouldn't be", tve != null);
+      assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
+      assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
+
+    }
+
+
+    FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+    reader.get(0, fsMapper);
+    Map map = fsMapper.getFieldToTerms();
+    assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
+    for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
+      Map.Entry entry = (Map.Entry) iterator.next();
+      SortedSet sortedSet = (SortedSet) entry.getValue();
+      assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
+      for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
+        TermVectorEntry tve = (TermVectorEntry) inner.next();
+        assertTrue("tve is null and it shouldn't be", tve != null);
+        //Check offsets and positions.
+        assertTrue("tve is null and it shouldn't be", tve != null);
+        String field = tve.getField();
+        if (field.equals(testFields[0])) {
+          //should have offsets
+
+          assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
+          assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
+        }
+        else if (field.equals(testFields[1])) {
+          //should not have offsets
+
+          assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
+          assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
+        }
+      }
+    }
+    //Try a mapper that ignores offsets and positions
+    fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
+    reader.get(0, fsMapper);
+    map = fsMapper.getFieldToTerms();
+    assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
+    for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
+      Map.Entry entry = (Map.Entry) iterator.next();
+      SortedSet sortedSet = (SortedSet) entry.getValue();
+      assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
+      for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
+        TermVectorEntry tve = (TermVectorEntry) inner.next();
+        assertTrue("tve is null and it shouldn't be", tve != null);
+        //Check offsets and positions.
+        assertTrue("tve is null and it shouldn't be", tve != null);
+        String field = tve.getField();
+        if (field.equals(testFields[0])) {
+          //offsets and positions are ignored by this mapper, so they should be null even for this field
+
+          assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
+          assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
+        }
+        else if (field.equals(testFields[1])) {
+          //should not have offsets
+
+          assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
+          assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
+        }
+      }
+    }
+
+  }
+
 
   /**
    * Make sure exceptions and bad params are handled appropriately
-   */ 
+   */
   public void testBadParams() {
     try {
       TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
       assertTrue(reader != null);
       //Bad document number, good field number
       reader.get(50, testFields[0]);
-      fail();      
+      fail();
     } catch (IOException e) {
       // expected exception
     }
@@ -192,7 +295,7 @@
       assertTrue(reader != null);
       //Bad document number, no field
       reader.get(50);
-      fail();      
+      fail();
     } catch (IOException e) {
       // expected exception
     }
@@ -201,9 +304,9 @@
       assertTrue(reader != null);
       //good document number, bad field number
       TermFreqVector vector = reader.get(0, "f50");
-      assertTrue(vector == null);      
+      assertTrue(vector == null);
     } catch (IOException e) {
       fail();
     }
-  }    
+  }
 }
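
For illustration only (not part of this commit): a minimal sketch of how an application might drive the new TermVectorMapper callback exercised by the testMapper test above. The index path is hypothetical; the classes and methods used (SortedTermVectorMapper, TermVectorEntryFreqSortedComparator, TermVectorEntry, IndexReader.getTermFreqVector(int, TermVectorMapper)) are the ones added or exercised by this change.

    import java.util.Iterator;
    import java.util.SortedSet;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.SortedTermVectorMapper;
    import org.apache.lucene.index.TermVectorEntry;
    import org.apache.lucene.index.TermVectorEntryFreqSortedComparator;

    public class TermVectorMapperUsage {
      public static void main(String[] args) throws Exception {
        // Hypothetical index location; assumes the documents were indexed with term vectors enabled.
        IndexReader reader = IndexReader.open("/path/to/index");

        // Collapse all fields of document 0 into a single set of entries, sorted by
        // frequency via the supplied comparator, instead of unpacking TermFreqVectors by hand.
        SortedTermVectorMapper mapper =
            new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
        reader.getTermFreqVector(0, mapper);

        SortedSet entries = mapper.getTermVectorEntrySet();
        for (Iterator it = entries.iterator(); it.hasNext();) {
          TermVectorEntry entry = (TermVectorEntry) it.next();
          System.out.println(entry.getTerm() + " freq=" + entry.getFrequency());
        }
        reader.close();
      }
    }

A FieldSortedTermVectorMapper can be passed in the same way when per-field results are wanted; its getFieldToTerms() returns a Map of field name to SortedSet of TermVectorEntry, as the new tests show.
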

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java?view=diff&rev=558592&r1=558591&r2=558592
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java Sun Jul 22 20:17:25 2007
@@ -28,7 +28,9 @@
 
 import java.io.IOException;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
+import java.util.SortedSet;
 
 public class TestTermVectors extends TestCase {
   private IndexSearcher searcher;
@@ -171,7 +173,7 @@
       assertTrue(false);
     }
   }
-  
+
   public void testKnownSetOfDocuments() {
     String test1 = "eating chocolate in a computer lab"; //6 terms
     String test2 = "computer in a computer lab"; //5 terms
@@ -275,20 +277,45 @@
         Integer freqInt = (Integer)test4Map.get(term);
         assertTrue(freqInt != null);
         assertTrue(freqInt.intValue() == freq);        
-      } 
+      }
+      SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+      knownSearcher.reader.getTermFreqVector(hits.id(1), mapper);
+      SortedSet vectorEntrySet = mapper.getTermVectorEntrySet();
+      assertTrue("mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10);
+      TermVectorEntry last = null;
+      for (Iterator iterator = vectorEntrySet.iterator(); iterator.hasNext();) {
+        TermVectorEntry tve = (TermVectorEntry) iterator.next();
+        if (tve != null && last != null)
+        {
+          assertTrue("terms are not properly sorted", last.getFrequency() >= tve.getFrequency());
+          Integer expectedFreq = (Integer) test4Map.get(tve.getTerm());
+          //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
+          assertTrue("Frequency is not correct:", tve.getFrequency() == 2*expectedFreq.intValue());
+        }
+        last = tve;
+
+      }
+
+      FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+      knownSearcher.reader.getTermFreqVector(hits.id(1), fieldMapper);
+      Map map = fieldMapper.getFieldToTerms();
+      assertTrue("map Size: " + map.size() + " is not: " + 2, map.size() == 2);
+      vectorEntrySet = (SortedSet) map.get("field");
+      assertTrue("vectorEntrySet is null and it shouldn't be", vectorEntrySet != null);
+      assertTrue("vectorEntrySet Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10);
       knownSearcher.close();
     } catch (IOException e) {
       e.printStackTrace();
       assertTrue(false);
     }
-
-
   } 
   
   private void setupDoc(Document doc, String text)
   {
     doc.add(new Field("field", text, Field.Store.YES,
         Field.Index.TOKENIZED, Field.TermVector.YES));
+    doc.add(new Field("field2", text, Field.Store.YES,
+        Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
     //System.out.println("Document: " + doc);
   }