Posted to commits@lucene.apache.org by to...@apache.org on 2022/06/30 13:51:51 UTC

[lucene-jira-archive] branch main updated: add attachments

This is an automated email from the ASF dual-hosted git repository.

tomoko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene-jira-archive.git


The following commit(s) were added to refs/heads/main by this push:
     new 54f19fd  add attachments
54f19fd is described below

commit 54f19fdec1103ea362301624c2bef42211e003a8
Author: Tomoko Uchida <to...@gmail.com>
AuthorDate: Thu Jun 30 22:44:29 2022 +0900

    add attachments
---
 .../Screen Shot 2022-06-29 at 11.02.35 AM.png      |  Bin 0 -> 227026 bytes
 .../LUCENE-10557/image-2022-06-29-13-36-57-365.png |  Bin 0 -> 151562 bytes
 attachments/LUCENE-10557/screenshot-1.png          |  Bin 0 -> 163280 bytes
 .../TokenSelectorAllWithParallelWriter.patch       | 1424 ++++++++++++++++++++
 attachments/LUCENE-602/TokenSelectorSoloAll.patch  |  374 +++++
 5 files changed, 1798 insertions(+)

diff --git a/attachments/LUCENE-10557/Screen Shot 2022-06-29 at 11.02.35 AM.png b/attachments/LUCENE-10557/Screen Shot 2022-06-29 at 11.02.35 AM.png
new file mode 100644
index 0000000..63ac13d
Binary files /dev/null and b/attachments/LUCENE-10557/Screen Shot 2022-06-29 at 11.02.35 AM.png differ
diff --git a/attachments/LUCENE-10557/image-2022-06-29-13-36-57-365.png b/attachments/LUCENE-10557/image-2022-06-29-13-36-57-365.png
new file mode 100644
index 0000000..b9b9df0
Binary files /dev/null and b/attachments/LUCENE-10557/image-2022-06-29-13-36-57-365.png differ
diff --git a/attachments/LUCENE-10557/screenshot-1.png b/attachments/LUCENE-10557/screenshot-1.png
new file mode 100644
index 0000000..d319cca
Binary files /dev/null and b/attachments/LUCENE-10557/screenshot-1.png differ
diff --git a/attachments/LUCENE-602/TokenSelectorAllWithParallelWriter.patch b/attachments/LUCENE-602/TokenSelectorAllWithParallelWriter.patch
new file mode 100644
index 0000000..ffa3a22
--- /dev/null
+++ b/attachments/LUCENE-602/TokenSelectorAllWithParallelWriter.patch
@@ -0,0 +1,1424 @@
+Index: common-build.xml
+===================================================================
+--- common-build.xml	(revision 414705)
++++ common-build.xml	(working copy)
+@@ -28,8 +28,8 @@
+ 
+   <property name="javac.deprecation" value="off"/>
+   <property name="javac.debug" value="on"/>
+-  <property name="javac.source" value="1.4"/>
+-  <property name="javac.target" value="1.4"/>
++  <property name="javac.source" value="1.5"/>
++  <property name="javac.target" value="1.5"/>
+ 
+   <property name="project.name" value="site"/> <!-- todo: is this used by anakia or something else? -->
+   <property name="build.encoding" value="utf-8"/>
+Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
+===================================================================
+--- src/test/org/apache/lucene/index/TestDocumentWriter.java	(revision 414705)
++++ src/test/org/apache/lucene/index/TestDocumentWriter.java	(working copy)
+@@ -16,11 +16,15 @@
+  * limitations under the License.
+  */
+ 
++import java.util.LinkedList;
++import java.util.List;
+ import junit.framework.TestCase;
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.Token;
+ import org.apache.lucene.analysis.WhitespaceAnalyzer;
+ import org.apache.lucene.analysis.TokenStream;
+ import org.apache.lucene.analysis.WhitespaceTokenizer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.document.*;
+ import org.apache.lucene.search.Similarity;
+ import org.apache.lucene.store.RAMDirectory;
+@@ -54,6 +58,16 @@
+     Analyzer analyzer = new WhitespaceAnalyzer();
+     Similarity similarity = Similarity.getDefault();
+     DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
++    writer.setTermVectorTokenSelector(new TokenSelector(){
++      public boolean accept(String field, Token t) {
++        return Character.isLowerCase(t.termText().charAt(0));
++      }
++    });
++    writer.setPositionsTokenSelector(new TokenSelector(){
++      public boolean accept(String field, Token t) {
++        return Character.isLowerCase(t.termText().charAt(0));
++      }
++    });
+     String segName = "test";
+     writer.addDocument(segName, testDoc);
+     //After adding the document, we should be able to read it back in
+@@ -84,6 +98,31 @@
+     fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
+     assertTrue(fields != null && fields.length == 1);
+     assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
++    
++    fields = doc.getFields(DocHelper.TEXT_FIELD_UTF2_KEY);
++    assertTrue(fields != null && fields.length == 1);
++    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_UTF2_TEXT));
++    assertTrue(fields[0].isTermVectorStored());
++    TermFreqVector tv = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_UTF2_KEY);
++    assertTrue(tv != null);
++    String[] words = DocHelper.FIELD_UTF2_TEXT.split("\\s+");
++    String[] tvwords = tv.getTerms();
++    List uniques = new LinkedList();
++    int omitted = 0;
++    for (int i=0; i<words.length; i++)
++      if (!uniques.contains(words[i])) {
++        uniques.add(words[i]);
++        if (!Character.isLowerCase(words[i].charAt(0)))
++          omitted++;
++      }
++    assertTrue(omitted!=0);
++    assertTrue(omitted!=uniques.size());
++    assertEquals(uniques.size()-omitted, tvwords.length);
++    for (int i=0; i<uniques.size(); i++) {
++      for (int j=0; j<tvwords.length; j++)
++        if (uniques.get(i).equals(tvwords[j]))
++          assertTrue(Character.isLowerCase(((String)uniques.get(i)).charAt(0)));
++    }      
+ 
+     // test that the norm file is not present if omitNorms is true
+     for (int i = 0; i < reader.fieldInfos.size(); i++) {
+Index: src/test/org/apache/lucene/index/TestParallelWriter.java
+===================================================================
+--- src/test/org/apache/lucene/index/TestParallelWriter.java	(revision 0)
++++ src/test/org/apache/lucene/index/TestParallelWriter.java	(revision 0)
+@@ -0,0 +1,151 @@
++/*
++ * TestParallelWriter.java
++ * JUnit based test
++ *
++ * Created on April 30, 2006, 12:34 PM
++ */
++
++package org.apache.lucene.index;
++
++import java.util.Arrays;
++import junit.framework.*;
++import java.io.IOException;
++import java.io.PrintStream;
++import java.util.ArrayList;
++import java.util.Enumeration;
++import java.util.HashMap;
++import java.util.HashSet;
++import java.util.List;
++import java.util.Map;
++import java.util.Set;
++import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.standard.StandardAnalyzer;
++import org.apache.lucene.document.Document;
++import org.apache.lucene.document.Field;
++import org.apache.lucene.search.Hits;
++import org.apache.lucene.search.IndexSearcher;
++import org.apache.lucene.search.Similarity;
++import org.apache.lucene.search.TermQuery;
++import org.apache.lucene.store.Directory;
++import org.apache.lucene.store.RAMDirectory;
++
++/**
++ *
++ * @author Chuck Williams
++ */
++public class TestParallelWriter extends TestCase {
++    
++    ParallelWriter writer;
++    Directory[] directories;
++    Map<Directory, List<String>> fieldDirectories = new HashMap<Directory, List<String>>();
++    ParallelReader reader;
++    IndexSearcher searcher;
++    
++    public TestParallelWriter(String testName) {
++        super(testName);
++    }
++
++    protected void setUp() throws Exception {
++        directories = new Directory[] { new RAMDirectory(), new RAMDirectory(), new RAMDirectory() };
++        fieldDirectories.put(directories[0], Arrays.asList("title", "body"));
++        fieldDirectories.put(directories[1], Arrays.asList("markup"));
++        fieldDirectories.put(directories[2], Arrays.asList("meta"));
++        
++        openWriter(true);
++        
++        Document doc1 = new Document();
++        doc1.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED));
++        doc1.add(new Field("body", "The quick brown fox jumped over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
++        doc1.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED));
++        writer.addDocument(doc1);
++        
++        Document doc2 = new Document();
++        doc2.add(new Field("title", "Galaxies", Field.Store.YES, Field.Index.TOKENIZED));
++        doc2.add(new Field("body", "Once upon a time in a galaxy far far away", Field.Store.NO, Field.Index.TOKENIZED));
++        doc2.add(new Field("meta", "Space", Field.Store.YES, Field.Index.UN_TOKENIZED));
++        writer.addDocument(doc2);
++        
++        closeWriter();
++        
++        openWriter(false);
++        openReader();
++    }
++    
++    private void openWriter(boolean create) throws IOException {
++        writer = new ParallelWriter(fieldDirectories, new StandardAnalyzer(), create);
++    }
++    
++    private void closeWriter() throws IOException {
++        writer.close();
++    }
++    
++    private void openReader() throws IOException {
++        reader = new ParallelReader();
++        for (Directory dir : directories)
++            reader.add(IndexReader.open(dir));
++        searcher = new IndexSearcher(reader);
++    }
++    
++    private void closeReader() throws IOException {
++        searcher.close();
++        reader.close();
++    }
++
++    protected void tearDown() throws Exception {
++        writer.close();
++        reader.close();
++        for (Directory dir : directories)
++            dir.close();
++    }
++
++    public static Test suite() {
++        TestSuite suite = new TestSuite(TestParallelWriter.class);
++        
++        return suite;
++    }
++
++    /**
++     * Test of addDocument method, of class org.apache.lucene.index.ParallelWriter.
++     */
++    public void test() throws Exception {
++        System.out.println("Test ParallelWriter");
++        
++        assertEquals(2, writer.docCount());
++        assertEquals(2, reader.numDocs());
++        
++        Hits hits = searcher.search(new TermQuery(new Term("title", "foxes")));
++        assertEquals(1, hits.length());
++        Document doc = hits.doc(0);
++        assertEquals("Animals", doc.get("meta"));
++        
++        hits = searcher.search(new TermQuery(new Term("body", "galaxy")));
++        assertEquals(1, hits.length());
++        doc = hits.doc(0);
++        assertEquals("Galaxies", doc.get("title"));
++        assertEquals("Space", doc.get("meta"));
++        
++        closeWriter();
++        reader.deleteDocuments(new Term("title", "foxes"));
++        closeReader();
++        
++        openWriter(false);
++        doc = new Document();
++        doc.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED));
++        doc.add(new Field("body", "The quick brown fox jumped over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
++        doc.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED));
++        doc.add(new Field("markup", "Interesting", Field.Store.YES, Field.Index.UN_TOKENIZED));
++        writer.addDocument(doc);
++        
++        closeWriter();
++        openWriter(false);
++        openReader();
++        
++        hits = searcher.search(new TermQuery(new Term("markup", "Interesting")));
++        assertEquals(1, hits.length());
++        doc = hits.doc(0);
++        assertEquals("Animals", doc.get("meta"));
++        assertEquals("Foxes", doc.get("title"));
++        assertEquals("Interesting", doc.get("markup"));
++    }
++
++}
+Index: src/java/org/apache/lucene/analysis/TokenSelector.java
+===================================================================
+--- src/java/org/apache/lucene/analysis/TokenSelector.java	(revision 0)
++++ src/java/org/apache/lucene/analysis/TokenSelector.java	(revision 0)
+@@ -0,0 +1,24 @@
++/*
++ * TokenSelector.java
++ *
++ * Created on June 13, 2006, 12:18 PM
++ *
++ */
++
++package org.apache.lucene.analysis;
++
++/**
++ * An interface for selecting a subset of a token stream
++ *
++ * @author Chuck Williams
++ */
++public interface TokenSelector {
++    
++  /** Determine if a token should be selected
++   * @param fieldName field in which token was found
++   * @param token a token
++   * @return true iff token should be selected
++   */
++  public boolean accept(String fieldName, Token token);
++    
++}
+Index: src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java
+===================================================================
+--- src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java	(revision 0)
++++ src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java	(revision 0)
+@@ -0,0 +1,44 @@
++/*
++ * PerFieldTokenSelectorWrapper.java
++ *
++ * Created on June 13, 2006, 4:09 PM
++ *
++ */
++
++package org.apache.lucene.analysis;
++
++import java.util.HashMap;
++import java.util.Map;
++
++/**
++ * Expert: TokenSelector that implements a mapping from field names to TokenSelectors
++ *
++ * @author Chuck Williams
++ */
++public class PerFieldTokenSelectorWrapper implements TokenSelector {
++  
++  private Map selectors = new HashMap();
++  private TokenSelector defaultSelector;
++  
++  /** Expert: create a PerFieldTokenSelectorWrapper with the given default selector (null means select all) */
++  public PerFieldTokenSelectorWrapper(TokenSelector defaultSelector) {
++    this.defaultSelector = defaultSelector;
++  }
++  
++  /** Add a token selector for the named field */
++  public void addSelector(String fieldName, TokenSelector selector) {
++    selectors.put(fieldName, selector);
++  }
++  
++  /** Determine if token is accepted by fieldName */
++  public boolean accept(String fieldName, Token token) {
++    TokenSelector selector = (TokenSelector) selectors.get(fieldName);
++    if (selector!=null)
++        return selector.accept(fieldName, token);
++    else if (defaultSelector!=null)
++        return defaultSelector.accept(fieldName, token);
++    else
++        return true;
++  }
++    
++}
+\ No newline at end of file
+Index: src/java/org/apache/lucene/index/Writable.java.orig
+===================================================================
+--- src/java/org/apache/lucene/index/Writable.java.orig	(revision 0)
++++ src/java/org/apache/lucene/index/Writable.java.orig	(revision 0)
+@@ -0,0 +1,248 @@
++/*
++ * Writable.java
++ *
++ * Created on April 28, 2006, 6:10 PM
++ *
++ */
++
++package org.apache.lucene.index;
++
++import java.io.IOException;
++import java.io.PrintStream;
++import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
++import org.apache.lucene.document.Document;
++import org.apache.lucene.search.Similarity;
++import org.apache.lucene.store.Directory;
++
++/**
++ * An interface that abstracts index writers (e.g., IndexWriter, ParallelWriter)
++ *
++ * @author Chuck Williams
++ */
++public interface Writable {
++    
++    /**
++     * Adds a document to this index.  If the document contains more than
++     * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
++     * discarded.
++     */
++    public void addDocument(Document doc) throws IOException;
++
++    /**
++     * Adds a document to this index, using the provided analyzer instead of the
++     * value of {@link #getAnalyzer()}.  If the document contains more than
++     * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
++     * discarded.
++     */
++    public void addDocument(Document doc, Analyzer analyzer) throws IOException;
++
++    /**
++     * Returns the number of documents currently in this index.
++     */
++    public int docCount();
++
++    /**
++     * Merges all segments together into a single segment, optimizing an index
++     *      for search.
++     */
++    public void optimize() throws IOException;
++    
++    /**
++     * Flushes all changes to an index and closes all associated files.
++     */
++    public void close() throws IOException;
++
++    /**
++     * Returns the analyzer used by this index.
++     */
++    public Analyzer getAnalyzer();
++
++    
++    /**
++     * Setting to turn on usage of a compound file. When on, multiple files
++     *  for each segment are merged into a single file once the segment creation
++     *  is finished. This is done regardless of what directory is in use.
++     */
++    public void setUseCompoundFile(boolean value);
++
++    /**
++     * Get the current setting of whether to use the compound file format.
++     *  Note that this just returns the value you set with setUseCompoundFile(boolean)
++     *  or the default. You cannot use this to query the status of an existing index.
++     * 
++     * @see #setUseCompoundFile(boolean)
++     */
++    public boolean getUseCompoundFile();
++
++    /**
++     * Expert: Set the Similarity implementation used by this IndexWriter.
++     * 
++     * @see Similarity#setDefault(Similarity)
++     */
++    public void setSimilarity(Similarity similarity);
++
++    /**
++     * Expert: Return the Similarity implementation used by this IndexWriter.
++     * 
++     * <p>This defaults to the current value of {@link Similarity#getDefault()}.
++     */
++    public Similarity getSimilarity();
++
++    /**
++     * Expert: Set the interval between indexed terms.  Large values cause less
++     * memory to be used by IndexReader, but slow random-access to terms.  Small
++     * values cause more memory to be used by an IndexReader, and speed
++     * random-access to terms.
++     * 
++     * This parameter determines the amount of computation required per query
++     * term, regardless of the number of documents that contain that term.  In
++     * particular, it is the maximum number of other terms that must be
++     * scanned before a term is located and its frequency and position information
++     * may be processed.  In a large index with user-entered query terms, query
++     * processing time is likely to be dominated not by term lookup but rather
++     * by the processing of frequency and positional data.  In a small index
++     * or when many uncommon query terms are generated (e.g., by wildcard
++     * queries) term lookup may become a dominant cost.
++     * 
++     * In particular, <code>numUniqueTerms/interval</code> terms are read into
++     * memory by an IndexReader, and, on average, <code>interval/2</code> terms
++     * must be scanned for each random term access.
++     * 
++     * @see #DEFAULT_TERM_INDEX_INTERVAL
++     */
++    public void setTermIndexInterval(int interval);
++
++    /**
++     * Expert: Return the interval between indexed terms.
++     * 
++     * @see #setTermIndexInterval(int)
++     */
++    public int getTermIndexInterval();
++
++    /**
++     * Determines the minimal number of documents required before the buffered
++     * in-memory documents are merged and a new Segment is created.
++     * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
++     * large value gives faster indexing.  At the same time, mergeFactor limits
++     * the number of files open in a FSDirectory.
++     * 
++     * <p> The default value is 10.
++     * 
++     * 
++     * @throws IllegalArgumentException if maxBufferedDocs is smaller than 2
++     */
++    public void setMaxBufferedDocs(int maxBufferedDocs);
++
++    /**
++     * 
++     * 
++     * @see #setMaxBufferedDocs
++     */
++    public int getMaxBufferedDocs();
++
++    /**
++     * The maximum number of terms that will be indexed for a single field in a
++     * document.  This limits the amount of memory required for indexing, so that
++     * collections with very large files will not crash the indexing process by
++     * running out of memory.<p/>
++     * Note that this effectively truncates large documents, excluding from the
++     * index terms that occur further in the document.  If you know your source
++     * documents are large, be sure to set this value high enough to accommodate
++     * the expected size.  If you set it to Integer.MAX_VALUE, then the only limit
++     * is your memory, but you should anticipate an OutOfMemoryError.<p/>
++     * By default, no more than 10,000 terms will be indexed for a field.
++     */
++    public void setMaxFieldLength(int maxFieldLength);
++
++    /**
++     * 
++     * 
++     * @see #setMaxFieldLength
++     */
++    public int getMaxFieldLength();
++
++    /**
++     * Determines the largest number of documents ever merged by addDocument().
++     * Small values (e.g., less than 10,000) are best for interactive indexing,
++     * as this limits the length of pauses while indexing to a few seconds.
++     * Larger values are best for batched indexing and speedier searches.
++     * 
++     * <p>The default value is {@link Integer#MAX_VALUE}.
++     */
++    public void setMaxMergeDocs(int maxMergeDocs);
++
++    /**
++     * 
++     * 
++     * @see #setMaxMergeDocs
++     */
++    public int getMaxMergeDocs();
++
++    /**
++     * Determines how often segment indices are merged by addDocument().  With
++     * smaller values, less RAM is used while indexing, and searches on
++     * unoptimized indices are faster, but indexing speed is slower.  With larger
++     * values, more RAM is used during indexing, and while searches on unoptimized
++     * indices are slower, indexing is faster.  Thus larger values (> 10) are best
++     * for batch index creation, and smaller values (< 10) for indices that are
++     * interactively maintained.
++     * 
++     * <p>This must never be less than 2.  The default value is 10.
++     */
++    public void setMergeFactor(int mergeFactor);
++
++    /**
++     * 
++     * 
++     * @see #setMergeFactor
++     */
++    public int getMergeFactor();
++
++    /**
++     * Sets the maximum time to wait for a write lock (in milliseconds).
++     */
++    public void setWriteLockTimeout(long writeLockTimeout);
++
++    /**
++     * 
++     * 
++     * @see #setWriteLockTimeout
++     */
++    public long getWriteLockTimeout();
++
++    /**
++     * Sets the maximum time to wait for a commit lock (in milliseconds).
++     */
++    public void setCommitLockTimeout(long commitLockTimeout);
++
++    /**
++     * 
++     * 
++     * @see #setCommitLockTimeout
++     */
++    public long getCommitLockTimeout();
++    
++    /** Expert:  Set the TokenSelector used to determine subset of tokens stored in term vectors.
++     * @param selector the term vector TokenSelector
++     */
++    public void setTermVectorTokenSelector(TokenSelector selector);
++    
++    /** Expert: Get the TokenSelector used to determine subset of tokens stored in term vectors.
++     * @return the TokenSelector used to determine term vector tokens
++     */
++    public TokenSelector getTermVectorTokenSelector();
++    
++    /** If non-null, information about merges and a message when
++     * maxFieldLength is reached will be printed to this.
++     */
++    public void setInfoStream(PrintStream infoStream);
++
++    /**
++     * 
++     * 
++     * @see #setInfoStream
++     */
++    public PrintStream getInfoStream();
++
++}
+Index: src/java/org/apache/lucene/index/IndexWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/IndexWriter.java	(revision 414705)
++++ src/java/org/apache/lucene/index/IndexWriter.java	(working copy)
+@@ -17,6 +17,7 @@
+  */
+ 
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.document.Document;
+ import org.apache.lucene.search.Similarity;
+ import org.apache.lucene.store.Directory;
+@@ -56,7 +57,7 @@
+   @see IndexModifier IndexModifier supports the important methods of IndexWriter plus deletion
+   */
+ 
+-public class IndexWriter {
++public class IndexWriter implements Writable {
+ 
+   /**
+    * Default value for the write lock timeout (1,000).
+@@ -100,8 +101,10 @@
+    */
+   public final static int DEFAULT_TERM_INDEX_INTERVAL = 128;
+   
+-  private Directory directory;  // where this index resides
+-  private Analyzer analyzer;    // how to analyze text
++  private Directory directory;                      // where this index resides
++  private Analyzer analyzer;                        // how to analyze text
++  private TokenSelector termVectorTokenSelector;    // subset of token stream stored in term vectors
++  private TokenSelector positionsTokenSelector;     // subset of token stream for which positions are stored
+ 
+   private Similarity similarity = Similarity.getDefault(); // how to normalize
+ 
+@@ -153,6 +156,38 @@
+     return this.similarity;
+   }
+ 
++  /** Expert:  Set the TokenSelector used to determine subset of tokens stored in term vectors.
++   * @param selector the term vector TokenSelector
++   */
++  public void setTermVectorTokenSelector(TokenSelector selector) {
++    this.termVectorTokenSelector = selector;
++  }
++  
++  /** Expert: Get the TokenSelector used to determine subset of tokens stored in term vectors.
++   * @return the TokenSelector used to determine term vector tokens
++   */
++  public TokenSelector getTermVectorTokenSelector() {
++    return termVectorTokenSelector;
++  }
++
++  /** Expert:  Set the TokenSelector used to determine subset of tokens for which positions are stored.
++   *           (At least one position is always stored for each term in each doc to ensure the term stays in
++   *            the index so long as any docs reference it)
++   * @param selector the positions TokenSelector
++   */
++  public void setPositionsTokenSelector(TokenSelector selector) {
++    this.positionsTokenSelector = selector;
++  }
++  
++  /** Expert: Get the TokenSelector used to determine subset of tokens for which freq and positions are stored.
++   *          (At least one position is always stored for each term in each doc to ensure the term stays in
++   *           the index so long as any docs reference it)
++   * @return the positions TokenSelector
++   */
++  public TokenSelector getPositionsTokenSelector() {
++    return positionsTokenSelector;
++  }
++
+   /** Expert: Set the interval between indexed terms.  Large values cause less
+    * memory to be used by IndexReader, but slow random-access to terms.  Small
+    * values cause more memory to be used by an IndexReader, and speed
+@@ -471,6 +506,8 @@
+   public void addDocument(Document doc, Analyzer analyzer) throws IOException {
+     DocumentWriter dw =
+       new DocumentWriter(ramDirectory, analyzer, this);
++    dw.setTermVectorTokenSelector(termVectorTokenSelector);
++    dw.setPositionsTokenSelector(positionsTokenSelector);
+     dw.setInfoStream(infoStream);
+     String segmentName = newSegmentName();
+     dw.addDocument(segmentName, doc);
+Index: src/java/org/apache/lucene/index/Writable.java
+===================================================================
+--- src/java/org/apache/lucene/index/Writable.java	(revision 0)
++++ src/java/org/apache/lucene/index/Writable.java	(revision 0)
+@@ -0,0 +1,262 @@
++/*
++ * Writable.java
++ *
++ * Created on April 28, 2006, 6:10 PM
++ *
++ */
++
++package org.apache.lucene.index;
++
++import java.io.IOException;
++import java.io.PrintStream;
++import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
++import org.apache.lucene.document.Document;
++import org.apache.lucene.search.Similarity;
++import org.apache.lucene.store.Directory;
++
++/**
++ * An interface that abstracts index writers (e.g., IndexWriter, ParallelWriter)
++ *
++ * @author Chuck Williams
++ */
++public interface Writable {
++    
++    /**
++     * Adds a document to this index.  If the document contains more than
++     * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
++     * discarded.
++     */
++    public void addDocument(Document doc) throws IOException;
++
++    /**
++     * Adds a document to this index, using the provided analyzer instead of the
++     * value of {@link #getAnalyzer()}.  If the document contains more than
++     * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
++     * discarded.
++     */
++    public void addDocument(Document doc, Analyzer analyzer) throws IOException;
++
++    /**
++     * Returns the number of documents currently in this index.
++     */
++    public int docCount();
++
++    /**
++     * Merges all segments together into a single segment, optimizing an index
++     *      for search.
++     */
++    public void optimize() throws IOException;
++    
++    /**
++     * Flushes all changes to an index and closes all associated files.
++     */
++    public void close() throws IOException;
++
++    /**
++     * Returns the analyzer used by this index.
++     */
++    public Analyzer getAnalyzer();
++
++    
++    /**
++     * Setting to turn on usage of a compound file. When on, multiple files
++     *  for each segment are merged into a single file once the segment creation
++     *  is finished. This is done regardless of what directory is in use.
++     */
++    public void setUseCompoundFile(boolean value);
++
++    /**
++     * Get the current setting of whether to use the compound file format.
++     *  Note that this just returns the value you set with setUseCompoundFile(boolean)
++     *  or the default. You cannot use this to query the status of an existing index.
++     * 
++     * @see #setUseCompoundFile(boolean)
++     */
++    public boolean getUseCompoundFile();
++
++    /**
++     * Expert: Set the Similarity implementation used by this IndexWriter.
++     * 
++     * @see Similarity#setDefault(Similarity)
++     */
++    public void setSimilarity(Similarity similarity);
++
++    /**
++     * Expert: Return the Similarity implementation used by this IndexWriter.
++     * 
++     * <p>This defaults to the current value of {@link Similarity#getDefault()}.
++     */
++    public Similarity getSimilarity();
++
++    /**
++     * Expert: Set the interval between indexed terms.  Large values cause less
++     * memory to be used by IndexReader, but slow random-access to terms.  Small
++     * values cause more memory to be used by an IndexReader, and speed
++     * random-access to terms.
++     * 
++     * This parameter determines the amount of computation required per query
++     * term, regardless of the number of documents that contain that term.  In
++     * particular, it is the maximum number of other terms that must be
++     * scanned before a term is located and its frequency and position information
++     * may be processed.  In a large index with user-entered query terms, query
++     * processing time is likely to be dominated not by term lookup but rather
++     * by the processing of frequency and positional data.  In a small index
++     * or when many uncommon query terms are generated (e.g., by wildcard
++     * queries) term lookup may become a dominant cost.
++     * 
++     * In particular, <code>numUniqueTerms/interval</code> terms are read into
++     * memory by an IndexReader, and, on average, <code>interval/2</code> terms
++     * must be scanned for each random term access.
++     * 
++     * @see #DEFAULT_TERM_INDEX_INTERVAL
++     */
++    public void setTermIndexInterval(int interval);
++
++    /**
++     * Expert: Return the interval between indexed terms.
++     * 
++     * @see #setTermIndexInterval(int)
++     */
++    public int getTermIndexInterval();
++
++    /**
++     * Determines the minimal number of documents required before the buffered
++     * in-memory documents are merged and a new Segment is created.
++     * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
++     * large value gives faster indexing.  At the same time, mergeFactor limits
++     * the number of files open in a FSDirectory.
++     * 
++     * <p> The default value is 10.
++     * 
++     * 
++     * @throws IllegalArgumentException if maxBufferedDocs is smaller than 2
++     */
++    public void setMaxBufferedDocs(int maxBufferedDocs);
++
++    /**
++     * 
++     * 
++     * @see #setMaxBufferedDocs
++     */
++    public int getMaxBufferedDocs();
++
++    /**
++     * The maximum number of terms that will be indexed for a single field in a
++     * document.  This limits the amount of memory required for indexing, so that
++     * collections with very large files will not crash the indexing process by
++     * running out of memory.<p/>
++     * Note that this effectively truncates large documents, excluding from the
++     * index terms that occur further in the document.  If you know your source
++     * documents are large, be sure to set this value high enough to accommodate
++     * the expected size.  If you set it to Integer.MAX_VALUE, then the only limit
++     * is your memory, but you should anticipate an OutOfMemoryError.<p/>
++     * By default, no more than 10,000 terms will be indexed for a field.
++     */
++    public void setMaxFieldLength(int maxFieldLength);
++
++    /**
++     * 
++     * 
++     * @see #setMaxFieldLength
++     */
++    public int getMaxFieldLength();
++
++    /**
++     * Determines the largest number of documents ever merged by addDocument().
++     * Small values (e.g., less than 10,000) are best for interactive indexing,
++     * as this limits the length of pauses while indexing to a few seconds.
++     * Larger values are best for batched indexing and speedier searches.
++     * 
++     * <p>The default value is {@link Integer#MAX_VALUE}.
++     */
++    public void setMaxMergeDocs(int maxMergeDocs);
++
++    /**
++     * 
++     * 
++     * @see #setMaxMergeDocs
++     */
++    public int getMaxMergeDocs();
++
++    /**
++     * Determines how often segment indices are merged by addDocument().  With
++     * smaller values, less RAM is used while indexing, and searches on
++     * unoptimized indices are faster, but indexing speed is slower.  With larger
++     * values, more RAM is used during indexing, and while searches on unoptimized
++     * indices are slower, indexing is faster.  Thus larger values (> 10) are best
++     * for batch index creation, and smaller values (< 10) for indices that are
++     * interactively maintained.
++     * 
++     * <p>This must never be less than 2.  The default value is 10.
++     */
++    public void setMergeFactor(int mergeFactor);
++
++    /**
++     * 
++     * 
++     * @see #setMergeFactor
++     */
++    public int getMergeFactor();
++
++    /**
++     * Sets the maximum time to wait for a write lock (in milliseconds).
++     */
++    public void setWriteLockTimeout(long writeLockTimeout);
++
++    /**
++     * 
++     * 
++     * @see #setWriteLockTimeout
++     */
++    public long getWriteLockTimeout();
++
++    /**
++     * Sets the maximum time to wait for a commit lock (in milliseconds).
++     */
++    public void setCommitLockTimeout(long commitLockTimeout);
++
++    /**
++     * 
++     * 
++     * @see #setCommitLockTimeout
++     */
++    public long getCommitLockTimeout();
++    
++    /** Expert:  Set the TokenSelector used to determine subset of tokens stored in term vectors.
++     * @param selector the term vector TokenSelector
++     */
++    public void setTermVectorTokenSelector(TokenSelector selector);
++    
++    /** Expert: Get the TokenSelector used to determine subset of tokens stored in term vectors.
++     * @return the TokenSelector used to determine term vector tokens
++     */
++    public TokenSelector getTermVectorTokenSelector();
++    
++    /** Expert:  Set the TokenSelector used to determine subset of tokens for which positions are stored.
++     *           (At least one position is always stored for each term in each doc to ensure the term stays in
++     *            the index so long as any docs reference it)
++     * @param selector the positions TokenSelector
++     */
++    public void setPositionsTokenSelector(TokenSelector selector);
++  
++    /** Expert: Get the TokenSelector used to determine subset of tokens for which freq and positions are stored.
++     *          (At least one position is always stored for each term in each doc to ensure the term stays in
++     *           the index so long as any docs reference it)
++     * @return the positions TokenSelector
++     */
++    public TokenSelector getPositionsTokenSelector();
++
++    /** If non-null, information about merges and a message when
++     * maxFieldLength is reached will be printed to this.
++     */
++    public void setInfoStream(PrintStream infoStream);
++
++    /**
++     * 
++     * 
++     * @see #setInfoStream
++     */
++    public PrintStream getInfoStream();
++
++}
+Index: src/java/org/apache/lucene/index/DocumentWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/DocumentWriter.java	(revision 414705)
++++ src/java/org/apache/lucene/index/DocumentWriter.java	(working copy)
+@@ -17,6 +17,7 @@
+  */
+ 
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.analysis.Token;
+ import org.apache.lucene.analysis.TokenStream;
+ import org.apache.lucene.document.Document;
+@@ -35,6 +36,8 @@
+ 
+ final class DocumentWriter {
+   private Analyzer analyzer;
++  private TokenSelector termVectorTokenSelector;
++  private TokenSelector positionsTokenSelector;
+   private Directory directory;
+   private Similarity similarity;
+   private FieldInfos fieldInfos;
+@@ -142,9 +145,9 @@
+         if (!field.isTokenized()) {		  // un-tokenized field
+           String stringValue = field.stringValue();
+           if(field.isStoreOffsetWithTermVector())
+-            addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
++            addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()), false, false);
+           else
+-            addPosition(fieldName, stringValue, position++, null);
++            addPosition(fieldName, stringValue, position++, null, false, false);
+           offset += stringValue.length();
+           length++;
+         } else 
+@@ -165,10 +168,16 @@
+             for (Token t = stream.next(); t != null; t = stream.next()) {
+               position += (t.getPositionIncrement() - 1);
+               
+-              if(field.isStoreOffsetWithTermVector())
+-                addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
+-              else
+-                addPosition(fieldName, t.termText(), position++, null);
++              boolean omittv = false, omitpos = false;
++              if (termVectorTokenSelector!=null && !termVectorTokenSelector.accept(field.name(), t))
++                  omittv  = true;
++              if (positionsTokenSelector !=null && !positionsTokenSelector. accept(field.name(), t))
++                  omitpos = true;
++              
++              addPosition(fieldName, t.termText(), position++,
++                          field.isStoreOffsetWithTermVector() && !omittv ? new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())
++                                                                         : null,
++                          omittv, omitpos);
+               
+               lastToken = t;
+               if (++length > maxFieldLength) {
+@@ -196,20 +205,24 @@
+ 
+   private final Term termBuffer = new Term("", ""); // avoid consing
+ 
+-  private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
++  private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset,
++                                 boolean omitFromTermVector, boolean omitPosition) {
+     termBuffer.set(field, text);
+     //System.out.println("Offset: " + offset);
+     Posting ti = (Posting) postingTable.get(termBuffer);
+     if (ti != null) {				  // word seen before
+       int freq = ti.freq;
+-      if (ti.positions.length == freq) {	  // positions array is full
+-        int[] newPositions = new int[freq * 2];	  // double size
+-        int[] positions = ti.positions;
+-        for (int i = 0; i < freq; i++)		  // copy old positions to new
+-          newPositions[i] = positions[i];
+-        ti.positions = newPositions;
++      
++      if (!omitPosition) {
++        if (ti.positions.length == freq) {        // positions array is full
++          int[] newPositions = new int[freq * 2]; // double size
++          int[] positions = ti.positions;
++          for (int i = 0; i < freq; i++)          // copy old positions to new
++            newPositions[i] = positions[i];
++          ti.positions = newPositions;
++        }
++        ti.positions[freq] = position;            // add new position
+       }
+-      ti.positions[freq] = position;		  // add new position
+ 
+       if (offset != null) {
+         if (ti.offsets.length == freq){
+@@ -223,10 +236,12 @@
+         }
+         ti.offsets[freq] = offset;
+       }
+-      ti.freq = freq + 1;			  // update frequency
+-    } else {					  // word not seen before
++      
++      if (!omitPosition)
++        ti.freq = freq + 1;                       // update frequency
++    } else {                                      // word not seen before
+       Term term = new Term(field, text, false);
+-      postingTable.put(term, new Posting(term, position, offset));
++      postingTable.put(term, new Posting(term, position, offset, omitFromTermVector));
+     }
+   }
+ 
+@@ -351,7 +366,7 @@
+             termVectorWriter.closeField();
+           }
+         }
+-        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
++        if (termVectorWriter != null && termVectorWriter.isFieldOpen() && !posting.omitFromTermVector) {
+             termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
+         }
+       }
+@@ -390,6 +405,16 @@
+     this.infoStream = infoStream;
+   }
+ 
++  /** If non-null, this will be used to select which tokens are stored in term vectors */
++  void setTermVectorTokenSelector(TokenSelector selector) {
++    this.termVectorTokenSelector = selector;
++  }
++
++  /** If non-null, this will be used to select which tokens have positions stored in the index. */
++  void setPositionsTokenSelector(TokenSelector selector) {
++    this.positionsTokenSelector = selector;
++  }
++
+ }
+ 
+ final class Posting {				  // info about a Term in a doc
+@@ -397,17 +422,17 @@
+   int freq;					  // its frequency in doc
+   int[] positions;				  // positions it occurs at
+   TermVectorOffsetInfo [] offsets;
++  boolean omitFromTermVector;                     // if true, omit from term vector
+ 
+-  Posting(Term t, int position, TermVectorOffsetInfo offset) {
++  Posting(Term t, int position, TermVectorOffsetInfo offset, boolean omitFromTermVector) {
+     term = t;
+     freq = 1;
+     positions = new int[1];
+     positions[0] = position;
+-    if(offset != null){
+-    offsets = new TermVectorOffsetInfo[1];
+-    offsets[0] = offset;
++    if(offset != null) {
++      offsets = new TermVectorOffsetInfo[1];
++      offsets[0] = offset;
+     }
+-    else
+-      offsets = null;
++    this.omitFromTermVector = omitFromTermVector;
+   }
+ }
+Index: src/java/org/apache/lucene/index/ParallelWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/ParallelWriter.java	(revision 0)
++++ src/java/org/apache/lucene/index/ParallelWriter.java	(revision 0)
+@@ -0,0 +1,345 @@
++/*
++ * ParallelWriter.java
++ *
++ * Created on April 28, 2006, 7:07 PM
++ *
++ */
++
++package org.apache.lucene.index;
++
++import java.io.IOException;
++import java.io.PrintStream;
++import java.util.Enumeration;
++import java.util.HashMap;
++import java.util.List;
++import java.util.Map;
++import java.util.concurrent.CountDownLatch;
++import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
++import org.apache.lucene.document.Document;
++import org.apache.lucene.document.Field;
++import org.apache.lucene.search.Similarity;
++import org.apache.lucene.store.Directory;
++
++/**
++ * ParallelWriter is a companion to ParallelReader, although as with IndexWriter it only supports indexes stored in a Directory.
++ * The interface is at the field level.  A map from directories to lists of fields is provided to create the ParallelWriter,
++ * which then creates an IndexWriter for each specified directory and operates on each field of a document using the IndexWriter
++ * for the directory to which that field is mapped.  This mapping allows an application to configure its use of parallel sub-
++ * indexes independently from the rest of its processing.
++ *
++ * This implementation single-threads calls to addDocument(), but does the sub-document writes in parallel. Users of this class
++ * must ensure that the ParallelReader is never reopened while adding a new document, and must deal with recovery if exceptions
++ * occur while adding a document.
++ *
++ * @author Chuck Williams
++ */
++public class ParallelWriter implements Writable {
++    
++    IndexWriter[] writers;                // All IndexWriters
++    IOException exception;                // If any writer gets an exception, this is stored here (only one needed)
++    Map<String,IndexWriter> writerMap;    // Field name --> IndexWriter that stores that field
++    IndexWriter oneWriter;                // An arbitrarily chosen IndexWriter -- used to get config info which is the same for all IndexWriters
++    Analyzer analyzer;                    // The Analyzer applied to all tokenized field content
++    
++    private static final Document EMPTY_DOCUMENT = new Document();  // Empty document used to sync doc id's when a document is added without fields for all indexes
++    
++    /**
++     * Create a new ParallelWriter
++     * 
++     * @param directoryFieldsMap specifies the directory to use to store each field, multiple directories creating parallel indexes
++     * @param analyzer applied to all tokenized field content
++     * @param create create new indexes in directories iff true
++     * @throws IOException if the IndexWriters cannot be created
++     */
++    public ParallelWriter(Map<Directory,List<String>> directoryFieldsMap, Analyzer analyzer, boolean create) throws IOException {
++        this.analyzer = analyzer;
++        writers = new IndexWriter[directoryFieldsMap.size()];
++        writerMap = new HashMap<String,IndexWriter>(directoryFieldsMap.size()*5/3);
++        int i=0;
++        for (Map.Entry<Directory,List<String>> entry : directoryFieldsMap.entrySet()) {
++            IndexWriter writer = new IndexWriter(entry.getKey(), analyzer, create);
++            writers[i++] = oneWriter = writer;
++            for (String field : entry.getValue())
++                writerMap.put(field, writer);
++        }
++    }
++    
++    /** Invert a directoryFieldsMap
++     * @param directoryFieldsMap a map from directories to lists of fields they contain
++     * @return a map from each field to its directory
++     */
++    public static Map<String, Directory> invertDirectoryFieldsMap(Map<Directory,List<String>> directoryFieldsMap) {
++        Map<String, Directory> fieldDirectoryMap = new HashMap<String, Directory>();
++        for (Map.Entry<Directory, List<String>> entry : directoryFieldsMap.entrySet())
++            for (String field : entry.getValue())
++                fieldDirectoryMap.put(field, entry.getKey());
++        return fieldDirectoryMap;
++    }
++
++    /** Add document to this index by adding subdocuments with the mapped fields for each parallel index.  This method is synchronized because
++     *  the parallel indexes must be maintained such that equal doc id's in different indexes hold fields for the same document.
++     *  This synchronization could have a negative effect on batch indexing performance.  Users of this method must ensure that the ParallelReader
++     *  is not re-opened within the scope of this method as it would likely find the sub-indexes out of sync.
++     * @param doc the document to add
++     * @throws IOException if there are problems writing the indexes.  <strong>WARNING:  If this happens it is bad.</strong>  The doc-id's in the
++     *                     indexes are likely out of sync.  This situation requires repair to resync the doc ids in each document set.  Possible
++     *                     repair actions include rebuilding the indexes or deleting documents at the end to restore equal document sets and then
++     *                     optimizing to restore equal doc ids.
++     * @throws RuntimeException if the threads writing to the sub-indexes are interrupted.
++     */
++    public void addDocument(Document doc) throws IOException {
++        addDocument(doc, analyzer);
++    }
++
++    /** Add document to this index by adding subdocuments with the mapped fields for each parallel index.  This method is synchronized because
++     *  the parallel indexes must be maintained such that equal doc id's in different indexes hold fields for the same document.
++     *  This synchronization could have a negative effect on batch indexing performance.  Users of this method must ensure that the ParallelReader
++     *  is not re-opened within the scope of this method as it would likely find the sub-indexes out of sync.
++     * @param doc the document to add
++     * @param analyzer apply special analyzer to this document rather than the one for the index (discouraged -- use addDocument(doc))
++     * @throws IOException if there are problems writing the indexes.  <strong>WARNING:  If this happens it is bad.</strong>  The doc-id's in the
++     *                     indexes are likely out of sync.  This situation requires repair to resync the doc ids in each document set.  Possible
++     *                     repair actions include rebuilding the indexes or deleting documents at the end to restore equal document sets and then
++     *                     optimizing to restore equal doc ids.
++     * @throws RuntimeException if the threads writing to the sub-indexes are interrupted.
++     */
++    public synchronized void addDocument(Document doc, Analyzer analyzer) throws IOException {
++        Map<IndexWriter,Document> documentMap = new HashMap<IndexWriter,Document>(writers.length*5/3);
++        Enumeration<Field> fields = doc.fields();
++        while (fields.hasMoreElements()) {
++            Field field = fields.nextElement();
++            IndexWriter writer = writerMap.get(field.name());
++            if (writer==null)
++                throw new RuntimeException(new UnknownFieldException("Unregistered field:  " + field.name()));
++            Document subdoc = documentMap.get(writer);
++            if (subdoc==null)
++                documentMap.put(writer, subdoc = new Document());
++            subdoc.add(field);
++        }
++        CountDownLatch latch = new CountDownLatch(writers.length);
++        exception = null;
++        for (IndexWriter writer : writers) {
++            Document subdoc = documentMap.get(writer);
++            if (subdoc==null)      // Must have a document in each parallel index to sync doc id's
++                subdoc = EMPTY_DOCUMENT;
++            new Thread(new WriterWorker(writer, subdoc, latch)).start();  // start() so the sub-index writes actually run on separate threads
++        }
++        try {
++            latch.await();
++        } catch (InterruptedException e) {
++            throw new RuntimeException("Interrupted while writing subdocuments!", e);
++        }
++        if (exception != null)
++            throw exception;
++    }
++    
++    // Write a sub-document to a sub-index and record any exception
++    private class WriterWorker implements Runnable {
++        
++        private IndexWriter writer;
++        private Document document;
++        private CountDownLatch latch;
++        
++        private WriterWorker(IndexWriter writer, Document document, CountDownLatch latch) {
++            this.writer = writer;
++            this.document = document;
++            this.latch = latch;
++        }
++        
++        public void run() {
++            try {
++                writer.addDocument(document);
++            } catch (IOException e) {
++                exception = e;
++            } finally {
++                latch.countDown();
++            }
++        }
++        
++    }
++    
++    /** Obtain the number of documents in this index, which is the same for each parallel index. */
++    public int docCount() {
++        return oneWriter.docCount();
++    }
++
++    /** Optimize all parallel indexes.  This is synchronized to keep all index doc-id's synced up */
++    public synchronized void optimize() throws IOException {
++        for (IndexWriter writer : writers)
++            writer.optimize();
++    }
++
++    /** Close all parallel indexes.  Note that the provided directories are not closed. Synchronized. */
++    public synchronized void close() throws IOException {
++        for (IndexWriter writer : writers)
++            writer.close();
++    }
++
++    /** Getter for analyzer provided to the constructor */
++    public Analyzer getAnalyzer() {
++        return analyzer;
++    }
++
++    /** Set whether or not to use compound file format in every parallel index */
++    public void setUseCompoundFile(boolean value) {
++        for (IndexWriter writer : writers)
++            writer.setUseCompoundFile(value);
++    }
++
++    /** Get the compound file usage decision, same for every parallel index */
++    public boolean getUseCompoundFile() {
++        return oneWriter.getUseCompoundFile();
++    }
++
++    /** Set similarity to use for every parallel index */
++    public void setSimilarity(Similarity similarity) {
++        for (IndexWriter writer : writers)
++            writer.setSimilarity(similarity);
++    }
++
++    /** Get similarity, which is used by every parallel index */
++    public Similarity getSimilarity() {
++        return oneWriter.getSimilarity();
++    }
++
++    /** Set the termIndexInterval used for every parallel index */
++    public void setTermIndexInterval(int interval) {
++        for (IndexWriter writer : writers)
++            writer.setTermIndexInterval(interval);
++    }
++
++    /** Get the termIndexInterval, which is used by every parallel index */
++    public int getTermIndexInterval() {
++        return oneWriter.getTermIndexInterval();
++    }
++
++    /** Set maxBufferedDocs for every parallel index */
++    public void setMaxBufferedDocs(int maxBufferedDocs) {
++        for (IndexWriter writer : writers)
++            writer.setMaxBufferedDocs(maxBufferedDocs);
++    }
++
++    /** get maxBufferedDocs, same for every parallel index */
++    public int getMaxBufferedDocs() {
++        return oneWriter.getMaxBufferedDocs();
++    }
++
++    /** Set maxFieldLength to use for every parallel index */
++    public void setMaxFieldLength(int maxFieldLength) {
++        for (IndexWriter writer : writers)
++            writer.setMaxFieldLength(maxFieldLength);
++    }
++
++    /** Get maxFieldLength, same for every parallel index */
++    public int getMaxFieldLength() {
++        return oneWriter.getMaxFieldLength();
++    }
++
++    /** Set maxMergeDocs for every parallel index */
++    public void setMaxMergeDocs(int maxMergeDocs) {
++        for (IndexWriter writer : writers)
++            writer.setMaxMergeDocs(maxMergeDocs);
++    }
++
++    /** Get max merge docs, same for every parallel index */
++    public int getMaxMergeDocs() {
++        return oneWriter.getMaxMergeDocs();
++    }
++
++    /** Set merge factor for every parallel index */
++    public void setMergeFactor(int mergeFactor) {
++        for (IndexWriter writer : writers)
++            writer.setMergeFactor(mergeFactor);
++    }
++
++    /** Get merge factor, same for every parallel index */
++    public int getMergeFactor() {
++        return oneWriter.getMergeFactor();
++    }
++
++    /** Set write lock timeout (millis) for every parallel index */
++    public void setWriteLockTimeout(long writeLockTimeout) {
++        for (IndexWriter writer : writers)
++            writer.setWriteLockTimeout(writeLockTimeout);
++    }
++
++    /** Get write lock timeout, same for every parallel index */
++    public long getWriteLockTimeout() {
++        return oneWriter.getWriteLockTimeout();
++    }
++
++    /** Set commit lock timeout for every parallel index */
++    public void setCommitLockTimeout(long commitLockTimeout) {
++        for (IndexWriter writer : writers)
++            writer.setCommitLockTimeout(commitLockTimeout);
++    }
++
++    /** Get commit lock timeout, same for every parallel index */
++    public long getCommitLockTimeout() {
++        return oneWriter.getCommitLockTimeout();
++    }
++
++    /** Set term vector TokenSelector for every parallel index */
++    public void setTermVectorTokenSelector(TokenSelector selector) {
++        for (IndexWriter writer : writers)
++            writer.setTermVectorTokenSelector(selector);
++    }
++
++    /** Get term vector TokenSelector, same for every parallel index */
++    public TokenSelector getTermVectorTokenSelector() {
++        return oneWriter.getTermVectorTokenSelector();
++    }
++
++    /** Set positions TokenSelector for every parallel index */
++    public void setPositionsTokenSelector(TokenSelector selector) {
++        for (IndexWriter writer : writers)
++            writer.setPositionsTokenSelector(selector);
++    }
++  
++    /** Get positions TokenSelector, same for every parallel index */
++    public TokenSelector getPositionsTokenSelector() {
++        return oneWriter.getPositionsTokenSelector();
++    }
++
++    /** Unsupported.  Use setInfoStream(field, infoStream) instead. */
++    public void setInfoStream(PrintStream infoStream) {
++        throw new UnsupportedOperationException();
++    }
++
++    /** Set an info stream for the IndexWriter managing a specified field.  The info stream receives information about field truncations, merges, etc.
++     * @param field the field whose writer to assign the info stream to
++     * @param infoStream the info stream
++     * @throws UnknownFieldException if field has not been associated with an IndexWriter in this index
++     */
++    public void setInfoStream(String field, PrintStream infoStream) throws UnknownFieldException {
++        IndexWriter writer = writerMap.get(field);
++        if (writer==null)
++            throw new UnknownFieldException("Unregistered field:  " + field);
++        writer.setInfoStream(infoStream);
++        
++    }
++
++    /** Unsupported.  Use getInfoStream(field) instead. */
++    public PrintStream getInfoStream() {
++        throw new UnsupportedOperationException();
++    }
++    
++    /** Get the info stream associated with the IndexWriter managing the specified field.  See setInfoStream(field, infoStream).
++     */
++    public PrintStream getInfoStream(String field) throws UnknownFieldException {
++        IndexWriter writer = writerMap.get(field);
++        if (writer==null)
++            throw new UnknownFieldException("Unregistered field:  " + field);
++        return writer.getInfoStream();
++    }
++
++    private static class UnknownFieldException extends Exception {
++        
++        private UnknownFieldException(String message) {
++            super(message);
++        }
++        
++    }
++    
++}
diff --git a/attachments/LUCENE-602/TokenSelectorSoloAll.patch b/attachments/LUCENE-602/TokenSelectorSoloAll.patch
new file mode 100644
index 0000000..2818b50
--- /dev/null
+++ b/attachments/LUCENE-602/TokenSelectorSoloAll.patch
@@ -0,0 +1,374 @@
+Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
+===================================================================
+--- src/test/org/apache/lucene/index/TestDocumentWriter.java	(revision 414705)
++++ src/test/org/apache/lucene/index/TestDocumentWriter.java	(working copy)
+@@ -16,11 +16,15 @@
+  * limitations under the License.
+  */
+ 
++import java.util.LinkedList;
++import java.util.List;
+ import junit.framework.TestCase;
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.Token;
+ import org.apache.lucene.analysis.WhitespaceAnalyzer;
+ import org.apache.lucene.analysis.TokenStream;
+ import org.apache.lucene.analysis.WhitespaceTokenizer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.document.*;
+ import org.apache.lucene.search.Similarity;
+ import org.apache.lucene.store.RAMDirectory;
+@@ -54,6 +58,16 @@
+     Analyzer analyzer = new WhitespaceAnalyzer();
+     Similarity similarity = Similarity.getDefault();
+     DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
++    writer.setTermVectorTokenSelector(new TokenSelector(){
++      public boolean accept(String field, Token t) {
++        return Character.isLowerCase(t.termText().charAt(0));
++      }
++    });
++    writer.setPositionsTokenSelector(new TokenSelector(){
++      public boolean accept(String field, Token t) {
++        return Character.isLowerCase(t.termText().charAt(0));
++      }
++    });
+     String segName = "test";
+     writer.addDocument(segName, testDoc);
+     //After adding the document, we should be able to read it back in
+@@ -84,6 +98,31 @@
+     fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
+     assertTrue(fields != null && fields.length == 1);
+     assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
++    
++    fields = doc.getFields(DocHelper.TEXT_FIELD_UTF2_KEY);
++    assertTrue(fields != null && fields.length == 1);
++    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_UTF2_TEXT));
++    assertTrue(fields[0].isTermVectorStored());
++    TermFreqVector tv = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_UTF2_KEY);
++    assertTrue(tv != null);
++    String[] words = DocHelper.FIELD_UTF2_TEXT.split("\\s+");
++    String[] tvwords = tv.getTerms();
++    List uniques = new LinkedList();
++    int omitted = 0;
++    for (int i=0; i<words.length; i++)
++      if (!uniques.contains(words[i])) {
++        uniques.add(words[i]);
++        if (!Character.isLowerCase(words[i].charAt(0)))
++          omitted++;
++      }
++    assertTrue(omitted!=0);
++    assertTrue(omitted!=uniques.size());
++    assertEquals(uniques.size()-omitted, tvwords.length);
++    for (int i=0; i<uniques.size(); i++) {
++      for (int j=0; j<tvwords.length; j++)
++        if (uniques.get(i).equals(tvwords[j]))
++          assertTrue(Character.isLowerCase(((String)uniques.get(i)).charAt(0)));
++    }      
+ 
+     // test that the norm file is not present if omitNorms is true
+     for (int i = 0; i < reader.fieldInfos.size(); i++) {
+Index: src/java/org/apache/lucene/analysis/TokenSelector.java
+===================================================================
+--- src/java/org/apache/lucene/analysis/TokenSelector.java	(revision 0)
++++ src/java/org/apache/lucene/analysis/TokenSelector.java	(revision 0)
+@@ -0,0 +1,24 @@
++/*
++ * TokenSelector.java
++ *
++ * Created on June 13, 2006, 12:18 PM
++ *
++ */
++
++package org.apache.lucene.analysis;
++
++/**
++ * An interface for selecting a subset of a token stream
++ *
++ * @author Chuck Williams
++ */
++public interface TokenSelector {
++    
++  /** Determine if a token should be selected
++   * @param fieldName field in which token was found
++   * @param token a token
++   * @return true iff token should be selected
++   */
++  public boolean accept(String fieldName, Token token);
++    
++}
+Index: src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java
+===================================================================
+--- src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java	(revision 0)
++++ src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java	(revision 0)
+@@ -0,0 +1,44 @@
++/*
++ * PerFieldTokenSelectorWrapper.java
++ *
++ * Created on June 13, 2006, 4:09 PM
++ *
++ */
++
++package org.apache.lucene.analysis;
++
++import java.util.HashMap;
++import java.util.Map;
++
++/**
++ * Expert: TokenSelector that implements a mapping from field names to TokenSelectors
++ *
++ * @author Chuck Williams
++ */
++public class PerFieldTokenSelectorWrapper implements TokenSelector {
++  
++  private Map selectors = new HashMap();
++  private TokenSelector defaultSelector;
++  
++  /** Expert: Create a PerFieldTokenSelectorWrapper with the given default selector (null means select all) */
++  public PerFieldTokenSelectorWrapper(TokenSelector defaultSelector) {
++    this.defaultSelector = defaultSelector;
++  }
++  
++  /** Add a token selector for the named field */
++  public void addSelector(String fieldName, TokenSelector selector) {
++    selectors.put(fieldName, selector);
++  }
++  
++  /** Determine if the token is accepted, using the selector registered for fieldName or else the default selector */
++  public boolean accept(String fieldName, Token token) {
++    TokenSelector selector = (TokenSelector) selectors.get(fieldName);
++    if (selector!=null)
++        return selector.accept(fieldName, token);
++    else if (defaultSelector!=null)
++        return defaultSelector.accept(fieldName, token);
++    else
++        return true;
++  }
++    
++}
+\ No newline at end of file
+Index: src/java/org/apache/lucene/index/IndexWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/IndexWriter.java	(revision 414705)
++++ src/java/org/apache/lucene/index/IndexWriter.java	(working copy)
+@@ -17,6 +17,7 @@
+  */
+ 
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.document.Document;
+ import org.apache.lucene.search.Similarity;
+ import org.apache.lucene.store.Directory;
+@@ -100,8 +101,10 @@
+    */
+   public final static int DEFAULT_TERM_INDEX_INTERVAL = 128;
+   
+-  private Directory directory;  // where this index resides
+-  private Analyzer analyzer;    // how to analyze text
++  private Directory directory;                      // where this index resides
++  private Analyzer analyzer;                        // how to analyze text
++  private TokenSelector termVectorTokenSelector;    // subset of token stream stored in term vectors
++  private TokenSelector positionsTokenSelector;     // subset of token stream for which positions are stored
+ 
+   private Similarity similarity = Similarity.getDefault(); // how to normalize
+ 
+@@ -153,6 +156,38 @@
+     return this.similarity;
+   }
+ 
++  /** Expert:  Set the TokenSelector used to determine subset of tokens stored in term vectors.
++   * @param selector the term vector TokenSelector
++   */
++  public void setTermVectorTokenSelector(TokenSelector selector) {
++    this.termVectorTokenSelector = selector;
++  }
++  
++  /** Expert: Get the TokenSelector used to determine subset of tokens stored in term vectors.
++   * @return the TokenSelector used to determine term vector tokens
++   */
++  public TokenSelector getTermVectorTokenSelector() {
++    return termVectorTokenSelector;
++  }
++
++  /** Expert:  Set the TokenSelector used to determine subset of tokens for which positions are stored.
++   *           (At least one position is always stored for each term in each doc to ensure the term stays in
++   *            the index so long as any docs reference it)
++   * @param selector the positions TokenSelector
++   */
++  public void setPositionsTokenSelector(TokenSelector selector) {
++    this.positionsTokenSelector = selector;
++  }
++  
++  /** Expert: Get the TokenSelector used to determine subset of tokens for which positions are stored.
++   *          (At least one position is always stored for each term in each doc to ensure the term stays in
++   *           the index so long as any docs reference it)
++   * @return the positions TokenSelector
++   */
++  public TokenSelector getPositionsTokenSelector() {
++    return positionsTokenSelector;
++  }
++
+   /** Expert: Set the interval between indexed terms.  Large values cause less
+    * memory to be used by IndexReader, but slow random-access to terms.  Small
+    * values cause more memory to be used by an IndexReader, and speed
+@@ -471,6 +506,8 @@
+   public void addDocument(Document doc, Analyzer analyzer) throws IOException {
+     DocumentWriter dw =
+       new DocumentWriter(ramDirectory, analyzer, this);
++    dw.setTermVectorTokenSelector(termVectorTokenSelector);
++    dw.setPositionsTokenSelector(positionsTokenSelector);
+     dw.setInfoStream(infoStream);
+     String segmentName = newSegmentName();
+     dw.addDocument(segmentName, doc);
+Index: src/java/org/apache/lucene/index/DocumentWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/DocumentWriter.java	(revision 414705)
++++ src/java/org/apache/lucene/index/DocumentWriter.java	(working copy)
+@@ -17,6 +17,7 @@
+  */
+ 
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.analysis.Token;
+ import org.apache.lucene.analysis.TokenStream;
+ import org.apache.lucene.document.Document;
+@@ -35,6 +36,8 @@
+ 
+ final class DocumentWriter {
+   private Analyzer analyzer;
++  private TokenSelector termVectorTokenSelector;
++  private TokenSelector positionsTokenSelector;
+   private Directory directory;
+   private Similarity similarity;
+   private FieldInfos fieldInfos;
+@@ -142,9 +145,9 @@
+         if (!field.isTokenized()) {		  // un-tokenized field
+           String stringValue = field.stringValue();
+           if(field.isStoreOffsetWithTermVector())
+-            addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
++            addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()), false, false);
+           else
+-            addPosition(fieldName, stringValue, position++, null);
++            addPosition(fieldName, stringValue, position++, null, false, false);
+           offset += stringValue.length();
+           length++;
+         } else 
+@@ -165,10 +168,16 @@
+             for (Token t = stream.next(); t != null; t = stream.next()) {
+               position += (t.getPositionIncrement() - 1);
+               
+-              if(field.isStoreOffsetWithTermVector())
+-                addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
+-              else
+-                addPosition(fieldName, t.termText(), position++, null);
++              boolean omittv = false, omitpos = false;
++              if (termVectorTokenSelector!=null && !termVectorTokenSelector.accept(field.name(), t))
++                  omittv  = true;
++              if (positionsTokenSelector !=null && !positionsTokenSelector. accept(field.name(), t))
++                  omitpos = true;
++              
++              addPosition(fieldName, t.termText(), position++,
++                          field.isStoreOffsetWithTermVector() && !omittv ? new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())
++                                                                         : null,
++                          omittv, omitpos);
+               
+               lastToken = t;
+               if (++length > maxFieldLength) {
+@@ -196,20 +205,24 @@
+ 
+   private final Term termBuffer = new Term("", ""); // avoid consing
+ 
+-  private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
++  private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset,
++                                 boolean omitFromTermVector, boolean omitPosition) {
+     termBuffer.set(field, text);
+     //System.out.println("Offset: " + offset);
+     Posting ti = (Posting) postingTable.get(termBuffer);
+     if (ti != null) {				  // word seen before
+       int freq = ti.freq;
+-      if (ti.positions.length == freq) {	  // positions array is full
+-        int[] newPositions = new int[freq * 2];	  // double size
+-        int[] positions = ti.positions;
+-        for (int i = 0; i < freq; i++)		  // copy old positions to new
+-          newPositions[i] = positions[i];
+-        ti.positions = newPositions;
++      
++      if (!omitPosition) {
++        if (ti.positions.length == freq) {        // positions array is full
++          int[] newPositions = new int[freq * 2]; // double size
++          int[] positions = ti.positions;
++          for (int i = 0; i < freq; i++)          // copy old positions to new
++            newPositions[i] = positions[i];
++          ti.positions = newPositions;
++        }
++        ti.positions[freq] = position;            // add new position
+       }
+-      ti.positions[freq] = position;		  // add new position
+ 
+       if (offset != null) {
+         if (ti.offsets.length == freq){
+@@ -223,10 +236,12 @@
+         }
+         ti.offsets[freq] = offset;
+       }
+-      ti.freq = freq + 1;			  // update frequency
+-    } else {					  // word not seen before
++      
++      if (!omitPosition)
++        ti.freq = freq + 1;                       // update frequency
++    } else {                                      // word not seen before
+       Term term = new Term(field, text, false);
+-      postingTable.put(term, new Posting(term, position, offset));
++      postingTable.put(term, new Posting(term, position, offset, omitFromTermVector));
+     }
+   }
+ 
+@@ -351,7 +366,7 @@
+             termVectorWriter.closeField();
+           }
+         }
+-        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
++        if (termVectorWriter != null && termVectorWriter.isFieldOpen() && !posting.omitFromTermVector) {
+             termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
+         }
+       }
+@@ -390,6 +405,16 @@
+     this.infoStream = infoStream;
+   }
+ 
++  /** If non-null, this will be used to select which tokens are stored in term vectors */
++  void setTermVectorTokenSelector(TokenSelector selector) {
++    this.termVectorTokenSelector = selector;
++  }
++
++  /** If non-null, this will be used to select which tokens have positions stored in the index. */
++  void setPositionsTokenSelector(TokenSelector selector) {
++    this.positionsTokenSelector = selector;
++  }
++
+ }
+ 
+ final class Posting {				  // info about a Term in a doc
+@@ -397,17 +422,17 @@
+   int freq;					  // its frequency in doc
+   int[] positions;				  // positions it occurs at
+   TermVectorOffsetInfo [] offsets;
++  boolean omitFromTermVector;                     // if true, omit from term vector
+ 
+-  Posting(Term t, int position, TermVectorOffsetInfo offset) {
++  Posting(Term t, int position, TermVectorOffsetInfo offset, boolean omitFromTermVector) {
+     term = t;
+     freq = 1;
+     positions = new int[1];
+     positions[0] = position;
+-    if(offset != null){
+-    offsets = new TermVectorOffsetInfo[1];
+-    offsets[0] = offset;
++    if(offset != null) {
++      offsets = new TermVectorOffsetInfo[1];
++      offsets[0] = offset;
+     }
+-    else
+-      offsets = null;
++    this.omitFromTermVector = omitFromTermVector;
+   }
+ }