Posted to commits@lucene.apache.org by to...@apache.org on 2022/06/30 13:51:51 UTC
[lucene-jira-archive] branch main updated: add attachments
This is an automated email from the ASF dual-hosted git repository.
tomoko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene-jira-archive.git
The following commit(s) were added to refs/heads/main by this push:
new 54f19fd add attachments
54f19fd is described below
commit 54f19fdec1103ea362301624c2bef42211e003a8
Author: Tomoko Uchida <to...@gmail.com>
AuthorDate: Thu Jun 30 22:44:29 2022 +0900
add attachments
---
.../Screen Shot 2022-06-29 at 11.02.35 AM.png | Bin 0 -> 227026 bytes
.../LUCENE-10557/image-2022-06-29-13-36-57-365.png | Bin 0 -> 151562 bytes
attachments/LUCENE-10557/screenshot-1.png | Bin 0 -> 163280 bytes
.../TokenSelectorAllWithParallelWriter.patch | 1424 ++++++++++++++++++++
attachments/LUCENE-602/TokenSelectorSoloAll.patch | 374 +++++
5 files changed, 1798 insertions(+)
diff --git a/attachments/LUCENE-10557/Screen Shot 2022-06-29 at 11.02.35 AM.png b/attachments/LUCENE-10557/Screen Shot 2022-06-29 at 11.02.35 AM.png
new file mode 100644
index 0000000..63ac13d
Binary files /dev/null and b/attachments/LUCENE-10557/Screen Shot 2022-06-29 at 11.02.35 AM.png differ
diff --git a/attachments/LUCENE-10557/image-2022-06-29-13-36-57-365.png b/attachments/LUCENE-10557/image-2022-06-29-13-36-57-365.png
new file mode 100644
index 0000000..b9b9df0
Binary files /dev/null and b/attachments/LUCENE-10557/image-2022-06-29-13-36-57-365.png differ
diff --git a/attachments/LUCENE-10557/screenshot-1.png b/attachments/LUCENE-10557/screenshot-1.png
new file mode 100644
index 0000000..d319cca
Binary files /dev/null and b/attachments/LUCENE-10557/screenshot-1.png differ
diff --git a/attachments/LUCENE-602/TokenSelectorAllWithParallelWriter.patch b/attachments/LUCENE-602/TokenSelectorAllWithParallelWriter.patch
new file mode 100644
index 0000000..ffa3a22
--- /dev/null
+++ b/attachments/LUCENE-602/TokenSelectorAllWithParallelWriter.patch
@@ -0,0 +1,1424 @@
+Index: common-build.xml
+===================================================================
+--- common-build.xml (revision 414705)
++++ common-build.xml (working copy)
+@@ -28,8 +28,8 @@
+
+ <property name="javac.deprecation" value="off"/>
+ <property name="javac.debug" value="on"/>
+- <property name="javac.source" value="1.4"/>
+- <property name="javac.target" value="1.4"/>
++ <property name="javac.source" value="1.5"/>
++ <property name="javac.target" value="1.5"/>
+
+ <property name="project.name" value="site"/> <!-- todo: is this used by anakia or something else? -->
+ <property name="build.encoding" value="utf-8"/>
+Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
+===================================================================
+--- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 414705)
++++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
+@@ -16,11 +16,15 @@
+ * limitations under the License.
+ */
+
++import java.util.LinkedList;
++import java.util.List;
+ import junit.framework.TestCase;
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.Token;
+ import org.apache.lucene.analysis.WhitespaceAnalyzer;
+ import org.apache.lucene.analysis.TokenStream;
+ import org.apache.lucene.analysis.WhitespaceTokenizer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.document.*;
+ import org.apache.lucene.search.Similarity;
+ import org.apache.lucene.store.RAMDirectory;
+@@ -54,6 +58,16 @@
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ Similarity similarity = Similarity.getDefault();
+ DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
++ writer.setTermVectorTokenSelector(new TokenSelector(){
++ public boolean accept(String field, Token t) {
++ return Character.isLowerCase(t.termText().charAt(0));
++ }
++ });
++ writer.setPositionsTokenSelector(new TokenSelector(){
++ public boolean accept(String field, Token t) {
++ return Character.isLowerCase(t.termText().charAt(0));
++ }
++ });
+ String segName = "test";
+ writer.addDocument(segName, testDoc);
+ //After adding the document, we should be able to read it back in
+@@ -84,6 +98,31 @@
+ fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
++
++ fields = doc.getFields(DocHelper.TEXT_FIELD_UTF2_KEY);
++ assertTrue(fields != null && fields.length == 1);
++ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_UTF2_TEXT));
++ assertTrue(fields[0].isTermVectorStored());
++ TermFreqVector tv = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_UTF2_KEY);
++ assertTrue(tv != null);
++ String[] words = DocHelper.FIELD_UTF2_TEXT.split("\\s+");
++ String[] tvwords = tv.getTerms();
++ List uniques = new LinkedList();
++ int omitted = 0;
++ for (int i=0; i<words.length; i++)
++ if (!uniques.contains(words[i])) {
++ uniques.add(words[i]);
++ if (!Character.isLowerCase(words[i].charAt(0)))
++ omitted++;
++ }
++ assertTrue(omitted!=0);
++ assertTrue(omitted!=uniques.size());
++ assertEquals(uniques.size()-omitted, tvwords.length);
++ for (int i=0; i<uniques.size(); i++) {
++ for (int j=0; j<tvwords.length; j++)
++ if (uniques.get(i).equals(tvwords[j]))
++ assertTrue(Character.isLowerCase(((String)uniques.get(i)).charAt(0)));
++ }
+
+ // test that the norm file is not present if omitNorms is true
+ for (int i = 0; i < reader.fieldInfos.size(); i++) {
+Index: src/test/org/apache/lucene/index/TestParallelWriter.java
+===================================================================
+--- src/test/org/apache/lucene/index/TestParallelWriter.java (revision 0)
++++ src/test/org/apache/lucene/index/TestParallelWriter.java (revision 0)
+@@ -0,0 +1,151 @@
++/*
++ * TestParallelWriter.java
++ * JUnit based test
++ *
++ * Created on April 30, 2006, 12:34 PM
++ */
++
++package org.apache.lucene.index;
++
++import java.util.Arrays;
++import junit.framework.*;
++import java.io.IOException;
++import java.io.PrintStream;
++import java.util.ArrayList;
++import java.util.Enumeration;
++import java.util.HashMap;
++import java.util.HashSet;
++import java.util.List;
++import java.util.Map;
++import java.util.Set;
++import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.standard.StandardAnalyzer;
++import org.apache.lucene.document.Document;
++import org.apache.lucene.document.Field;
++import org.apache.lucene.search.Hits;
++import org.apache.lucene.search.IndexSearcher;
++import org.apache.lucene.search.Similarity;
++import org.apache.lucene.search.TermQuery;
++import org.apache.lucene.store.Directory;
++import org.apache.lucene.store.RAMDirectory;
++
++/**
++ *
++ * @author Chuck Williams
++ */
++public class TestParallelWriter extends TestCase {
++
++ ParallelWriter writer;
++ Directory[] directories;
++ Map<Directory, List<String>> fieldDirectories = new HashMap<Directory, List<String>>();
++ ParallelReader reader;
++ IndexSearcher searcher;
++
++ public TestParallelWriter(String testName) {
++ super(testName);
++ }
++
++ protected void setUp() throws Exception {
++ directories = new Directory[] { new RAMDirectory(), new RAMDirectory(), new RAMDirectory() };
++ fieldDirectories.put(directories[0], Arrays.asList("title", "body"));
++ fieldDirectories.put(directories[1], Arrays.asList("markup"));
++ fieldDirectories.put(directories[2], Arrays.asList("meta"));
++
++ openWriter(true);
++
++ Document doc1 = new Document();
++ doc1.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED));
++ doc1.add(new Field("body", "The quick brown fox jumped over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
++ doc1.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED));
++ writer.addDocument(doc1);
++
++ Document doc2 = new Document();
++ doc2.add(new Field("title", "Galaxies", Field.Store.YES, Field.Index.TOKENIZED));
++ doc2.add(new Field("body", "Once upon a time in a galaxy far far away", Field.Store.NO, Field.Index.TOKENIZED));
++ doc2.add(new Field("meta", "Space", Field.Store.YES, Field.Index.UN_TOKENIZED));
++ writer.addDocument(doc2);
++
++ closeWriter();
++
++ openWriter(false);
++ openReader();
++ }
++
++ private void openWriter(boolean create) throws IOException {
++ writer = new ParallelWriter(fieldDirectories, new StandardAnalyzer(), create);
++ }
++
++ private void closeWriter() throws IOException {
++ writer.close();
++ }
++
++ private void openReader() throws IOException {
++ reader = new ParallelReader();
++ for (Directory dir : directories)
++ reader.add(IndexReader.open(dir));
++ searcher = new IndexSearcher(reader);
++ }
++
++ private void closeReader() throws IOException {
++ searcher.close();
++ reader.close();
++ }
++
++ protected void tearDown() throws Exception {
++ writer.close();
++ reader.close();
++ for (Directory dir : directories)
++ dir.close();
++ }
++
++ public static Test suite() {
++ TestSuite suite = new TestSuite(TestParallelWriter.class);
++
++ return suite;
++ }
++
++ /**
++ * Test of addDocument method, of class org.apache.lucene.index.ParallelWriter.
++ */
++ public void test() throws Exception {
++ System.out.println("Test ParallelWriter");
++
++ assertEquals(2, writer.docCount());
++ assertEquals(2, reader.numDocs());
++
++ Hits hits = searcher.search(new TermQuery(new Term("title", "foxes")));
++ assertEquals(1, hits.length());
++ Document doc = hits.doc(0);
++ assertEquals("Animals", doc.get("meta"));
++
++ hits = searcher.search(new TermQuery(new Term("body", "galaxy")));
++ assertEquals(1, hits.length());
++ doc = hits.doc(0);
++ assertEquals("Galaxies", doc.get("title"));
++ assertEquals("Space", doc.get("meta"));
++
++ closeWriter();
++ reader.deleteDocuments(new Term("title", "foxes"));
++ closeReader();
++
++ openWriter(false);
++ doc = new Document();
++ doc.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED));
++ doc.add(new Field("body", "The quick brown fox jumped over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
++ doc.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED));
++ doc.add(new Field("markup", "Interesting", Field.Store.YES, Field.Index.UN_TOKENIZED));
++ writer.addDocument(doc);
++
++ closeWriter();
++ openWriter(false);
++ openReader();
++
++ hits = searcher.search(new TermQuery(new Term("markup", "Interesting")));
++ assertEquals(1, hits.length());
++ doc = hits.doc(0);
++ assertEquals("Animals", doc.get("meta"));
++ assertEquals("Foxes", doc.get("title"));
++ assertEquals("Interesting", doc.get("markup"));
++ }
++
++}
+Index: src/java/org/apache/lucene/analysis/TokenSelector.java
+===================================================================
+--- src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0)
++++ src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0)
+@@ -0,0 +1,24 @@
++/*
++ * TokenSelector.java
++ *
++ * Created on June 13, 2006, 12:18 PM
++ *
++ */
++
++package org.apache.lucene.analysis;
++
++/**
++ * An interface for selecting a subset of a token stream
++ *
++ * @author Chuck Williams
++ */
++public interface TokenSelector {
++
++ /** Determine if a token should be selected
++ * @param fieldName field in which token was found
++ * @param token a token
++ * @return true iff token should be selected
++ */
++ public boolean accept(String fieldName, Token token);
++
++}
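
For illustration, a selector implementing the interface above might look like the following. This is a minimal sketch; the class name and the lower-case rule are illustrative assumptions (they mirror the anonymous selector used in TestDocumentWriter earlier in this patch), not part of the patch itself.

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenSelector;

    // Hypothetical selector: accept only tokens whose text starts with a
    // lower-case letter, regardless of field.
    public class LowerCaseTokenSelector implements TokenSelector {
      public boolean accept(String fieldName, Token token) {
        String text = token.termText();
        return text.length() > 0 && Character.isLowerCase(text.charAt(0));
      }
    }
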
+Index: src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java
+===================================================================
+--- src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0)
++++ src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0)
+@@ -0,0 +1,44 @@
++/*
++ * PerFieldTokenSelectorWrapper.java
++ *
++ * Created on June 13, 2006, 4:09 PM
++ *
++ */
++
++package org.apache.lucene.analysis;
++
++import java.util.HashMap;
++import java.util.Map;
++
++/**
++ * Expert: TokenSelector that implements a mapping from field names to TokenSelectors
++ *
++ * @author Chuck Williams
++ */
++public class PerFieldTokenSelectorWrapper implements TokenSelector {
++
++ private Map selectors = new HashMap();
++ private TokenSelector defaultSelector;
++
++ /** Expert: create a PerFieldTokenSelector with given default selector (null means select all) */
++ public PerFieldTokenSelectorWrapper(TokenSelector defaultSelector) {
++ this.defaultSelector = defaultSelector;
++ }
++
++ /** Add a token selector for the named field */
++ public void addSelector(String fieldName, TokenSelector selector) {
++ selectors.put(fieldName, selector);
++ }
++
++ /** Determine if token is accepted by fieldName */
++ public boolean accept(String fieldName, Token token) {
++ TokenSelector selector = (TokenSelector) selectors.get(fieldName);
++ if (selector!=null)
++ return selector.accept(fieldName, token);
++ else if (defaultSelector!=null)
++ return defaultSelector.accept(fieldName, token);
++ else
++ return true;
++ }
++
++}
+\ No newline at end of file
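
As a sketch of how the wrapper could be wired up (the field name "body" and the LowerCaseTokenSelector from the previous sketch are illustrative assumptions):

    import org.apache.lucene.analysis.PerFieldTokenSelectorWrapper;

    // Accept every token by default (null default selector), but apply the
    // hypothetical lower-case rule from the earlier sketch to "body" only.
    PerFieldTokenSelectorWrapper selectors = new PerFieldTokenSelectorWrapper(null);
    selectors.addSelector("body", new LowerCaseTokenSelector());
    // The wrapper is itself a TokenSelector, so it can be passed wherever one
    // is expected, e.g. to the IndexWriter setters introduced later in this patch.
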
+Index: src/java/org/apache/lucene/index/Writable.java.orig
+===================================================================
+--- src/java/org/apache/lucene/index/Writable.java.orig (revision 0)
++++ src/java/org/apache/lucene/index/Writable.java.orig (revision 0)
+@@ -0,0 +1,248 @@
++/*
++ * Writable.java
++ *
++ * Created on April 28, 2006, 6:10 PM
++ *
++ */
++
++package org.apache.lucene.index;
++
++import java.io.IOException;
++import java.io.PrintStream;
++import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
++import org.apache.lucene.document.Document;
++import org.apache.lucene.search.Similarity;
++import org.apache.lucene.store.Directory;
++
++/**
++ * An interface that abstracts index writers (e.g., IndexWriter, ParallelWriter)
++ *
++ * @author Chuck Williams
++ */
++public interface Writable {
++
++ /**
++ * Adds a document to this index. If the document contains more than
++ * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
++ * discarded.
++ */
++ public void addDocument(Document doc) throws IOException;
++
++ /**
++ * Adds a document to this index, using the provided analyzer instead of the
++ * value of {@link #getAnalyzer()}. If the document contains more than
++ * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
++ * discarded.
++ */
++ public void addDocument(Document doc, Analyzer analyzer) throws IOException;
++
++ /**
++ * Returns the number of documents currently in this index.
++ */
++ public int docCount();
++
++ /**
++ * Merges all segments together into a single segment, optimizing an index
++ * for search.
++ */
++ public void optimize() throws IOException;
++
++ /**
++ * Flushes all changes to an index and closes all associated files.
++ */
++ public void close() throws IOException;
++
++ /**
++ * Returns the analyzer used by this index.
++ */
++ public Analyzer getAnalyzer();
++
++
++ /**
++ * Setting to turn on usage of a compound file. When on, multiple files
++ * for each segment are merged into a single file once the segment creation
++ * is finished. This is done regardless of what directory is in use.
++ */
++ public void setUseCompoundFile(boolean value);
++
++ /**
++ * Get the current setting of whether to use the compound file format.
++ * Note that this just returns the value you set with setUseCompoundFile(boolean)
++ * or the default. You cannot use this to query the status of an existing index.
++ *
++ * @see #setUseCompoundFile(boolean)
++ */
++ public boolean getUseCompoundFile();
++
++ /**
++ * Expert: Set the Similarity implementation used by this IndexWriter.
++ *
++ * @see Similarity#setDefault(Similarity)
++ */
++ public void setSimilarity(Similarity similarity);
++
++ /**
++ * Expert: Return the Similarity implementation used by this IndexWriter.
++ *
++ * <p>This defaults to the current value of {@link Similarity#getDefault()}.
++ */
++ public Similarity getSimilarity();
++
++ /**
++ * Expert: Set the interval between indexed terms. Large values cause less
++ * memory to be used by IndexReader, but slow random-access to terms. Small
++ * values cause more memory to be used by an IndexReader, and speed
++ * random-access to terms.
++ *
++ * This parameter determines the amount of computation required per query
++ * term, regardless of the number of documents that contain that term. In
++ * particular, it is the maximum number of other terms that must be
++ * scanned before a term is located and its frequency and position information
++ * may be processed. In a large index with user-entered query terms, query
++ * processing time is likely to be dominated not by term lookup but rather
++ * by the processing of frequency and positional data. In a small index
++ * or when many uncommon query terms are generated (e.g., by wildcard
++ * queries) term lookup may become a dominant cost.
++ *
++ * In particular, <code>numUniqueTerms/interval</code> terms are read into
++ * memory by an IndexReader, and, on average, <code>interval/2</code> terms
++ * must be scanned for each random term access.
++ *
++ * @see #DEFAULT_TERM_INDEX_INTERVAL
++ */
++ public void setTermIndexInterval(int interval);
++
++ /**
++ * Expert: Return the interval between indexed terms.
++ *
++ * @see #setTermIndexInterval(int)
++ */
++ public int getTermIndexInterval();
++
++ /**
++ * Determines the minimal number of documents required before the buffered
++ * in-memory documents are merged and a new Segment is created.
++ * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
++ * a large value gives faster indexing. At the same time, mergeFactor limits
++ * the number of files open in a FSDirectory.
++ *
++ * <p> The default value is 10.
++ *
++ *
++ * @throws IllegalArgumentException if maxBufferedDocs is smaller than 2
++ */
++ public void setMaxBufferedDocs(int maxBufferedDocs);
++
++ /**
++ *
++ *
++ * @see #setMaxBufferedDocs
++ */
++ public int getMaxBufferedDocs();
++
++ /**
++ * The maximum number of terms that will be indexed for a single field in a
++ * document. This limits the amount of memory required for indexing, so that
++ * collections with very large files will not crash the indexing process by
++ * running out of memory.<p/>
++ * Note that this effectively truncates large documents, excluding from the
++ * index terms that occur further in the document. If you know your source
++ * documents are large, be sure to set this value high enough to accommodate
++ * the expected size. If you set it to Integer.MAX_VALUE, then the only limit
++ * is your memory, but you should anticipate an OutOfMemoryError.<p/>
++ * By default, no more than 10,000 terms will be indexed for a field.
++ */
++ public void setMaxFieldLength(int maxFieldLength);
++
++ /**
++ *
++ *
++ * @see #setMaxFieldLength
++ */
++ public int getMaxFieldLength();
++
++ /**
++ * Determines the largest number of documents ever merged by addDocument().
++ * Small values (e.g., less than 10,000) are best for interactive indexing,
++ * as this limits the length of pauses while indexing to a few seconds.
++ * Larger values are best for batched indexing and speedier searches.
++ *
++ * <p>The default value is {@link Integer#MAX_VALUE}.
++ */
++ public void setMaxMergeDocs(int maxMergeDocs);
++
++ /**
++ *
++ *
++ * @see #setMaxMergeDocs
++ */
++ public int getMaxMergeDocs();
++
++ /**
++ * Determines how often segment indices are merged by addDocument(). With
++ * smaller values, less RAM is used while indexing, and searches on
++ * unoptimized indices are faster, but indexing speed is slower. With larger
++ * values, more RAM is used during indexing, and while searches on unoptimized
++ * indices are slower, indexing is faster. Thus larger values (> 10) are best
++ * for batch index creation, and smaller values (< 10) for indices that are
++ * interactively maintained.
++ *
++ * <p>This must never be less than 2. The default value is 10.
++ */
++ public void setMergeFactor(int mergeFactor);
++
++ /**
++ *
++ *
++ * @see #setMergeFactor
++ */
++ public int getMergeFactor();
++
++ /**
++ * Sets the maximum time to wait for a write lock (in milliseconds).
++ */
++ public void setWriteLockTimeout(long writeLockTimeout);
++
++ /**
++ *
++ *
++ * @see #setWriteLockTimeout
++ */
++ public long getWriteLockTimeout();
++
++ /**
++ * Sets the maximum time to wait for a commit lock (in milliseconds).
++ */
++ public void setCommitLockTimeout(long commitLockTimeout);
++
++ /**
++ *
++ *
++ * @see #setCommitLockTimeout
++ */
++ public long getCommitLockTimeout();
++
++ /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors.
++ * @param selector the term vector TokenSelector
++ */
++ public void setTermVectorTokenSelector(TokenSelector selector);
++
++ /** Expert: Get the TokenSelector used to determine the subset of tokens stored in term vectors.
++ * @return the TokenSelector used to determine term vector tokens
++ */
++ public TokenSelector getTermVectorTokenSelector();
++
++ /** If non-null, information about merges and a message when
++ * maxFieldLength is reached will be printed to this.
++ */
++ public void setInfoStream(PrintStream infoStream);
++
++ /**
++ *
++ *
++ * @see #setInfoStream
++ */
++ public PrintStream getInfoStream();
++
++}
+Index: src/java/org/apache/lucene/index/IndexWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/IndexWriter.java (revision 414705)
++++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
+@@ -17,6 +17,7 @@
+ */
+
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.document.Document;
+ import org.apache.lucene.search.Similarity;
+ import org.apache.lucene.store.Directory;
+@@ -56,7 +57,7 @@
+ @see IndexModifier IndexModifier supports the important methods of IndexWriter plus deletion
+ */
+
+-public class IndexWriter {
++public class IndexWriter implements Writable {
+
+ /**
+ * Default value for the write lock timeout (1,000).
+@@ -100,8 +101,10 @@
+ */
+ public final static int DEFAULT_TERM_INDEX_INTERVAL = 128;
+
+- private Directory directory; // where this index resides
+- private Analyzer analyzer; // how to analyze text
++ private Directory directory; // where this index resides
++ private Analyzer analyzer; // how to analyze text
++ private TokenSelector termVectorTokenSelector; // subset of token stream stored in term vectors
++ private TokenSelector positionsTokenSelector; // subset of token stream for which positions are stored
+
+ private Similarity similarity = Similarity.getDefault(); // how to normalize
+
+@@ -153,6 +156,38 @@
+ return this.similarity;
+ }
+
++ /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors.
++ * @param selector the term vector TokenSelector
++ */
++ public void setTermVectorTokenSelector(TokenSelector selector) {
++ this.termVectorTokenSelector = selector;
++ }
++
++ /** Expert: Get the TokenSelector used to determine the subset of tokens stored in term vectors.
++ * @return the TokenSelector used to determine term vector tokens
++ */
++ public TokenSelector getTermVectorTokenSelector() {
++ return termVectorTokenSelector;
++ }
++
++ /** Expert: Set the TokenSelector used to determine subset of tokens for which positions are stored.
++ * (At least one position is always stored for each term in each doc to ensure the term stays in
++ * the index so long as any docs reference it)
++ * @param selector the positions TokenSelector
++ */
++ public void setPositionsTokenSelector(TokenSelector selector) {
++ this.positionsTokenSelector = selector;
++ }
++
++ /** Expert: Get the TokenSelector used to determine the subset of tokens for which freq and positions are stored.
++ * (At least one position is always stored for each term in each doc to ensure the term stays in
++ * the index so long as any docs reference it)
++ * @return the positions TokenSelector
++ */
++ public TokenSelector getPositionsTokenSelector() {
++ return positionsTokenSelector;
++ }
++
+ /** Expert: Set the interval between indexed terms. Large values cause less
+ * memory to be used by IndexReader, but slow random-access to terms. Small
+ * values cause more memory to be used by an IndexReader, and speed
+@@ -471,6 +506,8 @@
+ public void addDocument(Document doc, Analyzer analyzer) throws IOException {
+ DocumentWriter dw =
+ new DocumentWriter(ramDirectory, analyzer, this);
++ dw.setTermVectorTokenSelector(termVectorTokenSelector);
++ dw.setPositionsTokenSelector(positionsTokenSelector);
+ dw.setInfoStream(infoStream);
+ String segmentName = newSegmentName();
+ dw.addDocument(segmentName, doc);
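
A sketch of how the two new IndexWriter setters might be used; the directory, analyzer, field values, and the LowerCaseTokenSelector from the earlier sketch are illustrative assumptions, not part of the patch.

    import java.io.IOException;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class SelectorExample {
      public static void main(String[] args) throws IOException {
        IndexWriter writer = new IndexWriter(new RAMDirectory(), new StandardAnalyzer(), true);
        // Both selectors are optional; leaving them unset keeps the existing
        // behaviour of storing every token's positions and term-vector entry.
        writer.setTermVectorTokenSelector(new LowerCaseTokenSelector());
        writer.setPositionsTokenSelector(new LowerCaseTokenSelector());

        Document doc = new Document();
        doc.add(new Field("body", "The quick brown Fox", Field.Store.NO,
                          Field.Index.TOKENIZED, Field.TermVector.YES));
        writer.addDocument(doc);  // capitalized tokens are omitted from the term vector
        writer.close();
      }
    }
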
+Index: src/java/org/apache/lucene/index/Writable.java
+===================================================================
+--- src/java/org/apache/lucene/index/Writable.java (revision 0)
++++ src/java/org/apache/lucene/index/Writable.java (revision 0)
+@@ -0,0 +1,262 @@
++/*
++ * Writable.java
++ *
++ * Created on April 28, 2006, 6:10 PM
++ *
++ */
++
++package org.apache.lucene.index;
++
++import java.io.IOException;
++import java.io.PrintStream;
++import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
++import org.apache.lucene.document.Document;
++import org.apache.lucene.search.Similarity;
++import org.apache.lucene.store.Directory;
++
++/**
++ * An interface that abstracts index writers (e.g., IndexWriter, ParallelWriter)
++ *
++ * @author Chuck Williams
++ */
++public interface Writable {
++
++ /**
++ * Adds a document to this index. If the document contains more than
++ * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
++ * discarded.
++ */
++ public void addDocument(Document doc) throws IOException;
++
++ /**
++ * Adds a document to this index, using the provided analyzer instead of the
++ * value of {@link #getAnalyzer()}. If the document contains more than
++ * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
++ * discarded.
++ */
++ public void addDocument(Document doc, Analyzer analyzer) throws IOException;
++
++ /**
++ * Returns the number of documents currently in this index.
++ */
++ public int docCount();
++
++ /**
++ * Merges all segments together into a single segment, optimizing an index
++ * for search.
++ */
++ public void optimize() throws IOException;
++
++ /**
++ * Flushes all changes to an index and closes all associated files.
++ */
++ public void close() throws IOException;
++
++ /**
++ * Returns the analyzer used by this index.
++ */
++ public Analyzer getAnalyzer();
++
++
++ /**
++ * Setting to turn on usage of a compound file. When on, multiple files
++ * for each segment are merged into a single file once the segment creation
++ * is finished. This is done regardless of what directory is in use.
++ */
++ public void setUseCompoundFile(boolean value);
++
++ /**
++ * Get the current setting of whether to use the compound file format.
++ * Note that this just returns the value you set with setUseCompoundFile(boolean)
++ * or the default. You cannot use this to query the status of an existing index.
++ *
++ * @see #setUseCompoundFile(boolean)
++ */
++ public boolean getUseCompoundFile();
++
++ /**
++ * Expert: Set the Similarity implementation used by this IndexWriter.
++ *
++ * @see Similarity#setDefault(Similarity)
++ */
++ public void setSimilarity(Similarity similarity);
++
++ /**
++ * Expert: Return the Similarity implementation used by this IndexWriter.
++ *
++ * <p>This defaults to the current value of {@link Similarity#getDefault()}.
++ */
++ public Similarity getSimilarity();
++
++ /**
++ * Expert: Set the interval between indexed terms. Large values cause less
++ * memory to be used by IndexReader, but slow random-access to terms. Small
++ * values cause more memory to be used by an IndexReader, and speed
++ * random-access to terms.
++ *
++ * This parameter determines the amount of computation required per query
++ * term, regardless of the number of documents that contain that term. In
++ * particular, it is the maximum number of other terms that must be
++ * scanned before a term is located and its frequency and position information
++ * may be processed. In a large index with user-entered query terms, query
++ * processing time is likely to be dominated not by term lookup but rather
++ * by the processing of frequency and positional data. In a small index
++ * or when many uncommon query terms are generated (e.g., by wildcard
++ * queries) term lookup may become a dominant cost.
++ *
++ * In particular, <code>numUniqueTerms/interval</code> terms are read into
++ * memory by an IndexReader, and, on average, <code>interval/2</code> terms
++ * must be scanned for each random term access.
++ *
++ * @see #DEFAULT_TERM_INDEX_INTERVAL
++ */
++ public void setTermIndexInterval(int interval);
++
++ /**
++ * Expert: Return the interval between indexed terms.
++ *
++ * @see #setTermIndexInterval(int)
++ */
++ public int getTermIndexInterval();
++
++ /**
++ * Determines the minimal number of documents required before the buffered
++ * in-memory documents are merged and a new Segment is created.
++ * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
++ * a large value gives faster indexing. At the same time, mergeFactor limits
++ * the number of files open in a FSDirectory.
++ *
++ * <p> The default value is 10.
++ *
++ *
++ * @throws IllegalArgumentException if maxBufferedDocs is smaller than 2
++ */
++ public void setMaxBufferedDocs(int maxBufferedDocs);
++
++ /**
++ *
++ *
++ * @see #setMaxBufferedDocs
++ */
++ public int getMaxBufferedDocs();
++
++ /**
++ * The maximum number of terms that will be indexed for a single field in a
++ * document. This limits the amount of memory required for indexing, so that
++ * collections with very large files will not crash the indexing process by
++ * running out of memory.<p/>
++ * Note that this effectively truncates large documents, excluding from the
++ * index terms that occur further in the document. If you know your source
++ * documents are large, be sure to set this value high enough to accommodate
++ * the expected size. If you set it to Integer.MAX_VALUE, then the only limit
++ * is your memory, but you should anticipate an OutOfMemoryError.<p/>
++ * By default, no more than 10,000 terms will be indexed for a field.
++ */
++ public void setMaxFieldLength(int maxFieldLength);
++
++ /**
++ *
++ *
++ * @see #setMaxFieldLength
++ */
++ public int getMaxFieldLength();
++
++ /**
++ * Determines the largest number of documents ever merged by addDocument().
++ * Small values (e.g., less than 10,000) are best for interactive indexing,
++ * as this limits the length of pauses while indexing to a few seconds.
++ * Larger values are best for batched indexing and speedier searches.
++ *
++ * <p>The default value is {@link Integer#MAX_VALUE}.
++ */
++ public void setMaxMergeDocs(int maxMergeDocs);
++
++ /**
++ *
++ *
++ * @see #setMaxMergeDocs
++ */
++ public int getMaxMergeDocs();
++
++ /**
++ * Determines how often segment indices are merged by addDocument(). With
++ * smaller values, less RAM is used while indexing, and searches on
++ * unoptimized indices are faster, but indexing speed is slower. With larger
++ * values, more RAM is used during indexing, and while searches on unoptimized
++ * indices are slower, indexing is faster. Thus larger values (> 10) are best
++ * for batch index creation, and smaller values (< 10) for indices that are
++ * interactively maintained.
++ *
++ * <p>This must never be less than 2. The default value is 10.
++ */
++ public void setMergeFactor(int mergeFactor);
++
++ /**
++ *
++ *
++ * @see #setMergeFactor
++ */
++ public int getMergeFactor();
++
++ /**
++ * Sets the maximum time to wait for a write lock (in milliseconds).
++ */
++ public void setWriteLockTimeout(long writeLockTimeout);
++
++ /**
++ *
++ *
++ * @see #setWriteLockTimeout
++ */
++ public long getWriteLockTimeout();
++
++ /**
++ * Sets the maximum time to wait for a commit lock (in milliseconds).
++ */
++ public void setCommitLockTimeout(long commitLockTimeout);
++
++ /**
++ *
++ *
++ * @see #setCommitLockTimeout
++ */
++ public long getCommitLockTimeout();
++
++ /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors.
++ * @param selector the term vector TokenSelector
++ */
++ public void setTermVectorTokenSelector(TokenSelector selector);
++
++ /** Expert: Get the TokenSelector used to determine the subset of tokens stored in term vectors.
++ * @return the TokenSelector used to determine term vector tokens
++ */
++ public TokenSelector getTermVectorTokenSelector();
++
++ /** Expert: Set the TokenSelector used to determine subset of tokens for which positions are stored.
++ * (At least one position is always stored for each term in each doc to ensure the term stays in
++ * the index so long as any docs reference it)
++ * @param selector the positions TokenSelector
++ */
++ public void setPositionsTokenSelector(TokenSelector selector);
++
++ /** Expert: Get the TokenSelector used to determine the subset of tokens for which freq and positions are stored.
++ * (At least one position is always stored for each term in each doc to ensure the term stays in
++ * the index so long as any docs reference it)
++ * @return the positions TokenSelector
++ */
++ public TokenSelector getPositionsTokenSelector();
++
++ /** If non-null, information about merges and a message when
++ * maxFieldLength is reached will be printed to this.
++ */
++ public void setInfoStream(PrintStream infoStream);
++
++ /**
++ *
++ *
++ * @see #setInfoStream
++ */
++ public PrintStream getInfoStream();
++
++}
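
Because both IndexWriter and ParallelWriter implement Writable, application code can be written against the interface rather than a concrete writer. A minimal, hypothetical helper (not part of the patch) sketches this:

    import java.io.IOException;
    import java.util.List;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.Writable;

    public final class IndexUtil {
      // Works with any Writable: an IndexWriter or a ParallelWriter.
      public static void addAll(Writable writer, List<Document> docs) throws IOException {
        for (Document doc : docs)
          writer.addDocument(doc);
        writer.optimize();
      }
    }
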
+Index: src/java/org/apache/lucene/index/DocumentWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/DocumentWriter.java (revision 414705)
++++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy)
+@@ -17,6 +17,7 @@
+ */
+
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.analysis.Token;
+ import org.apache.lucene.analysis.TokenStream;
+ import org.apache.lucene.document.Document;
+@@ -35,6 +36,8 @@
+
+ final class DocumentWriter {
+ private Analyzer analyzer;
++ private TokenSelector termVectorTokenSelector;
++ private TokenSelector positionsTokenSelector;
+ private Directory directory;
+ private Similarity similarity;
+ private FieldInfos fieldInfos;
+@@ -142,9 +145,9 @@
+ if (!field.isTokenized()) { // un-tokenized field
+ String stringValue = field.stringValue();
+ if(field.isStoreOffsetWithTermVector())
+- addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
++ addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()), false, false);
+ else
+- addPosition(fieldName, stringValue, position++, null);
++ addPosition(fieldName, stringValue, position++, null, false, false);
+ offset += stringValue.length();
+ length++;
+ } else
+@@ -165,10 +168,16 @@
+ for (Token t = stream.next(); t != null; t = stream.next()) {
+ position += (t.getPositionIncrement() - 1);
+
+- if(field.isStoreOffsetWithTermVector())
+- addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
+- else
+- addPosition(fieldName, t.termText(), position++, null);
++ boolean omittv = false, omitpos = false;
++ if (termVectorTokenSelector!=null && !termVectorTokenSelector.accept(field.name(), t))
++ omittv = true;
++ if (positionsTokenSelector!=null && !positionsTokenSelector.accept(field.name(), t))
++ omitpos = true;
++
++ addPosition(fieldName, t.termText(), position++,
++ field.isStoreOffsetWithTermVector() && !omittv ? new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())
++ : null,
++ omittv, omitpos);
+
+ lastToken = t;
+ if (++length > maxFieldLength) {
+@@ -196,20 +205,24 @@
+
+ private final Term termBuffer = new Term("", ""); // avoid consing
+
+- private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
++ private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset,
++ boolean omitFromTermVector, boolean omitPosition) {
+ termBuffer.set(field, text);
+ //System.out.println("Offset: " + offset);
+ Posting ti = (Posting) postingTable.get(termBuffer);
+ if (ti != null) { // word seen before
+ int freq = ti.freq;
+- if (ti.positions.length == freq) { // positions array is full
+- int[] newPositions = new int[freq * 2]; // double size
+- int[] positions = ti.positions;
+- for (int i = 0; i < freq; i++) // copy old positions to new
+- newPositions[i] = positions[i];
+- ti.positions = newPositions;
++
++ if (!omitPosition) {
++ if (ti.positions.length == freq) { // positions array is full
++ int[] newPositions = new int[freq * 2]; // double size
++ int[] positions = ti.positions;
++ for (int i = 0; i < freq; i++) // copy old positions to new
++ newPositions[i] = positions[i];
++ ti.positions = newPositions;
++ }
++ ti.positions[freq] = position; // add new position
+ }
+- ti.positions[freq] = position; // add new position
+
+ if (offset != null) {
+ if (ti.offsets.length == freq){
+@@ -223,10 +236,12 @@
+ }
+ ti.offsets[freq] = offset;
+ }
+- ti.freq = freq + 1; // update frequency
+- } else { // word not seen before
++
++ if (!omitPosition)
++ ti.freq = freq + 1; // update frequency
++ } else { // word not seen before
+ Term term = new Term(field, text, false);
+- postingTable.put(term, new Posting(term, position, offset));
++ postingTable.put(term, new Posting(term, position, offset, omitFromTermVector));
+ }
+ }
+
+@@ -351,7 +366,7 @@
+ termVectorWriter.closeField();
+ }
+ }
+- if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
++ if (termVectorWriter != null && termVectorWriter.isFieldOpen() && !posting.omitFromTermVector) {
+ termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
+ }
+ }
+@@ -390,6 +405,16 @@
+ this.infoStream = infoStream;
+ }
+
++ /** If non-null, this will be used to select which tokens are stored in term vectors */
++ void setTermVectorTokenSelector(TokenSelector selector) {
++ this.termVectorTokenSelector = selector;
++ }
++
++ /** If non-null, this will be used to select which tokens have positions stored in the index. */
++ void setPositionsTokenSelector(TokenSelector selector) {
++ this.positionsTokenSelector = selector;
++ }
++
+ }
+
+ final class Posting { // info about a Term in a doc
+@@ -397,17 +422,17 @@
+ int freq; // its frequency in doc
+ int[] positions; // positions it occurs at
+ TermVectorOffsetInfo [] offsets;
++ boolean omitFromTermVector; // if true, omit from term vector
+
+- Posting(Term t, int position, TermVectorOffsetInfo offset) {
++ Posting(Term t, int position, TermVectorOffsetInfo offset, boolean omitFromTermVector) {
+ term = t;
+ freq = 1;
+ positions = new int[1];
+ positions[0] = position;
+- if(offset != null){
+- offsets = new TermVectorOffsetInfo[1];
+- offsets[0] = offset;
++ if(offset != null) {
++ offsets = new TermVectorOffsetInfo[1];
++ offsets[0] = offset;
+ }
+- else
+- offsets = null;
++ this.omitFromTermVector = omitFromTermVector;
+ }
+ }
+Index: src/java/org/apache/lucene/index/ParallelWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/ParallelWriter.java (revision 0)
++++ src/java/org/apache/lucene/index/ParallelWriter.java (revision 0)
+@@ -0,0 +1,345 @@
++/*
++ * ParallelWriter.java
++ *
++ * Created on April 28, 2006, 7:07 PM
++ *
++ */
++
++package org.apache.lucene.index;
++
++import java.io.IOException;
++import java.io.PrintStream;
++import java.util.Enumeration;
++import java.util.HashMap;
++import java.util.List;
++import java.util.Map;
++import java.util.concurrent.CountDownLatch;
++import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
++import org.apache.lucene.document.Document;
++import org.apache.lucene.document.Field;
++import org.apache.lucene.search.Similarity;
++import org.apache.lucene.store.Directory;
++
++/**
++ * ParallelWriter is a companion to ParallelReader, although as with IndexWriter it only supports indexes stored in a Directory.
++ * The interface is at the field level. A map from directories to lists of fields is provided to create the ParallelWriter,
++ * which then creates an IndexWriter for each specified directory and operates on each field of a document using the IndexWriter
++ * for the directory to which that field is mapped. This mapping allows an application to configure its use of parallel sub-
++ * indexes independently from the rest of its processing.
++ *
++ * This implementation single-threads calls to addDocument(), but does the sub-document writes in parallel. Users of this class
++ * must ensure that the ParallelReader is never reopened while adding a new document, and must deal with recovery if exceptions
++ * occur while adding a document.
++ *
++ * @author Chuck Williams
++ */
++public class ParallelWriter implements Writable {
++
++ IndexWriter[] writers; // All IndexWriters
++ IOException exception; // If any writer gets an exception, this is stored here (only one needed)
++ Map<String,IndexWriter> writerMap; // Field name --> IndexWriter that stores that field
++ IndexWriter oneWriter; // An arbitrarily chosen IndexWriter -- used to get config info which is the same for all IndexWriters
++ Analyzer analyzer; // The Analyzer applied to all tokenized field content
++
++ private static final Document EMPTY_DOCUMENT = new Document(); // Empty document used to sync doc id's when a document is added without fields for all indexes
++
++ /**
++ * Create a new ParallelWriter
++ *
++ * @param directoryFieldsMap specifies the directory to use to store each field, multiple directories creating parallel indexes
++ * @param analyzer applied to all tokenized field content
++ * @param create create new indexes in directories iff true
++ * @throws IOException if the IndexWriters cannot be created
++ */
++ public ParallelWriter(Map<Directory,List<String>> directoryFieldsMap, Analyzer analyzer, boolean create) throws IOException {
++ this.analyzer = analyzer;
++ writers = new IndexWriter[directoryFieldsMap.size()];
++ writerMap = new HashMap<String,IndexWriter>(directoryFieldsMap.size()*5/3);
++ int i=0;
++ for (Map.Entry<Directory,List<String>> entry : directoryFieldsMap.entrySet()) {
++ IndexWriter writer = new IndexWriter(entry.getKey(), analyzer, create);
++ writers[i++] = oneWriter = writer;
++ for (String field : entry.getValue())
++ writerMap.put(field, writer);
++ }
++ }
++
++ /** Invert a directoryFieldsMap
++ * @param directoryFieldsMap a map for directories to lists of fields they contain
++ * @return a map from each field to its directory
++ */
++ public static Map<String, Directory> invertDirectoryFieldsMap(Map<Directory,List<String>> directoryFieldsMap) {
++ Map<String, Directory> fieldDirectoryMap = new HashMap<String, Directory>();
++ for (Map.Entry<Directory, List<String>> entry : directoryFieldsMap.entrySet())
++ for (String field : entry.getValue())
++ fieldDirectoryMap.put(field, entry.getKey());
++ return fieldDirectoryMap;
++ }
++
++ /** Add document to this index by adding subdocuments with the mapped fields for each parallel index. This method is synchronized because the
++ * parallel indexes must be maintained such that equal doc id's in different indexes hold fields for the same document.
++ * This synchronization could have a negative effect on batch indexing performance. Users of this method must ensure that the ParallelReader
++ * is not re-opened within the scope of this method as it would likely find the sub-indexes out of sync.
++ * @param doc the document to add
++ * @throws IOException if there are problems writing the indexes. <strong>WARNING: If this happens it is bad.</strong> The doc-id's in the
++ * indexes are likely out of sync. This situation requires repair to resync the doc ids in each document set. Possible
++ * repair actions include rebuilding the indexes or deleting documents at the end to restore equal document sets and then
++ * optimizing to restore equal doc ids.
++ * @throws RuntimeException if the threads writing to the sub-indexes are interrupted.
++ */
++ public void addDocument(Document doc) throws IOException {
++ addDocument(doc, analyzer);
++ }
++
++ /** Add document to this index by adding subdocuments with the mapped fields for each parallel index. This method is synchronized because the
++ * parallel indexes must be maintained such that equal doc id's in different indexes hold fields for the same document.
++ * This synchronization could have a negative effect on batch indexing performance. Users of this method must ensure that the ParallelReader
++ * is not re-opened within the scope of this method as it would likely find the sub-indexes out of sync.
++ * @param doc the document to add
++ * @param analyzer apply special analyzer to this document rather than the one for the index (discouraged -- use addDocument(doc))
++ * @throws IOException if there are problems writing the indexes. <strong>WARNING: If this happens it is bad.</strong> The doc-id's in the
++ * indexes are likely out of sync. This situation requires repair to resync the doc ids in each document set. Possible
++ * repair actions include rebuilding the indexes or deleting documents at the end to restore equal document sets and then
++ * optimizing to restore equal doc ids.
++ * @throws RuntimeException if the threads writing to the sub-indexes are interrupted.
++ */
++ public synchronized void addDocument(Document doc, Analyzer analyzer) throws IOException {
++ Map<IndexWriter,Document> documentMap = new HashMap<IndexWriter,Document>(writers.length*5/3);
++ Enumeration<Field> fields = doc.fields();
++ while (fields.hasMoreElements()) {
++ Field field = fields.nextElement();
++ IndexWriter writer = writerMap.get(field.name());
++ if (writer==null)
++ throw new RuntimeException(new UnknownFieldException("Unregistered field: " + field.name()));
++ Document subdoc = documentMap.get(writer);
++ if (subdoc==null)
++ documentMap.put(writer, subdoc = new Document());
++ subdoc.add(field);
++ }
++ CountDownLatch latch = new CountDownLatch(writers.length);
++ exception = null;
++ for (IndexWriter writer : writers) {
++ Document subdoc = documentMap.get(writer);
++ if (subdoc==null) // Must have a document in each parallel index to sync doc id's
++ subdoc = EMPTY_DOCUMENT;
++ new Thread(new WriterWorker(writer, subdoc, latch)).start(); // start(), not run(), so the sub-index writes proceed in parallel
++ }
++ try {
++ latch.await();
++ } catch (InterruptedException e) {
++ throw new RuntimeException("Interrupted while writing subdocuments!", e);
++ }
++ if (exception != null)
++ throw exception;
++ }
++
++ // Write a sub-documents to a sub-index and record any exception
++ private class WriterWorker implements Runnable {
++
++ private IndexWriter writer;
++ private Document document;
++ private CountDownLatch latch;
++
++ private WriterWorker(IndexWriter writer, Document document, CountDownLatch latch) {
++ this.writer = writer;
++ this.document = document;
++ this.latch = latch;
++ }
++
++ public void run() {
++ try {
++ writer.addDocument(document);
++ } catch (IOException e) {
++ exception = e;
++ } finally {
++ latch.countDown();
++ }
++ }
++
++ }
++
++ /** Obtain the number of documents in this index, which is the same for each parallel index. */
++ public int docCount() {
++ return oneWriter.docCount();
++ }
++
++ /** Optimize all parallel indexes. This is synchronized to keep all index doc-id's synced up */
++ public synchronized void optimize() throws IOException {
++ for (IndexWriter writer : writers)
++ writer.optimize();
++ }
++
++ /** Close all parallel indexes. Note that the provided directories are not closed. Synchronized. */
++ public synchronized void close() throws IOException {
++ for (IndexWriter writer : writers)
++ writer.close();
++ }
++
++ /** Getter for analyzer provided to the constructor */
++ public Analyzer getAnalyzer() {
++ return analyzer;
++ }
++
++ /** Set whether or not to use compound file format in every parallel index */
++ public void setUseCompoundFile(boolean value) {
++ for (IndexWriter writer : writers)
++ writer.setUseCompoundFile(value);
++ }
++
++ /** Get the compound file usage decision, same for every parallel index */
++ public boolean getUseCompoundFile() {
++ return oneWriter.getUseCompoundFile();
++ }
++
++ /** Set similarity to use for every parallel index */
++ public void setSimilarity(Similarity similarity) {
++ for (IndexWriter writer : writers)
++ writer.setSimilarity(similarity);
++ }
++
++ /** Get similarity, which is used by every parallel index */
++ public Similarity getSimilarity() {
++ return oneWriter.getSimilarity();
++ }
++
++ /** Set the termIndexInterval used for every parallel index */
++ public void setTermIndexInterval(int interval) {
++ for (IndexWriter writer : writers)
++ writer.setTermIndexInterval(interval);
++ }
++
++ /** Get the termIndexInterval, which is used by every parallel index */
++ public int getTermIndexInterval() {
++ return oneWriter.getTermIndexInterval();
++ }
++
++ /** Set maxBufferedDocs for every parallel index */
++ public void setMaxBufferedDocs(int maxBufferedDocs) {
++ for (IndexWriter writer : writers)
++ writer.setMaxBufferedDocs(maxBufferedDocs);
++ }
++
++ /** get maxBufferedDocs, same for every parallel index */
++ public int getMaxBufferedDocs() {
++ return oneWriter.getMaxBufferedDocs();
++ }
++
++ /** Set maxFieldLength to use for every parallel index */
++ public void setMaxFieldLength(int maxFieldLength) {
++ for (IndexWriter writer : writers)
++ writer.setMaxFieldLength(maxFieldLength);
++ }
++
++ /** Get maxFieldLength, same for every parallel index */
++ public int getMaxFieldLength() {
++ return oneWriter.getMaxFieldLength();
++ }
++
++ /** Set maxMergeDocs for every parallel index */
++ public void setMaxMergeDocs(int maxMergeDocs) {
++ for (IndexWriter writer : writers)
++ writer.setMaxMergeDocs(maxMergeDocs);
++ }
++
++ /** Get max merge docs, same for every parallel index */
++ public int getMaxMergeDocs() {
++ return oneWriter.getMaxMergeDocs();
++ }
++
++ /** Set merge factor for every parallel index */
++ public void setMergeFactor(int mergeFactor) {
++ for (IndexWriter writer : writers)
++ writer.setMergeFactor(mergeFactor);
++ }
++
++ /** Get merge factor, same for every parallel index */
++ public int getMergeFactor() {
++ return oneWriter.getMergeFactor();
++ }
++
++ /** Set write lock timeout (millis) for every parallel index */
++ public void setWriteLockTimeout(long writeLockTimeout) {
++ for (IndexWriter writer : writers)
++ writer.setWriteLockTimeout(writeLockTimeout);
++ }
++
++ /** Get write lock timeout, same for every parallel index */
++ public long getWriteLockTimeout() {
++ return oneWriter.getWriteLockTimeout();
++ }
++
++ /** Set commit lock timeout for every parallel index */
++ public void setCommitLockTimeout(long commitLockTimeout) {
++ for (IndexWriter writer : writers)
++ writer.setCommitLockTimeout(commitLockTimeout);
++ }
++
++ /** Get commit lock timeout, same for every parallel index */
++ public long getCommitLockTimeout() {
++ return oneWriter.getCommitLockTimeout();
++ }
++
++ /** Set term vector TokenSelector for every parallel index */
++ public void setTermVectorTokenSelector(TokenSelector selector) {
++ for (IndexWriter writer : writers)
++ writer.setTermVectorTokenSelector(selector);
++ }
++
++ /** Get term vector TokenSelector, same for every parallel index */
++ public TokenSelector getTermVectorTokenSelector() {
++ return oneWriter.getTermVectorTokenSelector();
++ }
++
++ /** Set positions TokenSelector for every parallel index */
++ public void setPositionsTokenSelector(TokenSelector selector) {
++ for (IndexWriter writer : writers)
++ writer.setPositionsTokenSelector(selector);
++ }
++
++ /** Get positions TokenSelector, same for every parallel index */
++ public TokenSelector getPositionsTokenSelector() {
++ return oneWriter.getPositionsTokenSelector();
++ }
++
++ /** Unsupported. Use setInfoStream(field, infoStream) */
++ public void setInfoStream(PrintStream infoStream) {
++ throw new UnsupportedOperationException();
++ }
++
++ /** Set an info stream for the IndexWriter managing a specified field. The info stream receives information about field truncations, merges, etc.
++ * @param field the field whose writer to assign the info stream to
++ * @param infoStream the info stream
++ * @throws UnknownFieldException if field has not been associated with an IndexWriter in this index
++ */
++ public void setInfoStream(String field, PrintStream infoStream) throws UnknownFieldException {
++ IndexWriter writer = writerMap.get(field);
++ if (writer==null)
++ throw new UnknownFieldException("Unregistered field: " + field);
++ writer.setInfoStream(infoStream);
++
++ }
++
++ /** Unsupported. Use getInfoStream(field) */
++ public PrintStream getInfoStream() {
++ throw new UnsupportedOperationException();
++ }
++
++ /** Getter for info stream associated with field. See setInfoStream(field, infoStream).
++ */
++ public PrintStream getInfoStream(String field) throws UnknownFieldException {
++ IndexWriter writer = writerMap.get(field);
++ if (writer==null)
++ throw new UnknownFieldException("Unregistered field: " + field);
++ return writer.getInfoStream();
++ }
++
++ private static class UnknownFieldException extends Exception {
++
++ private UnknownFieldException(String message) {
++ super(message);
++ }
++
++ }
++
++}
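
To summarize the ParallelWriter API introduced above, here is a hedged usage sketch following the pattern in TestParallelWriter; the directories, field names, and document values are illustrative assumptions.

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.ParallelWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class ParallelWriterExample {
      public static void main(String[] args) throws IOException {
        // Each directory holds a parallel sub-index for the listed fields.
        Map<Directory, List<String>> dirFields = new HashMap<Directory, List<String>>();
        dirFields.put(new RAMDirectory(), Arrays.asList("title", "body"));
        dirFields.put(new RAMDirectory(), Arrays.asList("meta"));

        ParallelWriter writer = new ParallelWriter(dirFields, new StandardAnalyzer(), true);
        Document doc = new Document();
        doc.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED));
        writer.addDocument(doc);  // fields are routed to their mapped sub-indexes
        writer.close();
      }
    }
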
diff --git a/attachments/LUCENE-602/TokenSelectorSoloAll.patch b/attachments/LUCENE-602/TokenSelectorSoloAll.patch
new file mode 100644
index 0000000..2818b50
--- /dev/null
+++ b/attachments/LUCENE-602/TokenSelectorSoloAll.patch
@@ -0,0 +1,374 @@
+Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
+===================================================================
+--- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 414705)
++++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
+@@ -16,11 +16,15 @@
+ * limitations under the License.
+ */
+
++import java.util.LinkedList;
++import java.util.List;
+ import junit.framework.TestCase;
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.Token;
+ import org.apache.lucene.analysis.WhitespaceAnalyzer;
+ import org.apache.lucene.analysis.TokenStream;
+ import org.apache.lucene.analysis.WhitespaceTokenizer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.document.*;
+ import org.apache.lucene.search.Similarity;
+ import org.apache.lucene.store.RAMDirectory;
+@@ -54,6 +58,16 @@
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ Similarity similarity = Similarity.getDefault();
+ DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
++ writer.setTermVectorTokenSelector(new TokenSelector(){
++ public boolean accept(String field, Token t) {
++ return Character.isLowerCase(t.termText().charAt(0));
++ }
++ });
++ writer.setPositionsTokenSelector(new TokenSelector(){
++ public boolean accept(String field, Token t) {
++ return Character.isLowerCase(t.termText().charAt(0));
++ }
++ });
+ String segName = "test";
+ writer.addDocument(segName, testDoc);
+ //After adding the document, we should be able to read it back in
+@@ -84,6 +98,31 @@
+ fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
++
++ fields = doc.getFields(DocHelper.TEXT_FIELD_UTF2_KEY);
++ assertTrue(fields != null && fields.length == 1);
++ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_UTF2_TEXT));
++ assertTrue(fields[0].isTermVectorStored());
++ TermFreqVector tv = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_UTF2_KEY);
++ assertTrue(tv != null);
++ String[] words = DocHelper.FIELD_UTF2_TEXT.split("\\s+");
++ String[] tvwords = tv.getTerms();
++ List uniques = new LinkedList();
++ int omitted = 0;
++ for (int i=0; i<words.length; i++)
++ if (!uniques.contains(words[i])) {
++ uniques.add(words[i]);
++ if (!Character.isLowerCase(words[i].charAt(0)))
++ omitted++;
++ }
++ assertTrue(omitted!=0);
++ assertTrue(omitted!=uniques.size());
++ assertEquals(uniques.size()-omitted, tvwords.length);
++ for (int i=0; i<uniques.size(); i++) {
++ for (int j=0; j<tvwords.length; j++)
++ if (uniques.get(i).equals(tvwords[j]))
++ assertTrue(Character.isLowerCase(((String)uniques.get(i)).charAt(0)));
++ }
+
+ // test that the norm file is not present if omitNorms is true
+ for (int i = 0; i < reader.fieldInfos.size(); i++) {
+Index: src/java/org/apache/lucene/analysis/TokenSelector.java
+===================================================================
+--- src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0)
++++ src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0)
+@@ -0,0 +1,24 @@
++/*
++ * TokenSelector.java
++ *
++ * Created on June 13, 2006, 12:18 PM
++ *
++ */
++
++package org.apache.lucene.analysis;
++
++/**
++ * An interface for selecting a subset of a token stream
++ *
++ * @author Chuck Williams
++ */
++public interface TokenSelector {
++
++ /** Determine if a token should be selected
++ * @param fieldName field in which token was found
++ * @param token a token
++ * @return true iff token should be selected
++ */
++ public boolean accept(String fieldName, Token token);
++
++}
+Index: src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java
+===================================================================
+--- src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0)
++++ src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0)
+@@ -0,0 +1,44 @@
++/*
++ * PerFieldTokenSelectorWrapper.java
++ *
++ * Created on June 13, 2006, 4:09 PM
++ *
++ */
++
++package org.apache.lucene.analysis;
++
++import java.util.HashMap;
++import java.util.Map;
++
++/**
++ * Expert: TokenSelector that implements a mapping from field names to TokenSelectors
++ *
++ * @author Chuck Williams
++ */
++public class PerFieldTokenSelectorWrapper implements TokenSelector {
++
++ private Map selectors = new HashMap();
++ private TokenSelector defaultSelector;
++
++ /** Expert: create a PerFieldTokenSelector with given default selector (null means select all) */
++ public PerFieldTokenSelectorWrapper(TokenSelector defaultSelector) {
++ this.defaultSelector = defaultSelector;
++ }
++
++ /** Add a token selector for the named field */
++ public void addSelector(String fieldName, TokenSelector selector) {
++ selectors.put(fieldName, selector);
++ }
++
++  /** Determine whether the token is accepted, using the selector registered for fieldName, or the default selector if none is registered */
++ public boolean accept(String fieldName, Token token) {
++ TokenSelector selector = (TokenSelector) selectors.get(fieldName);
++ if (selector!=null)
++ return selector.accept(fieldName, token);
++ else if (defaultSelector!=null)
++ return defaultSelector.accept(fieldName, token);
++ else
++ return true;
++ }
++
++}
+\ No newline at end of file
+Index: src/java/org/apache/lucene/index/IndexWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/IndexWriter.java (revision 414705)
++++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
+@@ -17,6 +17,7 @@
+ */
+
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.document.Document;
+ import org.apache.lucene.search.Similarity;
+ import org.apache.lucene.store.Directory;
+@@ -100,8 +101,10 @@
+ */
+ public final static int DEFAULT_TERM_INDEX_INTERVAL = 128;
+
+- private Directory directory; // where this index resides
+- private Analyzer analyzer; // how to analyze text
++ private Directory directory; // where this index resides
++ private Analyzer analyzer; // how to analyze text
++ private TokenSelector termVectorTokenSelector; // subset of token stream stored in term vectors
++ private TokenSelector positionsTokenSelector; // subset of token stream for which positions are stored
+
+ private Similarity similarity = Similarity.getDefault(); // how to normalize
+
+@@ -153,6 +156,38 @@
+ return this.similarity;
+ }
+
++ /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors.
++ * @param selector the term vector TokenSelector
++ */
++ public void setTermVectorTokenSelector(TokenSelector selector) {
++ this.termVectorTokenSelector = selector;
++ }
++
++  /** Expert: Get the TokenSelector used to determine the subset of tokens stored in term vectors.
++ * @return the TokenSelector used to determine term vector tokens
++ */
++ public TokenSelector getTermVectorTokenSelector() {
++ return termVectorTokenSelector;
++ }
++
++ /** Expert: Set the TokenSelector used to determine subset of tokens for which positions are stored.
++ * (At least one position is always stored for each term in each doc to ensure the term stays in
++ * the index so long as any docs reference it)
++ * @param selector the positions TokenSelector
++ */
++ public void setPositionsTokenSelector(TokenSelector selector) {
++ this.positionsTokenSelector = selector;
++ }
++
++  /** Expert: Get the TokenSelector used to determine the subset of tokens for which freq and positions are stored.
++ * (At least one position is always stored for each term in each doc to ensure the term stays in
++ * the index so long as any docs reference it)
++ * @return the positions TokenSelector
++ */
++ public TokenSelector getPositionsTokenSelector() {
++ return positionsTokenSelector;
++ }
++
+ /** Expert: Set the interval between indexed terms. Large values cause less
+ * memory to be used by IndexReader, but slow random-access to terms. Small
+ * values cause more memory to be used by an IndexReader, and speed
+@@ -471,6 +506,8 @@
+ public void addDocument(Document doc, Analyzer analyzer) throws IOException {
+ DocumentWriter dw =
+ new DocumentWriter(ramDirectory, analyzer, this);
++ dw.setTermVectorTokenSelector(termVectorTokenSelector);
++ dw.setPositionsTokenSelector(positionsTokenSelector);
+ dw.setInfoStream(infoStream);
+ String segmentName = newSegmentName();
+ dw.addDocument(segmentName, doc);
+Index: src/java/org/apache/lucene/index/DocumentWriter.java
+===================================================================
+--- src/java/org/apache/lucene/index/DocumentWriter.java (revision 414705)
++++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy)
+@@ -17,6 +17,7 @@
+ */
+
+ import org.apache.lucene.analysis.Analyzer;
++import org.apache.lucene.analysis.TokenSelector;
+ import org.apache.lucene.analysis.Token;
+ import org.apache.lucene.analysis.TokenStream;
+ import org.apache.lucene.document.Document;
+@@ -35,6 +36,8 @@
+
+ final class DocumentWriter {
+ private Analyzer analyzer;
++ private TokenSelector termVectorTokenSelector;
++ private TokenSelector positionsTokenSelector;
+ private Directory directory;
+ private Similarity similarity;
+ private FieldInfos fieldInfos;
+@@ -142,9 +145,9 @@
+ if (!field.isTokenized()) { // un-tokenized field
+ String stringValue = field.stringValue();
+ if(field.isStoreOffsetWithTermVector())
+- addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
++ addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()), false, false);
+ else
+- addPosition(fieldName, stringValue, position++, null);
++ addPosition(fieldName, stringValue, position++, null, false, false);
+ offset += stringValue.length();
+ length++;
+ } else
+@@ -165,10 +168,16 @@
+ for (Token t = stream.next(); t != null; t = stream.next()) {
+ position += (t.getPositionIncrement() - 1);
+
+- if(field.isStoreOffsetWithTermVector())
+- addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
+- else
+- addPosition(fieldName, t.termText(), position++, null);
++ boolean omittv = false, omitpos = false;
++ if (termVectorTokenSelector!=null && !termVectorTokenSelector.accept(field.name(), t))
++ omittv = true;
++          if (positionsTokenSelector!=null && !positionsTokenSelector.accept(field.name(), t))
++ omitpos = true;
++
++ addPosition(fieldName, t.termText(), position++,
++ field.isStoreOffsetWithTermVector() && !omittv ? new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())
++ : null,
++ omittv, omitpos);
+
+ lastToken = t;
+ if (++length > maxFieldLength) {
+@@ -196,20 +205,24 @@
+
+ private final Term termBuffer = new Term("", ""); // avoid consing
+
+- private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
++ private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset,
++ boolean omitFromTermVector, boolean omitPosition) {
+ termBuffer.set(field, text);
+ //System.out.println("Offset: " + offset);
+ Posting ti = (Posting) postingTable.get(termBuffer);
+ if (ti != null) { // word seen before
+ int freq = ti.freq;
+- if (ti.positions.length == freq) { // positions array is full
+- int[] newPositions = new int[freq * 2]; // double size
+- int[] positions = ti.positions;
+- for (int i = 0; i < freq; i++) // copy old positions to new
+- newPositions[i] = positions[i];
+- ti.positions = newPositions;
++
++ if (!omitPosition) {
++ if (ti.positions.length == freq) { // positions array is full
++ int[] newPositions = new int[freq * 2]; // double size
++ int[] positions = ti.positions;
++ for (int i = 0; i < freq; i++) // copy old positions to new
++ newPositions[i] = positions[i];
++ ti.positions = newPositions;
++ }
++ ti.positions[freq] = position; // add new position
+ }
+- ti.positions[freq] = position; // add new position
+
+ if (offset != null) {
+ if (ti.offsets.length == freq){
+@@ -223,10 +236,12 @@
+ }
+ ti.offsets[freq] = offset;
+ }
+- ti.freq = freq + 1; // update frequency
+- } else { // word not seen before
++
++ if (!omitPosition)
++ ti.freq = freq + 1; // update frequency
++ } else { // word not seen before
+ Term term = new Term(field, text, false);
+- postingTable.put(term, new Posting(term, position, offset));
++ postingTable.put(term, new Posting(term, position, offset, omitFromTermVector));
+ }
+ }
+
+@@ -351,7 +366,7 @@
+ termVectorWriter.closeField();
+ }
+ }
+- if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
++ if (termVectorWriter != null && termVectorWriter.isFieldOpen() && !posting.omitFromTermVector) {
+ termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
+ }
+ }
+@@ -390,6 +405,16 @@
+ this.infoStream = infoStream;
+ }
+
++ /** If non-null, this will be used to select which tokens are stored in term vectors */
++ void setTermVectorTokenSelector(TokenSelector selector) {
++ this.termVectorTokenSelector = selector;
++ }
++
++ /** If non-null, this will be used to select which tokens have positions stored in the index. */
++ void setPositionsTokenSelector(TokenSelector selector) {
++ this.positionsTokenSelector = selector;
++ }
++
+ }
+
+ final class Posting { // info about a Term in a doc
+@@ -397,17 +422,17 @@
+ int freq; // its frequency in doc
+ int[] positions; // positions it occurs at
+ TermVectorOffsetInfo [] offsets;
++ boolean omitFromTermVector; // if true, omit from term vector
+
+- Posting(Term t, int position, TermVectorOffsetInfo offset) {
++ Posting(Term t, int position, TermVectorOffsetInfo offset, boolean omitFromTermVector) {
+ term = t;
+ freq = 1;
+ positions = new int[1];
+ positions[0] = position;
+- if(offset != null){
+- offsets = new TermVectorOffsetInfo[1];
+- offsets[0] = offset;
++ if(offset != null) {
++ offsets = new TermVectorOffsetInfo[1];
++ offsets[0] = offset;
+ }
+- else
+- offsets = null;
++ this.omitFromTermVector = omitFromTermVector;
+ }
+ }
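
For reference, a minimal usage sketch of the TokenSelector API these patches introduce. This example is not part of either patch: the Directory (dir), the populated Document (doc), the StandardAnalyzer choice, and the "body" field name are assumptions made for illustration; only the methods shown in the diffs above (TokenSelector.accept, PerFieldTokenSelectorWrapper.addSelector, IndexWriter.setTermVectorTokenSelector and setPositionsTokenSelector) are relied on.

    // Hypothetical sketch: keep only lowercase-initial tokens in term vectors and
    // stored positions for the "body" field; all other fields fall back to the
    // select-all default (a null default selector means "accept everything").
    PerFieldTokenSelectorWrapper selector = new PerFieldTokenSelectorWrapper(null);
    selector.addSelector("body", new TokenSelector() {
      public boolean accept(String fieldName, Token token) {
        return Character.isLowerCase(token.termText().charAt(0));
      }
    });

    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
    writer.setTermVectorTokenSelector(selector);   // limits which terms reach the term vectors
    writer.setPositionsTokenSelector(selector);    // limits stored positions (at least one per term is kept)
    writer.addDocument(doc);
    writer.close();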