Posted to commits@lucene.apache.org by mi...@apache.org on 2010/12/09 20:22:34 UTC

svn commit: r1044098 - in /lucene/dev/trunk/lucene: ./ src/test/org/apache/lucene/index/ src/test/org/apache/lucene/util/

Author: mikemccand
Date: Thu Dec  9 19:22:33 2010
New Revision: 1044098

URL: http://svn.apache.org/viewvc?rev=1044098&view=rev
Log:
LUCENE-2768: add line docs file so tests can easily run for longer on real data; add threaded NRT stress test

Added:
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestNRTThreads.java   (with props)
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LineFileDocs.java   (with props)
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz   (with props)
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py   (with props)
Modified:
    lucene/dev/trunk/lucene/common-build.xml
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java

Modified: lucene/dev/trunk/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/common-build.xml?rev=1044098&r1=1044097&r2=1044098&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/common-build.xml (original)
+++ lucene/dev/trunk/lucene/common-build.xml Thu Dec  9 19:22:33 2010
@@ -68,6 +68,7 @@
   <property name="tests.locale" value="random" />
   <property name="tests.timezone" value="random" />
   <property name="tests.directory" value="random" />
+  <property name="tests.linedocsfile" value="europarl.lines.txt.gz" />
   <property name="tests.iter" value="1" />
   <property name="tests.seed" value="random" />
   <property name="tests.userdir" value="."/>
@@ -459,6 +460,8 @@
 	      <sysproperty key="tests.timezone" value="${tests.timezone}"/>
               <!-- set the directory tests should run with -->
               <sysproperty key="tests.directory" value="${tests.directory}"/>
+              <!-- set the line file source for oal.util.LineFileDocs -->
+              <sysproperty key="tests.linedocsfile" value="${tests.linedocsfile}"/>
               <!-- set the number of times tests should run -->
               <sysproperty key="tests.iter" value="${tests.iter}"/>
               <!-- set the test seed -->
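
For context, here is a minimal sketch of how a test picks up the new tests.linedocsfile property through the LineFileDocs class added below. The test class name is hypothetical; the LineFileDocs and LuceneTestCase APIs are the ones this commit adds:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.util.LineFileDocs;
    import org.apache.lucene.util.LuceneTestCase;
    import org.junit.Test;

    import static org.junit.Assert.*;

    // Hypothetical example test: reads a handful of docs from the line
    // file configured via tests.linedocsfile / TEST_LINE_DOCS_FILE.
    public class TestLineDocsUsage extends LuceneTestCase {
      @Test
      public void testReadSomeDocs() throws Exception {
        LineFileDocs docs = new LineFileDocs(false);   // false: stop at EOF instead of rewinding
        try {
          for (int i = 0; i < 100; i++) {
            Document doc = docs.nextDoc();
            if (doc == null) {
              break;                                   // line file exhausted
            }
            assertNotNull(doc.get("title"));
            assertNotNull(doc.get("body"));
          }
        } finally {
          docs.close();
        }
      }
    }

Because the no-path LineFileDocs constructor defaults to LuceneTestCase.TEST_LINE_DOCS_FILE, overriding -Dtests.linedocsfile on the ant command line redirects every test that uses it.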

Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestNRTThreads.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestNRTThreads.java?rev=1044098&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestNRTThreads.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestNRTThreads.java Thu Dec  9 19:22:33 2010
@@ -0,0 +1,336 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+import static org.junit.Assume.*;
+
+// TODO
+//   - mix in optimize, addIndexes
+
+public class TestNRTThreads extends LuceneTestCase {
+
+  @Test
+  public void testNRTThreads() throws Exception {
+
+    final long t0 = System.currentTimeMillis();
+
+    if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) {
+      // SimpleText is far too slow for this stress test; use Standard instead
+      CodecProvider.getDefault().setDefaultFieldCodec("Standard");
+    }
+
+    final LineFileDocs docs = new LineFileDocs(true);
+    final File tempDir = _TestUtil.getTempDir("nrtopenfiles");
+    final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir));
+    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer());
+    conf.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
+      @Override
+      public void warm(IndexReader reader) throws IOException {
+        if (VERBOSE) {
+          System.out.println("TEST: now warm merged reader=" + reader);
+        }
+        final int maxDoc = reader.maxDoc();
+        final Bits delDocs = reader.getDeletedDocs();
+        int sum = 0;
+        final int inc = Math.max(1, maxDoc/50);
+        for(int docID=0;docID<maxDoc;docID += inc) {
+          if (delDocs == null || !delDocs.get(docID)) {
+            final Document doc = reader.document(docID);
+            sum += doc.getFields().size();
+          }
+        }
+
+        sum += new IndexSearcher(reader).search(new TermQuery(new Term("body", "united")), 10).totalHits;
+
+        if (VERBOSE) {
+          System.out.println("TEST: warm visited " + sum + " fields");
+        }
+      }
+      });
+
+    final IndexWriter writer = new IndexWriter(dir, conf);
+    if (VERBOSE) {
+      writer.setInfoStream(System.out);
+    }
+    MergeScheduler ms = writer.getConfig().getMergeScheduler();
+    if (ms instanceof ConcurrentMergeScheduler) {
+      // try to keep max file open count down
+      ((ConcurrentMergeScheduler) ms).setMaxThreadCount(1);
+      ((ConcurrentMergeScheduler) ms).setMaxMergeCount(1);
+    }
+    LogMergePolicy lmp = (LogMergePolicy) writer.getConfig().getMergePolicy();
+    if (lmp.getMergeFactor() > 5) {
+      lmp.setMergeFactor(5);
+    }
+
+    final int NUM_INDEX_THREADS = 2;
+    final int NUM_SEARCH_THREADS = 3;
+    final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : 5;
+
+    final AtomicBoolean failed = new AtomicBoolean();
+    final AtomicInteger addCount = new AtomicInteger();
+    final AtomicInteger delCount = new AtomicInteger();
+    final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC*1000;
+    Thread[] threads = new Thread[NUM_INDEX_THREADS];
+    for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
+      threads[thread] = new Thread() {
+          @Override
+          public void run() {
+            final List<String> toDeleteIDs = new ArrayList<String>();
+            while(System.currentTimeMillis() < stopTime && !failed.get()) {
+              try {
+                Document doc = docs.nextDoc();
+                if (doc == null) {
+                  break;
+                }
+                if (random.nextBoolean()) {
+                  if (VERBOSE) {
+                    //System.out.println(Thread.currentThread().getName() + ": add doc id:" + doc.get("id"));
+                  }
+                  writer.addDocument(doc);
+                } else {
+                  // we use updateDocument, but the id term never
+                  // matches a prior doc, so this is effectively an add
+                  if (VERBOSE) {
+                    //System.out.println(Thread.currentThread().getName() + ": update doc id:" + doc.get("id"));
+                  }
+                  writer.updateDocument(new Term("id", doc.get("id")), doc);
+                }
+                if (random.nextInt(5) == 3) {
+                  if (VERBOSE) {
+                    //System.out.println(Thread.currentThread().getName() + ": buffer del id:" + doc.get("id"));
+                  }
+                  toDeleteIDs.add(doc.get("id"));
+                }
+                if (random.nextInt(50) == 17) {
+                  if (VERBOSE) {
+                    System.out.println(Thread.currentThread().getName() + ": apply " + toDeleteIDs.size() + " deletes");
+                  }
+                  for(String id : toDeleteIDs) {
+                    writer.deleteDocuments(new Term("id", id));
+                  }
+                  delCount.addAndGet(toDeleteIDs.size());
+                  toDeleteIDs.clear();
+                }
+                addCount.getAndIncrement();
+              } catch (Exception exc) {
+                System.out.println(Thread.currentThread().getName() + ": hit exc");
+                exc.printStackTrace();
+                failed.set(true);
+                throw new RuntimeException(exc);
+              }
+            }
+          }
+        };
+      threads[thread].setDaemon(true);
+      threads[thread].start();
+    }
+
+    if (VERBOSE) {
+      System.out.println("TEST: DONE start indexing threads [" + (System.currentTimeMillis()-t0) + " ms]");
+    }
+
+    // let index build up a bit
+    Thread.sleep(100);
+
+    IndexReader r = IndexReader.open(writer);
+    boolean any = false;
+
+    // silly starting guess:
+    final AtomicInteger totTermCount = new AtomicInteger(100);
+
+    while(System.currentTimeMillis() < stopTime && !failed.get()) {
+      if (random.nextBoolean()) {
+        if (VERBOSE) {
+          System.out.println("TEST: now reopen r=" + r);
+        }
+        final IndexReader r2 = r.reopen();
+        if (r != r2) {
+          r.close();
+          r = r2;
+        }
+      } else {
+        if (VERBOSE) {
+          System.out.println("TEST: now close reader=" + r);
+        }
+        r.close();
+        writer.commit();
+        final Set<String> openDeletedFiles = dir.getOpenDeletedFiles();
+        if (openDeletedFiles.size() > 0) {
+          System.out.println("OBD files: " + openDeletedFiles);
+        }
+        any |= openDeletedFiles.size() > 0;
+        //assertEquals("open but deleted: " + openDeletedFiles, 0, openDeletedFiles.size());
+        if (VERBOSE) {
+          System.out.println("TEST: now open");
+        }
+        r = IndexReader.open(writer);
+      }
+      if (VERBOSE) {
+        System.out.println("TEST: got new reader=" + r);
+      }
+      //System.out.println("numDocs=" + r.numDocs() +
+      //    " openDelFileCount=" + dir.openDeleteFileCount());
+
+      smokeTestReader(r);
+
+      final IndexSearcher s = new IndexSearcher(r);
+
+      // run search threads
+      final long searchStopTime = System.currentTimeMillis() + 500;
+      final Thread[] searchThreads = new Thread[NUM_SEARCH_THREADS];
+      final AtomicInteger totHits = new AtomicInteger();
+      for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
+        searchThreads[thread] = new Thread() {
+          @Override
+          public void run() {
+            try {
+              TermsEnum termsEnum = MultiFields.getTerms(s.getIndexReader(), "body").iterator();
+              int seenTermCount = 0;
+              int shift;
+              int trigger;
+              if (totTermCount.get() == 0) {
+                shift = 0;
+                trigger = 1;
+              } else {
+                shift = random.nextInt(totTermCount.get()/10);
+                trigger = totTermCount.get()/10;
+              }
+              while(System.currentTimeMillis() < searchStopTime) {
+                BytesRef term = termsEnum.next();
+                if (term == null) {
+                  totTermCount.set(seenTermCount);
+                  seenTermCount = 0;
+                  trigger = totTermCount.get()/10;
+                  //System.out.println("trigger " + trigger);
+                  shift = random.nextInt(totTermCount.get()/10);
+                  termsEnum.seek(new BytesRef(""));
+                  continue;
+                }
+                seenTermCount++;
+                // search 10 terms
+                if (trigger == 0) {
+                  trigger = 1;
+                }
+                if ((seenTermCount + shift) % trigger == 0) {
+                  //if (VERBOSE) {
+                  //System.out.println(Thread.currentThread().getName() + " now search body:" + term.utf8ToString());
+                  //}
+                  totHits.addAndGet(runQuery(s, new TermQuery(new Term("body", term))));
+                }
+              }
+              if (VERBOSE) {
+                System.out.println(Thread.currentThread().getName() + ": search done");
+              }
+            } catch (Throwable t) {
+              failed.set(true);
+              t.printStackTrace(System.out);
+              throw new RuntimeException(t);
+            }
+          }
+          };
+        searchThreads[thread].setDaemon(true);
+        searchThreads[thread].start();
+      }
+
+      for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
+        searchThreads[thread].join();
+      }
+
+      if (VERBOSE) {
+        System.out.println("TEST: DONE search: totHits=" + totHits);
+      }
+    }
+
+    if (VERBOSE) {
+      System.out.println("TEST: all searching done [" + (System.currentTimeMillis()-t0) + " ms]");
+    }
+
+    //System.out.println("numDocs=" + r.numDocs() + " openDelFileCount=" + dir.openDeleteFileCount());
+    r.close();
+    final Set<String> openDeletedFiles = dir.getOpenDeletedFiles();
+    if (openDeletedFiles.size() > 0) {
+      System.out.println("OBD files: " + openDeletedFiles);
+    }
+    any |= openDeletedFiles.size() > 0;
+
+    assertFalse("saw non-zero open-but-deleted count", any);
+    if (VERBOSE) {
+      System.out.println("TEST: now join");
+    }
+    for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
+      threads[thread].join();
+    }
+    if (VERBOSE) {
+      System.out.println("TEST: done join [" + (System.currentTimeMillis()-t0) + " ms]; addCount=" + addCount + " delCount=" + delCount);
+    }
+    writer.commit();
+    assertEquals(addCount.get() - delCount.get(), writer.numDocs());
+      
+    writer.close(false);
+    dir.close();
+    _TestUtil.rmDir(tempDir);
+    docs.close();
+    if (VERBOSE) {
+      System.out.println("TEST: done [" + (System.currentTimeMillis()-t0) + " ms]");
+    }
+  }
+
+  private int runQuery(IndexSearcher s, Query q) throws Exception {
+    s.search(q, 10);
+    return s.search(q, null, 10, new Sort(new SortField("title", SortField.STRING))).totalHits;
+  }
+
+  private void smokeTestReader(IndexReader r) throws Exception {
+    IndexSearcher s = new IndexSearcher(r);
+    runQuery(s, new TermQuery(new Term("body", "united")));
+    runQuery(s, new TermQuery(new Term("titleTokenized", "states")));
+    PhraseQuery pq = new PhraseQuery();
+    pq.add(new Term("body", "united"));
+    pq.add(new Term("body", "states"));
+    runQuery(s, pq);
+    s.close();
+  }
+}
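
The heart of the test above is the near-real-time open/reopen cycle. Condensed into a standalone sketch (the helper class and method names are hypothetical; the IndexReader/IndexSearcher calls are the same ones the test uses):

    import java.io.IOException;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TermQuery;

    // Hypothetical helper: searches against an NRT reader while other
    // threads keep adding documents through the same IndexWriter.
    class NRTReopenSketch {
      static int searchWhileIndexing(IndexWriter writer, long runTimeMillis) throws IOException {
        int totalHits = 0;
        IndexReader r = IndexReader.open(writer);      // NRT reader over the still-open writer
        final long stopTime = System.currentTimeMillis() + runTimeMillis;
        while (System.currentTimeMillis() < stopTime) {
          IndexReader r2 = r.reopen();                 // refresh to see newly indexed docs
          if (r2 != r) {
            r.close();                                 // release the stale reader
            r = r2;
          }
          IndexSearcher s = new IndexSearcher(r);
          totalHits += s.search(new TermQuery(new Term("body", "united")), 10).totalHits;
          s.close();
        }
        r.close();
        return totalHits;
      }
    }

TestNRTThreads layers the merged-segment warmer, concurrent indexing/deleting threads, and the open-but-deleted file checks on top of this basic loop.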

Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LineFileDocs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LineFileDocs.java?rev=1044098&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LineFileDocs.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LineFileDocs.java Thu Dec  9 19:22:33 2010
@@ -0,0 +1,154 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.InputStream;
+import java.io.BufferedInputStream;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+// Minimal port of contrib/benchmark's LineDocSource +
+// DocMaker, so tests can enumerate docs from a line file
+// created by contrib/benchmark's WriteLineDoc task
+public class LineFileDocs implements Closeable {
+
+  private BufferedReader reader;
+  private final boolean forever;
+  private final static int BUFFER_SIZE = 1 << 16;     // 64K
+  private final AtomicInteger id = new AtomicInteger();
+  private final String path;
+
+  // If forever is true, we rewind the file at EOF (repeat
+  // the docs over and over)
+  public LineFileDocs(String path, boolean forever) throws IOException {
+    this.path = path;
+    this.forever = forever;
+    open();
+  }
+
+  public LineFileDocs(boolean forever) throws IOException {
+    this(LuceneTestCase.TEST_LINE_DOCS_FILE, forever);
+  }
+
+  public synchronized void close() throws IOException {
+    if (reader != null) {
+      reader.close();
+      reader = null;
+    }
+  }
+
+  private synchronized void open() throws IOException {
+    InputStream is = getClass().getResourceAsStream(path);
+    if (is == null) {
+      throw new FileNotFoundException("cannot find line docs resource \"" + path + "\"");
+    }
+    if (path.endsWith(".gz")) {
+      is = new GZIPInputStream(is);
+    }
+    final InputStream in = new BufferedInputStream(is, BUFFER_SIZE);
+    reader = new BufferedReader(new InputStreamReader(in, "UTF-8"), BUFFER_SIZE);
+  }
+
+  public synchronized void reset() throws IOException {
+    close();
+    open();
+    id.set(0);
+  }
+
+  private final static char SEP = '\t';
+
+  private static final class DocState {
+    final Document doc;
+    final Field titleTokenized;
+    final Field title;
+    final Field body;
+    final Field id;
+    final Field date;
+
+    public DocState() {
+      doc = new Document();
+      
+      title = new Field("title", "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
+      doc.add(title);
+
+      titleTokenized = new Field("titleTokenized", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+      doc.add(titleTokenized);
+
+      body = new Field("body", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+      doc.add(body);
+
+      id = new Field("id", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+      doc.add(id);
+
+      date = new Field("date", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+      doc.add(date);
+    }
+  }
+
+  private final ThreadLocal<DocState> threadDocs = new ThreadLocal<DocState>();
+
+  // Document instance is re-used per-thread
+  public Document nextDoc() throws IOException {
+    String line;
+    synchronized(this) {
+      line = reader.readLine();
+      if (line == null) {
+        if (forever) {
+          if (LuceneTestCase.VERBOSE) {
+            System.out.println("TEST: LineFileDocs: now rewind file...");
+          }
+          close();
+          open();
+          line = reader.readLine();
+        }
+        return null;
+      }
+    }
+
+    DocState docState = threadDocs.get();
+    if (docState == null) {
+      docState = new DocState();
+      threadDocs.set(docState);
+    }
+
+    int spot = line.indexOf(SEP);
+    if (spot == -1) {
+      throw new RuntimeException("line: [" + line + "] is in an invalid format!");
+    }
+    int spot2 = line.indexOf(SEP, 1 + spot);
+    if (spot2 == -1) {
+      throw new RuntimeException("line: [" + line + "] is in an invalid format!");
+    }
+
+    docState.body.setValue(line.substring(1 + spot2));
+    final String title = line.substring(0, spot);
+    docState.title.setValue(title);
+    docState.titleTokenized.setValue(title);
+    docState.date.setValue(line.substring(1+spot, spot2));
+    docState.id.setValue(Integer.toString(id.getAndIncrement()));
+    return docState.doc;
+  }
+}
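
For reference, LineFileDocs.nextDoc() above and the makeEuroparlLineFile.py script added further down agree on the same on-disk format: one document per line, with three tab-separated columns,

    title<TAB>date<TAB>body

and the id field is assigned sequentially at read time rather than stored in the file.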

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java?rev=1044098&r1=1044097&r2=1044098&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java Thu Dec  9 19:22:33 2010
@@ -128,19 +128,21 @@ public abstract class LuceneTestCase ext
   // each test case (non-J4 tests) and each test class (J4
   // tests)
   /** Gets the codec to run tests with. */
-  static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField");
+  public static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField");
   /** Gets the locale to run tests with */
-  static final String TEST_LOCALE = System.getProperty("tests.locale", "random");
+  public static final String TEST_LOCALE = System.getProperty("tests.locale", "random");
   /** Gets the timezone to run tests with */
-  static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random");
+  public static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random");
   /** Gets the directory to run tests with */
-  static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
+  public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
   /** Get the number of times to run tests */
-  static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1"));
+  public static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1"));
   /** Get the random seed for tests */
-  static final String TEST_SEED = System.getProperty("tests.seed", "random");
+  public static final String TEST_SEED = System.getProperty("tests.seed", "random");
   /** whether or not nightly tests should run */
-  static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false"));
+  public static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false"));
+  /** the line file used by LineFileDocs */
+  public static final String TEST_LINE_DOCS_FILE = System.getProperty("tests.linedocsfile", "europarl.lines.txt.gz");
   
   private static final Pattern codecWithParam = Pattern.compile("(.*)\\(\\s*(\\d+)\\s*\\)");
 

Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz?rev=1044098&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py?rev=1044098&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py Thu Dec  9 19:22:33 2010
@@ -0,0 +1,137 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import glob
+import datetime
+import tarfile
+import re
+
+try:
+  sys.argv.remove('-verbose')
+  VERBOSE = True
+except ValueError:
+  VERBOSE = False
+
+try:
+  sys.argv.remove('-docPerParagraph')
+  docPerParagraph = True
+except ValueError:
+  docPerParagraph = False
+
+reChapterOnly = re.compile('^<CHAPTER ID=.*?>$')
+reTagOnly = re.compile('^<.*?>$')
+reNumberOnly = re.compile(r'^\d+\.?$')
+
+docCount = 0
+didEnglish = False
+
+def write(date, title, pending, fOut):
+  global docCount
+  body = ' '.join(pending).replace('\t', ' ').strip()
+  if len(body) > 0:
+    line = '%s\t%s\t%s\n' % (title, date, body)
+    fOut.write(line)
+    docCount += 1
+    del pending[:]
+    if VERBOSE:
+      print len(body)
+
+def processTar(fileName, fOut):
+
+  global didEnglish
+
+  t = tarfile.open(fileName, 'r:gz')
+  for ti in t:
+    if ti.isfile() and (not didEnglish or ti.name.find('/en/') == -1):
+
+      tup = ti.name.split('/')
+      lang = tup[1]
+      year = int(tup[2][3:5])
+      if year < 20:
+        year += 2000
+      else:
+        year += 1900
+
+      month = int(tup[2][6:8])
+      day = int(tup[2][9:11])
+      date = datetime.date(year=year, month=month, day=day)
+
+      if VERBOSE:
+        print
+        print '%s: %s' % (ti.name, date)
+      nextIsTitle = False
+      title = None
+      pending = []
+      for line in t.extractfile(ti).readlines():
+        line = line.strip()
+        if reChapterOnly.match(line) is not None:
+          if title is not None:
+            write(date, title, pending, fOut)
+          nextIsTitle = True
+          continue
+        if nextIsTitle:
+          if not reNumberOnly.match(line) and not reTagOnly.match(line):
+            title = line
+            nextIsTitle = False
+            if VERBOSE:
+              print '  title %s' % line
+          continue
+        if line.lower() == '<p>':
+          if docPerParagraph:
+            write(date, title, pending, fOut)
+          else:
+            pending.append('PARSEP')
+        elif not reTagOnly.match(line):
+          pending.append(line)
+      if title is not None and len(pending) > 0:
+        write(date, title, pending, fOut)
+
+  didEnglish = True
+  
+# '/x/lucene/data/europarl/all.lines.txt'
+dirIn = sys.argv[1]
+fileOut = sys.argv[2]
+  
+fOut = open(fileOut, 'wb')
+
+for fileName in glob.glob('%s/??-??.tgz' % dirIn):
+  if fileName.endswith('.tgz'):
+    print 'process %s; %d docs so far...' % (fileName, docCount)
+    processTar(fileName, fOut)
+
+print 'TOTAL: %s' % docCount
+
+# Run something like this:
+"""
+
+# Europarl V5 makes 76,917 docs, avg 38.6 KB per
+python -u makeEuroparlLineFile.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
+shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
+rm /x/lucene/data/europarl/tmp.lines.txt
+
+# Run again, this time each paragraph is a doc:
+# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per:
+python -u makeEuroparlLineFile.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
+shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
+rm /x/lucene/data/europarl/tmp.lines.txt
+
+# ~5.5 MB gzip'd:
+head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
+head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
+shuf tmp.txt > europarl.subset.txt
+rm -f tmp.txt
+gzip --best europarl.subset.txt
+"""