You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2002/09/16 05:19:19 UTC

cvs commit: jakarta-lucene/src/test/org/apache/lucene/analysis/ru TestRussianAnalyzer.java TestRussianStem.java res1251.htm resKOI8.htm resUnicode.htm stemsUnicode.txt test1251.txt testKOI8.txt testUnicode.txt wordsUnicode.txt

otis        2002/09/15 20:19:19

  Added:       src/test/org/apache/lucene/analysis/ru
                        TestRussianAnalyzer.java TestRussianStem.java
                        res1251.htm resKOI8.htm resUnicode.htm
                        stemsUnicode.txt test1251.txt testKOI8.txt
                        testUnicode.txt wordsUnicode.txt
  Log:
  - Russian Analyzer unit tests and unit test data.
  
  Revision  Changes    Path
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
  
  Index: TestRussianAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import junit.framework.TestCase;
  import java.io.BufferedReader;
  import java.io.BufferedWriter;
  import java.io.FileReader;
  import java.io.FileWriter;
  
  import java.io.InputStreamReader;
  import java.io.OutputStreamWriter;
  import java.io.FileInputStream;
  import java.io.FileOutputStream;
  import java.io.IOException;
  import java.io.UnsupportedEncodingException;
  import java.util.ArrayList;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  
  /**
   * Test case for RussianAnalyzer.
   *
   * @author    Boris Okner
   * @version   $Id: TestRussianAnalyzer.java,v 1.1 2002/09/16 03:19:19 otis Exp $
   */
  
  public class TestRussianAnalyzer extends TestCase
  {
      // Readers are kept as fields (as in the original layout); each test
      // opens the pair it needs and closes it again before returning.
      private InputStreamReader inWords;

      private InputStreamReader sampleUnicode;

      private FileReader inWordsKOI8;

      private FileReader sampleKOI8;

      private FileReader inWords1251;

      private FileReader sample1251;

      /**
       * Creates the test case.
       *
       * @param name name of the test method to run
       */
      public TestRussianAnalyzer(String name)
      {
          super(name);
      }

      /**
       * Runs every test in this class with the JUnit text runner.
       *
       * @param args ignored
       */
      public static void main(String[] args)
      {
          // The runner must be given this class; the previous reference to
          // RussianAnalyzerTest named a class that does not exist.
          junit.textui.TestRunner.run(TestRussianAnalyzer.class);
      }

      /**
       * @see TestCase#setUp()
       */
      protected void setUp() throws Exception
      {
          super.setUp();
      }

      /**
       * @see TestCase#tearDown()
       */
      protected void tearDown() throws Exception
      {
          super.tearDown();
      }

      /**
       * Analyzes testUnicode.txt with a RussianAnalyzer configured for
       * RussianCharsets.UnicodeRussian and compares the resulting terms with
       * the tokens read from resUnicode.htm. Both files are decoded with the
       * charset named "Unicode".
       *
       * @throws IOException if a data file cannot be opened or read
       */
      public void testUnicode() throws IOException
      {
          RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
          try
          {
              inWords =
                  new InputStreamReader(
                      new FileInputStream("src/test/org/apache/lucene/analysis/ru/testUnicode.txt"),
                      "Unicode");

              sampleUnicode =
                  new InputStreamReader(
                      new FileInputStream("src/test/org/apache/lucene/analysis/ru/resUnicode.htm"),
                      "Unicode");

              TokenStream in = ra.tokenStream("all", inWords);

              RussianLetterTokenizer sample =
                  new RussianLetterTokenizer(
                      sampleUnicode,
                      RussianCharsets.UnicodeRussian);

              assertSameTerms("Unicode", in, sample);
          }
          finally
          {
              // Close even when an assertion fails so file handles do not leak.
              closeBoth(inWords, sampleUnicode);
          }
      }

      /**
       * Analyzes testKOI8.txt with a RussianAnalyzer configured for
       * RussianCharsets.KOI8 and compares the resulting terms with the tokens
       * read from resKOI8.htm.
       *
       * @throws IOException if a data file cannot be opened or read
       */
      public void testKOI8() throws IOException
      {
          RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
          try
          {
              // NOTE(review): FileReader decodes with the platform default
              // charset, so the characters handed to the analyzer may differ
              // between machines - confirm whether a fixed charset is needed.
              inWordsKOI8 = new FileReader("src/test/org/apache/lucene/analysis/ru/testKOI8.txt");

              sampleKOI8 = new FileReader("src/test/org/apache/lucene/analysis/ru/resKOI8.htm");

              TokenStream in = ra.tokenStream("all", inWordsKOI8);
              RussianLetterTokenizer sample =
                  new RussianLetterTokenizer(
                      sampleKOI8,
                      RussianCharsets.KOI8);

              assertSameTerms("KOI8", in, sample);
          }
          finally
          {
              closeBoth(inWordsKOI8, sampleKOI8);
          }
      }

      /**
       * Analyzes test1251.txt with a RussianAnalyzer configured for
       * RussianCharsets.CP1251 and compares the resulting terms with the
       * tokens read from res1251.htm.
       *
       * @throws IOException if a data file cannot be opened or read
       */
      public void test1251() throws IOException
      {
          try
          {
              // NOTE(review): same platform-default-charset caveat as in
              // testKOI8 - confirm.
              inWords1251 = new FileReader("src/test/org/apache/lucene/analysis/ru/test1251.txt");

              sample1251 = new FileReader("src/test/org/apache/lucene/analysis/ru/res1251.htm");

              RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
              TokenStream in = ra.tokenStream("", inWords1251);
              RussianLetterTokenizer sample =
                  new RussianLetterTokenizer(
                      sample1251,
                      RussianCharsets.CP1251);

              assertSameTerms("1251", in, sample);
          }
          finally
          {
              closeBoth(inWords1251, sample1251);
          }
      }

      /**
       * Walks the analyzer output to its end and asserts that each term
       * equals the term of the next sample token. When the sample runs out
       * first, the analyzer term is compared against null and the assertion
       * fails. Sample tokens left over after the analyzer output ends are not
       * examined.
       *
       * @param message assertion message identifying the charset under test
       * @param in      token stream produced by the analyzer
       * @param sample  tokenizer over the expected-result file
       * @throws IOException if reading either stream fails
       */
      private static void assertSameTerms(
          String message,
          TokenStream in,
          RussianLetterTokenizer sample)
          throws IOException
      {
          for (Token token = in.next(); token != null; token = in.next())
          {
              Token sampleToken = sample.next();
              assertEquals(
                  message,
                  token.termText(),
                  sampleToken == null
                  ? null
                  : sampleToken.termText());
          }
      }

      /**
       * Closes both readers, skipping any that were never opened. The second
       * reader is closed even if closing the first one throws.
       *
       * @param first  reader to close, may be null
       * @param second reader to close, may be null
       * @throws IOException if closing a reader fails
       */
      private static void closeBoth(InputStreamReader first, InputStreamReader second)
          throws IOException
      {
          try
          {
              if (first != null)
              {
                  first.close();
              }
          }
          finally
          {
              if (second != null)
              {
                  second.close();
              }
          }
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
  
  Index: TestRussianStem.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import junit.framework.TestCase;
  import java.io.BufferedReader;
  import java.io.InputStreamReader;
  import java.io.FileInputStream;
  import java.io.IOException;
  import java.io.UnsupportedEncodingException;
  import java.util.ArrayList;
  
  /**
   * Test case for RussianStemmer: stems every word listed in
   * wordsUnicode.txt and compares the result with the entry at the same
   * position in stemsUnicode.txt.
   */
  public class TestRussianStem extends TestCase
  {
      // Input words and their expected stems, filled by setUp(); entries at
      // the same index belong together.
      private ArrayList words = new ArrayList();
      private ArrayList stems = new ArrayList();

      /**
       * Creates the test case.
       *
       * @param name name of the test method to run
       */
      public TestRussianStem(String name)
      {
          super(name);
      }

      /**
       * Runs every test in this class with the JUnit text runner.
       *
       * @param args ignored
       */
      public static void main(String[] args)
      {
          // The runner must be given this class; the previous reference to
          // RussianStemTest named a class that does not exist.
          junit.textui.TestRunner.run(TestRussianStem.class);
      }

      /**
       * Loads the word list and the expected-stem list from the test data
       * files.
       *
       * @see TestCase#setUp()
       */
      protected void setUp() throws Exception
      {
          super.setUp();

          readLines("src/test/org/apache/lucene/analysis/ru/wordsUnicode.txt", words);
          readLines("src/test/org/apache/lucene/analysis/ru/stemsUnicode.txt", stems);
      }

      /**
       * @see TestCase#tearDown()
       */
      protected void tearDown() throws Exception
      {
          super.tearDown();
      }

      /**
       * Stems each loaded word with RussianCharsets.UnicodeRussian and
       * asserts that it equals the expected stem at the same index.
       */
      public void testStem()
      {
          // Report a short stems file as a test failure instead of letting
          // stems.get(i) throw IndexOutOfBoundsException part-way through.
          assertTrue(
              "stemsUnicode.txt has fewer lines than wordsUnicode.txt",
              stems.size() >= words.size());

          for (int i = 0; i < words.size(); i++)
          {
              String realStem =
                  RussianStemmer.stem(
                      (String) words.get(i),
                      RussianCharsets.UnicodeRussian);
              assertEquals("unicode", stems.get(i), realStem);
          }
      }

      /**
       * Appends every line of the given file to the target list. The file is
       * decoded with the charset named "Unicode" and is closed even when
       * reading fails.
       *
       * @param path   path of the file to read
       * @param target list that receives one String per line
       * @throws IOException if the file cannot be opened or read
       */
      private static void readLines(String path, ArrayList target) throws IOException
      {
          BufferedReader reader =
              new BufferedReader(
                  new InputStreamReader(
                      new FileInputStream(path),
                      "Unicode"));
          try
          {
              String line;
              while ((line = reader.readLine()) != null)
              {
                  target.add(line);
              }
          }
          finally
          {
              reader.close();
          }
      }

      /**
       * Copies the given string one character at a time into a new string.
       * Not called from within this class.
       *
       * @param output string to copy
       * @return a string with the same characters as the argument
       */
      private String printChars(String output)
      {
          StringBuffer s = new StringBuffer();
          for (int i = 0; i < output.length(); i++)
          {
              s.append(output.charAt(i));
          }
          return s.toString();
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/res1251.htm
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/resKOI8.htm
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/resUnicode.htm
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/stemsUnicode.txt
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/test1251.txt
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/testKOI8.txt
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/testUnicode.txt
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/ru/wordsUnicode.txt
  
  	<<Binary file>>
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>