You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2004/05/30 22:24:20 UTC
cvs commit: jakarta-lucene/src/test/org/apache/lucene/queryParser TestQueryParser.java

otis        2004/05/30 13:24:20

  Modified:    .        CHANGES.txt build.xml
               src/java/org/apache/lucene/analysis/de GermanStemmer.java
                        GermanAnalyzer.java
               src/test/org/apache/lucene/queryParser TestQueryParser.java
  Log:
  - Switched to UTF-8 file encoding
  
  Revision  Changes    Path
  1.91      +5 -1      jakarta-lucene/CHANGES.txt
  
  Index: CHANGES.txt
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
  retrieving revision 1.90
  retrieving revision 1.91
  diff -u -r1.90 -r1.91
  --- CHANGES.txt	24 May 2004 19:05:21 -0000	1.90
  +++ CHANGES.txt	30 May 2004 20:24:20 -0000	1.91
  @@ -17,6 +17,10 @@
       methods to replace a PhraseQuery with a SpanNearQuery instead,
       keeping the proper slop factor. (Erik Hatcher)
   
  + 4. Changed the encoding of GermanAnalyzer.java and GermanStemmer.java to
  +    UTF-8 and changed the build encoding to UTF-8, to make changed files
  +    compile. (Otis Gospodnetic)
  +
   
   1.4 RC3
   
  
  
  
  1.64      +1 -1      jakarta-lucene/build.xml
  
  Index: build.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/build.xml,v
  retrieving revision 1.63
  retrieving revision 1.64
  diff -u -r1.63 -r1.64
  --- build.xml	11 May 2004 20:20:04 -0000	1.63
  +++ build.xml	30 May 2004 20:24:20 -0000	1.64
  @@ -23,7 +23,7 @@
     <property name="javac.debug" value="on"/>
     <property name="project.name" value="site"/> <!-- todo: is this used by anakia or something else? -->
     <property name="javadoc.link" value="http://java.sun.com/j2se/1.4/docs/api/"/>
  -  <property name="build.encoding" value="ISO-8859-1"/>
  +  <property name="build.encoding" value="utf-8"/>
   
     <property name="build.dir" location="build"/>
     <property name="dist.dir" location="dist"/>
  
  
  
  1.11      +12 -12    jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
  
  Index: GermanStemmer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- GermanStemmer.java	30 Mar 2004 15:54:48 -0000	1.10
  +++ GermanStemmer.java	30 May 2004 20:24:20 -0000	1.11
  @@ -18,7 +18,7 @@
   
   /**
    * A stemmer for German words. The algorithm is based on the report
  - * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
  + * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
    * Caumanns (joerg.caumanns@isst.fhg.de).
    *
    * @author    Gerhard Schwarz
  @@ -153,12 +153,12 @@
       /**
        * Do some substitutions for the term to reduce overstemming:
        *
  -     * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
  -     *   "ß" is substituted by "ss"
  +     * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
  +     *   "ß" is substituted by "ss"
        * - Substitute a second char of a pair of equal characters with
        *   an asterisk: ?? -> ?*
        * - Substitute some common character combinations with a token:
  -     *   sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
  +     *   sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
        */
       private void substitute( StringBuffer buffer )
       {
  @@ -169,18 +169,18 @@
             buffer.setCharAt( c, '*' );
           }
           // Substitute Umlauts.
  -        else if ( buffer.charAt( c ) == 'ä' ) {
  +        else if ( buffer.charAt( c ) == 'ä' ) {
             buffer.setCharAt( c, 'a' );
           }
  -        else if ( buffer.charAt( c ) == 'ö' ) {
  +        else if ( buffer.charAt( c ) == 'ö' ) {
             buffer.setCharAt( c, 'o' );
           }
  -        else if ( buffer.charAt( c ) == 'ü' ) {
  +        else if ( buffer.charAt( c ) == 'ü' ) {
             buffer.setCharAt( c, 'u' );
           }
           // Take care that at least one character is left left side from the current one
           if ( c < buffer.length() - 1 ) {
  -          if ( buffer.charAt( c ) == 'ß' ) {
  +          if ( buffer.charAt( c ) == 'ß' ) {
               buffer.setCharAt( c, 's' );
               buffer.insert( c + 1, 's' );
               substCount++;
  @@ -194,7 +194,7 @@
               substCount =+ 2;
             }
             else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
  -            buffer.setCharAt( c, '§' );
  +            buffer.setCharAt( c, '§' );
               buffer.deleteCharAt( c + 1 );
               substCount++;
             }
  @@ -225,7 +225,7 @@
       /**
        * Undoes the changes made by substitute(). That are character pairs and
        * character combinations. Umlauts will remain as their corresponding vowel,
  -     * as "ß" remains as "ss".
  +     * as "ß" remains as "ss".
        */
       private void resubstitute( StringBuffer buffer )
       {
  @@ -238,7 +238,7 @@
             buffer.setCharAt( c, 's' );
             buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
           }
  -        else if ( buffer.charAt( c ) == '§' ) {
  +        else if ( buffer.charAt( c ) == '§' ) {
             buffer.setCharAt( c, 'c' );
             buffer.insert( c + 1, 'h' );
           }
  
  
  
  1.16      +3 -3      jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
  
  Index: GermanAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v
  retrieving revision 1.15
  retrieving revision 1.16
  diff -u -r1.15 -r1.16
  --- GermanAnalyzer.java	30 Mar 2004 15:44:58 -0000	1.15
  +++ GermanAnalyzer.java	30 May 2004 20:24:20 -0000	1.16
  @@ -47,14 +47,14 @@
      */
     private String[] GERMAN_STOP_WORDS = {
       "einer", "eine", "eines", "einem", "einen",
  -    "der", "die", "das", "dass", "daß",
  +    "der", "die", "das", "dass", "daß",
       "du", "er", "sie", "es",
       "was", "wer", "wie", "wir",
       "und", "oder", "ohne", "mit",
       "am", "im", "in", "aus", "auf",
       "ist", "sein", "war", "wird",
       "ihr", "ihre", "ihres",
  -    "als", "für", "von", "mit",
  +    "als", "für", "von", "mit",
       "dich", "dir", "mich", "mir",
       "mein", "sein", "kein",
       "durch", "wegen", "wird"
  
  
  
  1.26      +2 -2      jakarta-lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java
  
  Index: TestQueryParser.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java,v
  retrieving revision 1.25
  retrieving revision 1.26
  diff -u -r1.25 -r1.26
  --- TestQueryParser.java	3 Mar 2004 12:07:13 -0000	1.25
  +++ TestQueryParser.java	30 May 2004 20:24:20 -0000	1.26
  @@ -159,8 +159,8 @@
   
     public void testSimple() throws Exception {
       assertQueryEquals("term term term", null, "term term term");
  -    assertQueryEquals("türm term term", null, "türm term term");
  -    assertQueryEquals("ümlaut", null, "ümlaut");
  +    assertQueryEquals("türm term term", null, "türm term term");
  +    assertQueryEquals("ümlaut", null, "ümlaut");
   
       assertQueryEquals("a AND b", null, "+a +b");
       assertQueryEquals("(a AND b)", null, "+a +b");
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org