You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2004/05/30 22:24:20 UTC
cvs commit: jakarta-lucene/src/test/org/apache/lucene/queryParser TestQueryParser.java
otis 2004/05/30 13:24:20
Modified: . CHANGES.txt build.xml
src/java/org/apache/lucene/analysis/de GermanStemmer.java
GermanAnalyzer.java
src/test/org/apache/lucene/queryParser TestQueryParser.java
Log:
- Switched to UTF-8 file encoding
Revision Changes Path
1.91 +5 -1 jakarta-lucene/CHANGES.txt
Index: CHANGES.txt
===================================================================
RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
retrieving revision 1.90
retrieving revision 1.91
diff -u -r1.90 -r1.91
--- CHANGES.txt 24 May 2004 19:05:21 -0000 1.90
+++ CHANGES.txt 30 May 2004 20:24:20 -0000 1.91
@@ -17,6 +17,10 @@
methods to replace a PhraseQuery with a SpanNearQuery instead,
keeping the proper slop factor. (Erik Hatcher)
+ 4. Changed the encoding of GermanAnalyzer.java and GermanStemmer.java to
+ UTF-8 and changed the build encoding to UTF-8, to make changed files
+ compile. (Otis Gospodnetic)
+
1.4 RC3
1.64 +1 -1 jakarta-lucene/build.xml
Index: build.xml
===================================================================
RCS file: /home/cvs/jakarta-lucene/build.xml,v
retrieving revision 1.63
retrieving revision 1.64
diff -u -r1.63 -r1.64
--- build.xml 11 May 2004 20:20:04 -0000 1.63
+++ build.xml 30 May 2004 20:24:20 -0000 1.64
@@ -23,7 +23,7 @@
<property name="javac.debug" value="on"/>
<property name="project.name" value="site"/> <!-- todo: is this used by anakia or something else? -->
<property name="javadoc.link" value="http://java.sun.com/j2se/1.4/docs/api/"/>
- <property name="build.encoding" value="ISO-8859-1"/>
+ <property name="build.encoding" value="utf-8"/>
<property name="build.dir" location="build"/>
<property name="dist.dir" location="dist"/>
1.11 +12 -12 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
Index: GermanStemmer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -r1.10 -r1.11
--- GermanStemmer.java 30 Mar 2004 15:54:48 -0000 1.10
+++ GermanStemmer.java 30 May 2004 20:24:20 -0000 1.11
@@ -18,7 +18,7 @@
/**
* A stemmer for German words. The algorithm is based on the report
- * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
+ * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
* Caumanns (joerg.caumanns@isst.fhg.de).
*
* @author Gerhard Schwarz
@@ -153,12 +153,12 @@
/**
* Do some substitutions for the term to reduce overstemming:
*
- * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
- * "ß" is substituted by "ss"
+ * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
+ * "ß" is substituted by "ss"
* - Substitute a second char of a pair of equal characters with
* an asterisk: ?? -> ?*
* - Substitute some common character combinations with a token:
- * sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
+ * sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
*/
private void substitute( StringBuffer buffer )
{
@@ -169,18 +169,18 @@
buffer.setCharAt( c, '*' );
}
// Substitute Umlauts.
- else if ( buffer.charAt( c ) == 'ä' ) {
+ else if ( buffer.charAt( c ) == 'ä' ) {
buffer.setCharAt( c, 'a' );
}
- else if ( buffer.charAt( c ) == 'ö' ) {
+ else if ( buffer.charAt( c ) == 'ö' ) {
buffer.setCharAt( c, 'o' );
}
- else if ( buffer.charAt( c ) == 'ü' ) {
+ else if ( buffer.charAt( c ) == 'ü' ) {
buffer.setCharAt( c, 'u' );
}
// Take care that at least one character is left left side from the current one
if ( c < buffer.length() - 1 ) {
- if ( buffer.charAt( c ) == 'ß' ) {
+ if ( buffer.charAt( c ) == 'ß' ) {
buffer.setCharAt( c, 's' );
buffer.insert( c + 1, 's' );
substCount++;
@@ -194,7 +194,7 @@
substCount =+ 2;
}
else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
- buffer.setCharAt( c, '§' );
+ buffer.setCharAt( c, '§' );
buffer.deleteCharAt( c + 1 );
substCount++;
}
@@ -225,7 +225,7 @@
/**
* Undoes the changes made by substitute(). That are character pairs and
* character combinations. Umlauts will remain as their corresponding vowel,
- * as "ß" remains as "ss".
+ * as "ß" remains as "ss".
*/
private void resubstitute( StringBuffer buffer )
{
@@ -238,7 +238,7 @@
buffer.setCharAt( c, 's' );
buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
}
- else if ( buffer.charAt( c ) == '§' ) {
+ else if ( buffer.charAt( c ) == '§' ) {
buffer.setCharAt( c, 'c' );
buffer.insert( c + 1, 'h' );
}
1.16 +3 -3 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
Index: GermanAnalyzer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -u -r1.15 -r1.16
--- GermanAnalyzer.java 30 Mar 2004 15:44:58 -0000 1.15
+++ GermanAnalyzer.java 30 May 2004 20:24:20 -0000 1.16
@@ -47,14 +47,14 @@
*/
private String[] GERMAN_STOP_WORDS = {
"einer", "eine", "eines", "einem", "einen",
- "der", "die", "das", "dass", "daß",
+ "der", "die", "das", "dass", "daß",
"du", "er", "sie", "es",
"was", "wer", "wie", "wir",
"und", "oder", "ohne", "mit",
"am", "im", "in", "aus", "auf",
"ist", "sein", "war", "wird",
"ihr", "ihre", "ihres",
- "als", "für", "von", "mit",
+ "als", "für", "von", "mit",
"dich", "dir", "mich", "mir",
"mein", "sein", "kein",
"durch", "wegen", "wird"
1.26 +2 -2 jakarta-lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java
Index: TestQueryParser.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java,v
retrieving revision 1.25
retrieving revision 1.26
diff -u -r1.25 -r1.26
--- TestQueryParser.java 3 Mar 2004 12:07:13 -0000 1.25
+++ TestQueryParser.java 30 May 2004 20:24:20 -0000 1.26
@@ -159,8 +159,8 @@
public void testSimple() throws Exception {
assertQueryEquals("term term term", null, "term term term");
- assertQueryEquals("türm term term", null, "türm term term");
- assertQueryEquals("ümlaut", null, "ümlaut");
+ assertQueryEquals("türm term term", null, "türm term term");
+ assertQueryEquals("ümlaut", null, "ümlaut");
assertQueryEquals("a AND b", null, "+a +b");
assertQueryEquals("(a AND b)", null, "+a +b");
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org